diff options
Diffstat (limited to 'net')
168 files changed, 2652 insertions, 1648 deletions
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index 361116f77cb9..f75816f58107 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -106,3 +106,14 @@ config BATMAN_ADV_DEBUG say N here. This enables compilation of support for outputting debugging information to the kernel log. The output is controlled via the module parameter debug. + +config BATMAN_ADV_TRACING + bool "B.A.T.M.A.N. tracing support" + depends on BATMAN_ADV + depends on EVENT_TRACING + help + This is an option for use by developers; most people should + say N here. Select this option to gather traces like the debug + messages using the generic tracing infrastructure of the kernel. + BATMAN_ADV_DEBUG must also be selected to get trace events for + batadv_dbg. diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index b97ba6fb8353..9b58160fe485 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -42,6 +42,9 @@ batman-adv-y += routing.o batman-adv-y += send.o batman-adv-y += soft-interface.o batman-adv-y += sysfs.o +batman-adv-$(CONFIG_BATMAN_ADV_TRACING) += trace.o batman-adv-y += tp_meter.o batman-adv-y += translation-table.o batman-adv-y += tvlv.o + +CFLAGS_trace.o := -I$(src) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 73bf6a93a3cf..d2227091029f 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -138,169 +138,6 @@ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) } /** - * batadv_iv_ogm_orig_free() - free the private resources allocated for this - * orig_node - * @orig_node: the orig_node for which the resources have to be free'd - */ -static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node) -{ - kfree(orig_node->bat_iv.bcast_own); - kfree(orig_node->bat_iv.bcast_own_sum); -} - -/** - * batadv_iv_ogm_orig_add_if() - change the private structures of the orig_node - * to include the new hard-interface - * @orig_node: the orig_node that has to be changed - * @max_if_num: the current amount of interfaces - * - * Return: 0 on success, a negative error code otherwise. - */ -static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node, - unsigned int max_if_num) -{ - void *data_ptr; - size_t old_size; - int ret = -ENOMEM; - - spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); - - old_size = (max_if_num - 1) * sizeof(unsigned long) * BATADV_NUM_WORDS; - data_ptr = kmalloc_array(max_if_num, - BATADV_NUM_WORDS * sizeof(unsigned long), - GFP_ATOMIC); - if (!data_ptr) - goto unlock; - - memcpy(data_ptr, orig_node->bat_iv.bcast_own, old_size); - kfree(orig_node->bat_iv.bcast_own); - orig_node->bat_iv.bcast_own = data_ptr; - - data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC); - if (!data_ptr) - goto unlock; - - memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum, - (max_if_num - 1) * sizeof(u8)); - kfree(orig_node->bat_iv.bcast_own_sum); - orig_node->bat_iv.bcast_own_sum = data_ptr; - - ret = 0; - -unlock: - spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); - - return ret; -} - -/** - * batadv_iv_ogm_drop_bcast_own_entry() - drop section of bcast_own - * @orig_node: the orig_node that has to be changed - * @max_if_num: the current amount of interfaces - * @del_if_num: the index of the interface being removed - */ -static void -batadv_iv_ogm_drop_bcast_own_entry(struct batadv_orig_node *orig_node, - unsigned int max_if_num, - unsigned int del_if_num) -{ - size_t chunk_size; - size_t if_offset; - void *data_ptr; - - lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock); - - chunk_size = sizeof(unsigned long) * BATADV_NUM_WORDS; - data_ptr = kmalloc_array(max_if_num, chunk_size, GFP_ATOMIC); - if (!data_ptr) - /* use old buffer when new one could not be allocated */ - data_ptr = orig_node->bat_iv.bcast_own; - - /* copy first part */ - memmove(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size); - - /* copy second part */ - if_offset = (del_if_num + 1) * chunk_size; - memmove((char *)data_ptr + del_if_num * chunk_size, - (uint8_t *)orig_node->bat_iv.bcast_own + if_offset, - (max_if_num - del_if_num) * chunk_size); - - /* bcast_own was shrunk down in new buffer; free old one */ - if (orig_node->bat_iv.bcast_own != data_ptr) { - kfree(orig_node->bat_iv.bcast_own); - orig_node->bat_iv.bcast_own = data_ptr; - } -} - -/** - * batadv_iv_ogm_drop_bcast_own_sum_entry() - drop section of bcast_own_sum - * @orig_node: the orig_node that has to be changed - * @max_if_num: the current amount of interfaces - * @del_if_num: the index of the interface being removed - */ -static void -batadv_iv_ogm_drop_bcast_own_sum_entry(struct batadv_orig_node *orig_node, - unsigned int max_if_num, - unsigned int del_if_num) -{ - size_t if_offset; - void *data_ptr; - - lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock); - - data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC); - if (!data_ptr) - /* use old buffer when new one could not be allocated */ - data_ptr = orig_node->bat_iv.bcast_own_sum; - - memmove(data_ptr, orig_node->bat_iv.bcast_own_sum, - del_if_num * sizeof(u8)); - - if_offset = (del_if_num + 1) * sizeof(u8); - memmove((char *)data_ptr + del_if_num * sizeof(u8), - orig_node->bat_iv.bcast_own_sum + if_offset, - (max_if_num - del_if_num) * sizeof(u8)); - - /* bcast_own_sum was shrunk down in new buffer; free old one */ - if (orig_node->bat_iv.bcast_own_sum != data_ptr) { - kfree(orig_node->bat_iv.bcast_own_sum); - orig_node->bat_iv.bcast_own_sum = data_ptr; - } -} - -/** - * batadv_iv_ogm_orig_del_if() - change the private structures of the orig_node - * to exclude the removed interface - * @orig_node: the orig_node that has to be changed - * @max_if_num: the current amount of interfaces - * @del_if_num: the index of the interface being removed - * - * Return: 0 on success, a negative error code otherwise. - */ -static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, - unsigned int max_if_num, - unsigned int del_if_num) -{ - spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); - - if (max_if_num == 0) { - kfree(orig_node->bat_iv.bcast_own); - kfree(orig_node->bat_iv.bcast_own_sum); - orig_node->bat_iv.bcast_own = NULL; - orig_node->bat_iv.bcast_own_sum = NULL; - } else { - batadv_iv_ogm_drop_bcast_own_entry(orig_node, max_if_num, - del_if_num); - batadv_iv_ogm_drop_bcast_own_sum_entry(orig_node, max_if_num, - del_if_num); - } - - spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); - - return 0; -} - -/** * batadv_iv_ogm_orig_get() - retrieve or create (if does not exist) an * originator * @bat_priv: the bat priv with all the soft interface information @@ -315,7 +152,6 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; int hash_added; - size_t size; orig_node = batadv_orig_hash_find(bat_priv, addr); if (orig_node) @@ -327,16 +163,6 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) spin_lock_init(&orig_node->bat_iv.ogm_cnt_lock); - size = bat_priv->num_ifaces * sizeof(unsigned long) * BATADV_NUM_WORDS; - orig_node->bat_iv.bcast_own = kzalloc(size, GFP_ATOMIC); - if (!orig_node->bat_iv.bcast_own) - goto free_orig_node; - - size = bat_priv->num_ifaces * sizeof(u8); - orig_node->bat_iv.bcast_own_sum = kzalloc(size, GFP_ATOMIC); - if (!orig_node->bat_iv.bcast_own_sum) - goto free_orig_node; - kref_get(&orig_node->refcount); hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, @@ -347,8 +173,9 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) return orig_node; free_orig_node_hash: + /* reference for batadv_hash_add */ batadv_orig_node_put(orig_node); -free_orig_node: + /* reference from batadv_orig_node_new */ batadv_orig_node_put(orig_node); return NULL; @@ -893,26 +720,30 @@ batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; + struct batadv_orig_ifinfo *orig_ifinfo; unsigned long *word; u32 i; - size_t word_index; u8 *w; - unsigned int if_num; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { - spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); - word_index = hard_iface->if_num * BATADV_NUM_WORDS; - word = &orig_node->bat_iv.bcast_own[word_index]; - - batadv_bit_get_packet(bat_priv, word, 1, 0); - if_num = hard_iface->if_num; - w = &orig_node->bat_iv.bcast_own_sum[if_num]; - *w = bitmap_weight(word, BATADV_TQ_LOCAL_WINDOW_SIZE); - spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); + hlist_for_each_entry_rcu(orig_ifinfo, + &orig_node->ifinfo_list, + list) { + if (orig_ifinfo->if_outgoing != hard_iface) + continue; + + spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + word = orig_ifinfo->bat_iv.bcast_own; + batadv_bit_get_packet(bat_priv, word, 1, 0); + w = &orig_ifinfo->bat_iv.bcast_own_sum; + *w = bitmap_weight(word, + BATADV_TQ_LOCAL_WINDOW_SIZE); + spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); + } } rcu_read_unlock(); } @@ -1000,6 +831,35 @@ out: } /** + * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over iterface + * @orig_node: originator which reproadcasted the OGMs directly + * @if_outgoing: interface which transmitted the original OGM and received the + * direct rebroadcast + * + * Return: Number of replied (rebroadcasted) OGMs which were transmitted by + * an originator and directly (without intermediate hop) received by a specific + * interface + */ +static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_orig_ifinfo *orig_ifinfo; + u8 sum; + + orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing); + if (!orig_ifinfo) + return 0; + + spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + sum = orig_ifinfo->bat_iv.bcast_own_sum; + spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); + + batadv_orig_ifinfo_put(orig_ifinfo); + + return sum; +} + +/** * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an * originator * @bat_priv: the bat priv with all the soft interface information @@ -1026,8 +886,6 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, struct batadv_neigh_node *neigh_node = NULL; struct batadv_neigh_node *tmp_neigh_node = NULL; struct batadv_neigh_node *router = NULL; - struct batadv_orig_node *orig_node_tmp; - unsigned int if_num; u8 sum_orig, sum_neigh; u8 *neigh_addr; u8 tq_avg; @@ -1132,18 +990,10 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, */ if (router_ifinfo && neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) { - orig_node_tmp = router->orig_node; - spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock); - if_num = router->if_incoming->if_num; - sum_orig = orig_node_tmp->bat_iv.bcast_own_sum[if_num]; - spin_unlock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock); - - orig_node_tmp = neigh_node->orig_node; - spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock); - if_num = neigh_node->if_incoming->if_num; - sum_neigh = orig_node_tmp->bat_iv.bcast_own_sum[if_num]; - spin_unlock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock); - + sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node, + router->if_incoming); + sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node, + neigh_node->if_incoming); if (sum_orig >= sum_neigh) goto out; } @@ -1186,7 +1036,6 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; - unsigned int if_num; unsigned int tq_asym_penalty, inv_asym_penalty; unsigned int combined_tq; unsigned int tq_iface_penalty; @@ -1227,9 +1076,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, orig_node->last_seen = jiffies; /* find packet count of corresponding one hop neighbor */ - spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); - if_num = if_incoming->if_num; - orig_eq_count = orig_neigh_node->bat_iv.bcast_own_sum[if_num]; + orig_eq_count = batadv_iv_orig_ifinfo_sum(orig_neigh_node, if_incoming); neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (neigh_ifinfo) { neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count; @@ -1237,7 +1084,6 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, } else { neigh_rq_count = 0; } - spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); /* pay attention to not get a value bigger than 100 % */ if (orig_eq_count > neigh_rq_count) @@ -1622,6 +1468,49 @@ out: } /** + * batadv_iv_ogm_process_reply() - Check OGM for direct reply and process it + * @ogm_packet: rebroadcast OGM packet to process + * @if_incoming: the interface where this packet was received + * @orig_node: originator which reproadcasted the OGMs + * @if_incoming_seqno: OGM sequence number when rebroadcast was received + */ +static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet, + struct batadv_hard_iface *if_incoming, + struct batadv_orig_node *orig_node, + u32 if_incoming_seqno) +{ + struct batadv_orig_ifinfo *orig_ifinfo; + s32 bit_pos; + u8 *weight; + + /* neighbor has to indicate direct link and it has to + * come via the corresponding interface + */ + if (!(ogm_packet->flags & BATADV_DIRECTLINK)) + return; + + if (!batadv_compare_eth(if_incoming->net_dev->dev_addr, + ogm_packet->orig)) + return; + + orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_incoming); + if (!orig_ifinfo) + return; + + /* save packet seqno for bidirectional check */ + spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + bit_pos = if_incoming_seqno - 2; + bit_pos -= ntohl(ogm_packet->seqno); + batadv_set_bit(orig_ifinfo->bat_iv.bcast_own, bit_pos); + weight = &orig_ifinfo->bat_iv.bcast_own_sum; + *weight = bitmap_weight(orig_ifinfo->bat_iv.bcast_own, + BATADV_TQ_LOCAL_WINDOW_SIZE); + spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); + + batadv_orig_ifinfo_put(orig_ifinfo); +} + +/** * batadv_iv_ogm_process() - process an incoming batman iv OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) @@ -1705,37 +1594,13 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, } if (is_my_orig) { - unsigned long *word; - size_t offset; - s32 bit_pos; - unsigned int if_num; - u8 *weight; - orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) return; - /* neighbor has to indicate direct link and it has to - * come via the corresponding interface - * save packet seqno for bidirectional check - */ - if (has_directlink_flag && - batadv_compare_eth(if_incoming->net_dev->dev_addr, - ogm_packet->orig)) { - if_num = if_incoming->if_num; - offset = if_num * BATADV_NUM_WORDS; - - spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); - word = &orig_neigh_node->bat_iv.bcast_own[offset]; - bit_pos = if_incoming_seqno - 2; - bit_pos -= ntohl(ogm_packet->seqno); - batadv_set_bit(word, bit_pos); - weight = &orig_neigh_node->bat_iv.bcast_own_sum[if_num]; - *weight = bitmap_weight(word, - BATADV_TQ_LOCAL_WINDOW_SIZE); - spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); - } + batadv_iv_ogm_process_reply(ogm_packet, if_incoming, + orig_neigh_node, if_incoming_seqno); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet from myself (via neighbor)\n"); @@ -2844,9 +2709,6 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .print = batadv_iv_ogm_orig_print, #endif .dump = batadv_iv_ogm_orig_dump, - .free = batadv_iv_ogm_orig_free, - .add_if = batadv_iv_ogm_orig_add_if, - .del_if = batadv_iv_ogm_orig_del_if, }, .gw = { .init_sel_class = batadv_iv_init_sel_class, diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 71c20c1d4002..9f481cfdf77d 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -241,7 +241,7 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) * the packet to be exactly of that size to make the link * throughput estimation effective. */ - skb_put(skb, probe_len - hard_iface->bat_v.elp_skb->len); + skb_put_zero(skb, probe_len - hard_iface->bat_v.elp_skb->len); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Sending unicast (probe) ELP packet on interface %s to %pM\n", @@ -268,6 +268,7 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) struct batadv_priv *bat_priv; struct sk_buff *skb; u32 elp_interval; + bool ret; bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work); hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v); @@ -329,8 +330,11 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) * may sleep and that is not allowed in an rcu protected * context. Therefore schedule a task for that. */ - queue_work(batadv_event_workqueue, - &hardif_neigh->bat_v.metric_work); + ret = queue_work(batadv_event_workqueue, + &hardif_neigh->bat_v.metric_work); + + if (!ret) + batadv_hardif_neigh_put(hardif_neigh); } rcu_read_unlock(); diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index ff9659af6b91..5f1aeeded0e3 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1772,6 +1772,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, { struct batadv_bla_backbone_gw *backbone_gw; struct ethhdr *ethhdr; + bool ret; ethhdr = eth_hdr(skb); @@ -1795,8 +1796,13 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, if (unlikely(!backbone_gw)) return true; - queue_work(batadv_event_workqueue, &backbone_gw->report_work); - /* backbone_gw is unreferenced in the report work function function */ + ret = queue_work(batadv_event_workqueue, &backbone_gw->report_work); + + /* backbone_gw is unreferenced in the report work function function + * if queue_work() call was successful + */ + if (!ret) + batadv_backbone_gw_put(backbone_gw); return true; } diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 3cb82378300b..8b608a2e2653 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -47,8 +47,24 @@ static struct dentry *batadv_debugfs; +/** + * batadv_debugfs_deprecated() - Log use of deprecated batadv debugfs access + * @file: file which was accessed + * @alt: explanation what can be used as alternative + */ +void batadv_debugfs_deprecated(struct file *file, const char *alt) +{ + struct dentry *dentry = file_dentry(file); + const char *name = dentry->d_name.name; + + pr_warn_ratelimited(DEPRECATED "%s (pid %d) Use of debugfs file \"%s\".\n%s", + current->comm, task_pid_nr(current), name, alt); +} + static int batadv_algorithms_open(struct inode *inode, struct file *file) { + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_ROUTING_ALGOS instead\n"); return single_open(file, batadv_algo_seq_print_text, NULL); } @@ -56,6 +72,8 @@ static int neighbors_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_NEIGHBORS instead\n"); return single_open(file, batadv_hardif_neigh_seq_print_text, net_dev); } @@ -63,6 +81,8 @@ static int batadv_originators_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_ORIGINATORS instead\n"); return single_open(file, batadv_orig_seq_print_text, net_dev); } @@ -79,6 +99,8 @@ static int batadv_originators_hardif_open(struct inode *inode, { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_HARDIFS instead\n"); return single_open(file, batadv_orig_hardif_seq_print_text, net_dev); } @@ -86,6 +108,8 @@ static int batadv_gateways_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_GATEWAYS instead\n"); return single_open(file, batadv_gw_client_seq_print_text, net_dev); } @@ -93,6 +117,8 @@ static int batadv_transtable_global_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_TRANSTABLE_GLOBAL instead\n"); return single_open(file, batadv_tt_global_seq_print_text, net_dev); } @@ -101,6 +127,8 @@ static int batadv_bla_claim_table_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_BLA_CLAIM instead\n"); return single_open(file, batadv_bla_claim_table_seq_print_text, net_dev); } @@ -110,6 +138,8 @@ static int batadv_bla_backbone_table_open(struct inode *inode, { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_BLA_BACKBONE instead\n"); return single_open(file, batadv_bla_backbone_table_seq_print_text, net_dev); } @@ -128,6 +158,8 @@ static int batadv_dat_cache_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_DAT_CACHE instead\n"); return single_open(file, batadv_dat_cache_seq_print_text, net_dev); } #endif @@ -136,6 +168,8 @@ static int batadv_transtable_local_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_TRANSTABLE_LOCAL instead\n"); return single_open(file, batadv_tt_local_seq_print_text, net_dev); } @@ -149,6 +183,7 @@ static int batadv_nc_nodes_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, ""); return single_open(file, batadv_nc_nodes_seq_print_text, net_dev); } #endif @@ -165,6 +200,8 @@ static int batadv_mcast_flags_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_MCAST_FLAGS instead\n"); return single_open(file, batadv_mcast_flags_seq_print_text, net_dev); } #endif diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 08a592ffbee5..8de018e5c577 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -21,12 +21,14 @@ #include "main.h" +struct file; struct net_device; #define BATADV_DEBUGFS_SUBDIR "batman_adv" #if IS_ENABLED(CONFIG_BATMAN_ADV_DEBUGFS) +void batadv_debugfs_deprecated(struct file *file, const char *alt); void batadv_debugfs_init(void); void batadv_debugfs_destroy(void); int batadv_debugfs_add_meshif(struct net_device *dev); @@ -38,6 +40,10 @@ void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface); #else +static inline void batadv_debugfs_deprecated(struct file *file, const char *alt) +{ +} + static inline void batadv_debugfs_init(void) { } diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 8b198ee798c9..140c61a3f1ec 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -32,6 +32,7 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> +#include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> @@ -348,6 +349,9 @@ out: * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * @gateway: announced bandwidth information + * + * Has to be called with the appropriate locks being acquired + * (gw.list_lock). */ static void batadv_gw_node_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, @@ -355,6 +359,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, { struct batadv_gw_node *gw_node; + lockdep_assert_held(&bat_priv->gw.list_lock); + if (gateway->bandwidth_down == 0) return; @@ -369,10 +375,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); - spin_lock_bh(&bat_priv->gw.list_lock); kref_get(&gw_node->refcount); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list); - spin_unlock_bh(&bat_priv->gw.list_lock); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n", @@ -428,11 +432,14 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, { struct batadv_gw_node *gw_node, *curr_gw = NULL; + spin_lock_bh(&bat_priv->gw.list_lock); gw_node = batadv_gw_node_get(bat_priv, orig_node); if (!gw_node) { batadv_gw_node_add(bat_priv, orig_node, gateway); + spin_unlock_bh(&bat_priv->gw.list_lock); goto out; } + spin_unlock_bh(&bat_priv->gw.list_lock); if (gw_node->bandwidth_down == ntohl(gateway->bandwidth_down) && gw_node->bandwidth_up == ntohl(gateway->bandwidth_up)) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 2f0d42f2f913..781c5b6e6e8e 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -763,11 +763,6 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); - if (bat_priv->num_ifaces >= UINT_MAX) { - ret = -ENOSPC; - goto err_dev; - } - ret = netdev_master_upper_dev_link(hard_iface->net_dev, soft_iface, NULL, NULL, NULL); if (ret) @@ -777,16 +772,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, if (ret < 0) goto err_upper; - hard_iface->if_num = bat_priv->num_ifaces; - bat_priv->num_ifaces++; hard_iface->if_status = BATADV_IF_INACTIVE; - ret = batadv_orig_hash_add_if(hard_iface, bat_priv->num_ifaces); - if (ret < 0) { - bat_priv->algo_ops->iface.disable(hard_iface); - bat_priv->num_ifaces--; - hard_iface->if_status = BATADV_IF_NOT_IN_USE; - goto err_upper; - } kref_get(&hard_iface->refcount); hard_iface->batman_adv_ptype.type = ethertype; @@ -834,6 +820,33 @@ err: } /** + * batadv_hardif_cnt() - get number of interfaces enslaved to soft interface + * @soft_iface: soft interface to check + * + * This function is only using RCU for locking - the result can therefore be + * off when another functions is modifying the list at the same time. The + * caller can use the rtnl_lock to make sure that the count is accurate. + * + * Return: number of connected/enslaved hard interfaces + */ +static size_t batadv_hardif_cnt(const struct net_device *soft_iface) +{ + struct batadv_hard_iface *hard_iface; + size_t count = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != soft_iface) + continue; + + count++; + } + rcu_read_unlock(); + + return count; +} + +/** * batadv_hardif_disable_interface() - Remove hard interface from soft interface * @hard_iface: hard interface to be removed * @autodel: whether to delete soft interface when it doesn't contain any other @@ -855,9 +868,6 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, dev_remove_pack(&hard_iface->batman_adv_ptype); batadv_hardif_put(hard_iface); - bat_priv->num_ifaces--; - batadv_orig_hash_del_if(hard_iface, bat_priv->num_ifaces); - primary_if = batadv_primary_if_get_selected(bat_priv); if (hard_iface == primary_if) { struct batadv_hard_iface *new_if; @@ -881,7 +891,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface); /* nobody uses this interface anymore */ - if (bat_priv->num_ifaces == 0) { + if (batadv_hardif_cnt(hard_iface->soft_iface) <= 1) { batadv_gw_check_client_stop(bat_priv); if (autodel == BATADV_IF_CLEANUP_AUTO) @@ -917,7 +927,6 @@ batadv_hardif_add_interface(struct net_device *net_dev) if (ret) goto free_if; - hard_iface->if_num = 0; hard_iface->net_dev = net_dev; hard_iface->soft_iface = NULL; hard_iface->if_status = BATADV_IF_NOT_IN_USE; diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 55c358ad3331..d70f363c52ae 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -47,6 +47,7 @@ #include <linux/wait.h> #include <uapi/linux/batadv_packet.h> +#include "debugfs.h" #include "hard-interface.h" #include "log.h" #include "originator.h" @@ -74,6 +75,8 @@ static int batadv_socket_open(struct inode *inode, struct file *file) if (!try_module_get(THIS_MODULE)) return -EBUSY; + batadv_debugfs_deprecated(file, ""); + nonseekable_open(inode, file); socket_client = kmalloc(sizeof(*socket_client), GFP_KERNEL); diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 853773e45f79..6beb5f067810 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -40,6 +40,9 @@ #include <linux/wait.h> #include <stdarg.h> +#include "debugfs.h" +#include "trace.h" + #define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1) static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN; @@ -98,13 +101,19 @@ static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log, */ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) { + struct va_format vaf; va_list args; - char tmp_log_buf[256]; va_start(args, fmt); - vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args); - batadv_fdebug_log(bat_priv->debug_log, "[%10u] %s", - jiffies_to_msecs(jiffies), tmp_log_buf); + + vaf.fmt = fmt; + vaf.va = &args; + + batadv_fdebug_log(bat_priv->debug_log, "[%10u] %pV", + jiffies_to_msecs(jiffies), &vaf); + + trace_batadv_dbg(bat_priv, &vaf); + va_end(args); return 0; @@ -115,6 +124,9 @@ static int batadv_log_open(struct inode *inode, struct file *file) if (!try_module_get(THIS_MODULE)) return -EBUSY; + batadv_debugfs_deprecated(file, + "Use tracepoint batadv:batadv_dbg instead\n"); + nonseekable_open(inode, file); file->private_data = inode->i_private; return 0; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 8da3c9336111..2002b70e18db 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -25,7 +25,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2018.2" +#define BATADV_SOURCE_VERSION "2018.4" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index c3578444f3cb..34caf129a9bf 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -854,16 +854,27 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv, spinlock_t *lock; /* Used to lock list selected by "int in_coding" */ struct list_head *list; + /* Select ingoing or outgoing coding node */ + if (in_coding) { + lock = &orig_neigh_node->in_coding_list_lock; + list = &orig_neigh_node->in_coding_list; + } else { + lock = &orig_neigh_node->out_coding_list_lock; + list = &orig_neigh_node->out_coding_list; + } + + spin_lock_bh(lock); + /* Check if nc_node is already added */ nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding); /* Node found */ if (nc_node) - return nc_node; + goto unlock; nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC); if (!nc_node) - return NULL; + goto unlock; /* Initialize nc_node */ INIT_LIST_HEAD(&nc_node->list); @@ -872,22 +883,14 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv, kref_get(&orig_neigh_node->refcount); nc_node->orig_node = orig_neigh_node; - /* Select ingoing or outgoing coding node */ - if (in_coding) { - lock = &orig_neigh_node->in_coding_list_lock; - list = &orig_neigh_node->in_coding_list; - } else { - lock = &orig_neigh_node->out_coding_list_lock; - list = &orig_neigh_node->out_coding_list; - } - batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n", nc_node->addr, nc_node->orig_node->orig); /* Add nc_node to orig_node */ - spin_lock_bh(lock); kref_get(&nc_node->refcount); list_add_tail_rcu(&nc_node->list, list); + +unlock: spin_unlock_bh(lock); return nc_node; diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 1d295da3e342..56a981af5c92 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -904,9 +904,6 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) batadv_frag_purge_orig(orig_node, NULL); - if (orig_node->bat_priv->algo_ops->orig.free) - orig_node->bat_priv->algo_ops->orig.free(orig_node); - kfree(orig_node->tt_buff); kfree(orig_node); } @@ -1555,107 +1552,3 @@ int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb) return ret; } - -/** - * batadv_orig_hash_add_if() - Add interface to originators in orig_hash - * @hard_iface: hard interface to add (already slave of the soft interface) - * @max_if_num: new number of interfaces - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, - unsigned int max_if_num) -{ - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - struct batadv_algo_ops *bao = bat_priv->algo_ops; - struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_head *head; - struct batadv_orig_node *orig_node; - u32 i; - int ret; - - /* resize all orig nodes because orig_node->bcast_own(_sum) depend on - * if_num - */ - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) { - ret = 0; - if (bao->orig.add_if) - ret = bao->orig.add_if(orig_node, max_if_num); - if (ret == -ENOMEM) - goto err; - } - rcu_read_unlock(); - } - - return 0; - -err: - rcu_read_unlock(); - return -ENOMEM; -} - -/** - * batadv_orig_hash_del_if() - Remove interface from originators in orig_hash - * @hard_iface: hard interface to remove (still slave of the soft interface) - * @max_if_num: new number of interfaces - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, - unsigned int max_if_num) -{ - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_head *head; - struct batadv_hard_iface *hard_iface_tmp; - struct batadv_orig_node *orig_node; - struct batadv_algo_ops *bao = bat_priv->algo_ops; - u32 i; - int ret; - - /* resize all orig nodes because orig_node->bcast_own(_sum) depend on - * if_num - */ - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) { - ret = 0; - if (bao->orig.del_if) - ret = bao->orig.del_if(orig_node, max_if_num, - hard_iface->if_num); - if (ret == -ENOMEM) - goto err; - } - rcu_read_unlock(); - } - - /* renumber remaining batman interfaces _inside_ of orig_hash_lock */ - rcu_read_lock(); - list_for_each_entry_rcu(hard_iface_tmp, &batadv_hardif_list, list) { - if (hard_iface_tmp->if_status == BATADV_IF_NOT_IN_USE) - continue; - - if (hard_iface == hard_iface_tmp) - continue; - - if (hard_iface->soft_iface != hard_iface_tmp->soft_iface) - continue; - - if (hard_iface_tmp->if_num > hard_iface->if_num) - hard_iface_tmp->if_num--; - } - rcu_read_unlock(); - - hard_iface->if_num = -1; - return 0; - -err: - rcu_read_unlock(); - return -ENOMEM; -} diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 3b3f59b881e1..a8b4c7b667ec 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -72,10 +72,6 @@ void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo); int batadv_orig_seq_print_text(struct seq_file *seq, void *offset); int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset); -int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, - unsigned int max_if_num); -int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, - unsigned int max_if_num); struct batadv_orig_node_vlan * batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, unsigned short vid); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 1485263a348b..5db5a0a4c959 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -574,15 +574,20 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) struct batadv_softif_vlan *vlan; int err; + spin_lock_bh(&bat_priv->softif_vlan_list_lock); + vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { batadv_softif_vlan_put(vlan); + spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -EEXIST; } vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC); - if (!vlan) + if (!vlan) { + spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -ENOMEM; + } vlan->bat_priv = bat_priv; vlan->vid = vid; @@ -590,17 +595,23 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) atomic_set(&vlan->ap_isolation, 0); + kref_get(&vlan->refcount); + hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); + spin_unlock_bh(&bat_priv->softif_vlan_list_lock); + + /* batadv_sysfs_add_vlan cannot be in the spinlock section due to the + * sleeping behavior of the sysfs functions and the fs_reclaim lock + */ err = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan); if (err) { - kfree(vlan); + /* ref for the function */ + batadv_softif_vlan_put(vlan); + + /* ref for the list */ + batadv_softif_vlan_put(vlan); return err; } - spin_lock_bh(&bat_priv->softif_vlan_list_lock); - kref_get(&vlan->refcount); - hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); - spin_unlock_bh(&bat_priv->softif_vlan_list_lock); - /* add a new TT local entry. This one will be marked with the NOPURGE * flag */ @@ -833,7 +844,6 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->frag_seqno, random_seqno); bat_priv->primary_if = NULL; - bat_priv->num_ifaces = 0; batadv_nc_init_bat_priv(bat_priv); @@ -1051,6 +1061,7 @@ static void batadv_softif_init_early(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = batadv_softif_free; dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_LLTX; dev->priv_flags |= IFF_NO_QUEUE; /* can't call min_mtu, because the needed variables diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index f2eef43bd2ec..09427fc6494a 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -188,7 +188,8 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \ \ return __batadv_store_uint_attr(buff, count, _min, _max, \ _post_func, attr, \ - &bat_priv->_var, net_dev); \ + &bat_priv->_var, net_dev, \ + NULL); \ } #define BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \ @@ -262,7 +263,9 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \ \ length = __batadv_store_uint_attr(buff, count, _min, _max, \ _post_func, attr, \ - &hard_iface->_var, net_dev); \ + &hard_iface->_var, \ + hard_iface->soft_iface, \ + net_dev); \ \ batadv_hardif_put(hard_iface); \ return length; \ @@ -356,10 +359,12 @@ __batadv_store_bool_attr(char *buff, size_t count, static int batadv_store_uint_attr(const char *buff, size_t count, struct net_device *net_dev, + struct net_device *slave_dev, const char *attr_name, unsigned int min, unsigned int max, atomic_t *attr) { + char ifname[IFNAMSIZ + 3] = ""; unsigned long uint_val; int ret; @@ -385,8 +390,11 @@ static int batadv_store_uint_attr(const char *buff, size_t count, if (atomic_read(attr) == uint_val) return count; - batadv_info(net_dev, "%s: Changing from: %i to: %lu\n", - attr_name, atomic_read(attr), uint_val); + if (slave_dev) + snprintf(ifname, sizeof(ifname), "%s: ", slave_dev->name); + + batadv_info(net_dev, "%s: %sChanging from: %i to: %lu\n", + attr_name, ifname, atomic_read(attr), uint_val); atomic_set(attr, uint_val); return count; @@ -397,12 +405,13 @@ static ssize_t __batadv_store_uint_attr(const char *buff, size_t count, void (*post_func)(struct net_device *), const struct attribute *attr, atomic_t *attr_store, - struct net_device *net_dev) + struct net_device *net_dev, + struct net_device *slave_dev) { int ret; - ret = batadv_store_uint_attr(buff, count, net_dev, attr->name, min, max, - attr_store); + ret = batadv_store_uint_attr(buff, count, net_dev, slave_dev, + attr->name, min, max, attr_store); if (post_func && ret) post_func(net_dev); @@ -571,7 +580,7 @@ static ssize_t batadv_store_gw_sel_class(struct kobject *kobj, return __batadv_store_uint_attr(buff, count, 1, BATADV_TQ_MAX_VALUE, batadv_post_gw_reselect, attr, &bat_priv->gw.sel_class, - bat_priv->soft_iface); + bat_priv->soft_iface, NULL); } static ssize_t batadv_show_gw_bwidth(struct kobject *kobj, @@ -1090,8 +1099,9 @@ static ssize_t batadv_store_throughput_override(struct kobject *kobj, if (old_tp_override == tp_override) goto out; - batadv_info(net_dev, "%s: Changing from: %u.%u MBit to: %u.%u MBit\n", - "throughput_override", + batadv_info(hard_iface->soft_iface, + "%s: %s: Changing from: %u.%u MBit to: %u.%u MBit\n", + "throughput_override", net_dev->name, old_tp_override / 10, old_tp_override % 10, tp_override / 10, tp_override % 10); diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c new file mode 100644 index 000000000000..3d57f9981f25 --- /dev/null +++ b/net/batman-adv/trace.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: + * + * Sven Eckelmann + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/module.h> + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h new file mode 100644 index 000000000000..3acda26a30ca --- /dev/null +++ b/net/batman-adv/trace.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: + * + * Sven Eckelmann + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#if !defined(_NET_BATMAN_ADV_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) +#define _NET_BATMAN_ADV_TRACE_H_ + +#include "main.h" + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM batadv + +/* provide dummy function when tracing is disabled */ +#if !defined(CONFIG_BATMAN_ADV_TRACING) + +#undef TRACE_EVENT +#define TRACE_EVENT(name, proto, ...) \ + static inline void trace_ ## name(proto) {} + +#endif /* CONFIG_BATMAN_ADV_TRACING */ + +#define BATADV_MAX_MSG_LEN 256 + +TRACE_EVENT(batadv_dbg, + + TP_PROTO(struct batadv_priv *bat_priv, + struct va_format *vaf), + + TP_ARGS(bat_priv, vaf), + + TP_STRUCT__entry( + __string(device, bat_priv->soft_iface->name) + __string(driver, KBUILD_MODNAME) + __dynamic_array(char, msg, BATADV_MAX_MSG_LEN) + ), + + TP_fast_assign( + __assign_str(device, bat_priv->soft_iface->name); + __assign_str(driver, KBUILD_MODNAME); + WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg), + BATADV_MAX_MSG_LEN, + vaf->fmt, + *vaf->va) >= BATADV_MAX_MSG_LEN); + ), + + TP_printk( + "%s %s %s", + __get_str(driver), + __get_str(device), + __get_str(msg) + ) +); + +#endif /* _NET_BATMAN_ADV_TRACE_H_ || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 12a2b7d21376..d21624c44665 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1613,6 +1613,8 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, { struct batadv_tt_orig_list_entry *orig_entry; + spin_lock_bh(&tt_global->list_lock); + orig_entry = batadv_tt_global_orig_entry_find(tt_global, orig_node); if (orig_entry) { /* refresh the ttvn: the current value could be a bogus one that @@ -1635,11 +1637,9 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, orig_entry->flags = flags; kref_init(&orig_entry->refcount); - spin_lock_bh(&tt_global->list_lock); kref_get(&orig_entry->refcount); hlist_add_head_rcu(&orig_entry->list, &tt_global->orig_list); - spin_unlock_bh(&tt_global->list_lock); atomic_inc(&tt_global->orig_list_count); sync_flags: @@ -1647,6 +1647,8 @@ sync_flags: out: if (orig_entry) batadv_tt_orig_list_entry_put(orig_entry); + + spin_unlock_bh(&tt_global->list_lock); } /** diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index a637458205d1..40e69c9346d2 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -529,15 +529,20 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, { struct batadv_tvlv_handler *tvlv_handler; + spin_lock_bh(&bat_priv->tvlv.handler_list_lock); + tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); if (tvlv_handler) { + spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); batadv_tvlv_handler_put(tvlv_handler); return; } tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC); - if (!tvlv_handler) + if (!tvlv_handler) { + spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); return; + } tvlv_handler->ogm_handler = optr; tvlv_handler->unicast_handler = uptr; @@ -547,7 +552,6 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, kref_init(&tvlv_handler->refcount); INIT_HLIST_NODE(&tvlv_handler->list); - spin_lock_bh(&bat_priv->tvlv.handler_list_lock); kref_get(&tvlv_handler->refcount); hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 343d304851a5..45b5592de816 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -167,9 +167,6 @@ struct batadv_hard_iface { /** @list: list node for batadv_hardif_list */ struct list_head list; - /** @if_num: identificator of the interface */ - unsigned int if_num; - /** @if_status: status of the interface for batman-adv */ char if_status; @@ -233,6 +230,20 @@ struct batadv_hard_iface { }; /** + * struct batadv_orig_ifinfo - B.A.T.M.A.N. IV private orig_ifinfo members + */ +struct batadv_orig_ifinfo_bat_iv { + /** + * @bcast_own: bitfield which counts the number of our OGMs this + * orig_node rebroadcasted "back" to us (relative to last_real_seqno) + */ + DECLARE_BITMAP(bcast_own, BATADV_TQ_LOCAL_WINDOW_SIZE); + + /** @bcast_own_sum: sum of bcast_own */ + u8 bcast_own_sum; +}; + +/** * struct batadv_orig_ifinfo - originator info per outgoing interface */ struct batadv_orig_ifinfo { @@ -257,6 +268,9 @@ struct batadv_orig_ifinfo { /** @batman_seqno_reset: time when the batman seqno window was reset */ unsigned long batman_seqno_reset; + /** @bat_iv: B.A.T.M.A.N. IV private structure */ + struct batadv_orig_ifinfo_bat_iv bat_iv; + /** @refcount: number of contexts the object is used */ struct kref refcount; @@ -339,19 +353,10 @@ struct batadv_orig_node_vlan { */ struct batadv_orig_bat_iv { /** - * @bcast_own: set of bitfields (one per hard-interface) where each one - * counts the number of our OGMs this orig_node rebroadcasted "back" to - * us (relative to last_real_seqno). Every bitfield is - * BATADV_TQ_LOCAL_WINDOW_SIZE bits long. - */ - unsigned long *bcast_own; - - /** @bcast_own_sum: sum of bcast_own */ - u8 *bcast_own_sum; - - /** - * @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum, - * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count + * @ogm_cnt_lock: lock protecting &batadv_orig_ifinfo_bat_iv.bcast_own, + * &batadv_orig_ifinfo_bat_iv.bcast_own_sum, + * &batadv_neigh_ifinfo_bat_iv.bat_iv.real_bits and + * &batadv_neigh_ifinfo_bat_iv.real_packet_count */ spinlock_t ogm_cnt_lock; }; @@ -1597,9 +1602,6 @@ struct batadv_priv { /** @batman_queue_left: number of remaining OGM packet slots */ atomic_t batman_queue_left; - /** @num_ifaces: number of interfaces assigned to this mesh interface */ - unsigned int num_ifaces; - /** @mesh_obj: kobject for sysfs mesh subdirectory */ struct kobject *mesh_obj; @@ -2179,28 +2181,6 @@ struct batadv_algo_neigh_ops { * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific) */ struct batadv_algo_orig_ops { - /** - * @free: free the resources allocated by the routing algorithm for an - * orig_node object (optional) - */ - void (*free)(struct batadv_orig_node *orig_node); - - /** - * @add_if: ask the routing algorithm to apply the needed changes to the - * orig_node due to a new hard-interface being added into the mesh - * (optional) - */ - int (*add_if)(struct batadv_orig_node *orig_node, - unsigned int max_if_num); - - /** - * @del_if: ask the routing algorithm to apply the needed changes to the - * orig_node due to an hard-interface being removed from the mesh - * (optional) - */ - int (*del_if)(struct batadv_orig_node *orig_node, - unsigned int max_if_num, unsigned int del_if_num); - #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** @print: print the originator table (optional) */ void (*print)(struct batadv_priv *priv, struct seq_file *seq, diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index ae91e2d40056..3a7b0773536b 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -83,6 +83,7 @@ enum { struct smp_dev { /* Secure Connections OOB data */ + bool local_oob; u8 local_pk[64]; u8 local_rand[16]; bool debug_key; @@ -599,6 +600,8 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) memcpy(rand, smp->local_rand, 16); + smp->local_oob = true; + return 0; } @@ -1785,7 +1788,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) * successfully received our local OOB data - therefore set the * flag to indicate that local OOB is in use. */ - if (req->oob_flag == SMP_OOB_PRESENT) + if (req->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob) set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags); /* SMP over BR/EDR requires special treatment */ @@ -1967,7 +1970,7 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) * successfully received our local OOB data - therefore set the * flag to indicate that local OOB is in use. */ - if (rsp->oob_flag == SMP_OOB_PRESENT) + if (rsp->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob) set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags); smp->prsp[0] = SMP_CMD_PAIRING_RSP; @@ -2697,7 +2700,13 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) * key was set/generated. */ if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) { - struct smp_dev *smp_dev = chan->data; + struct l2cap_chan *hchan = hdev->smp_data; + struct smp_dev *smp_dev; + + if (!hchan || !hchan->data) + return SMP_UNSPECIFIED; + + smp_dev = hchan->data; tfm_ecdh = smp_dev->tfm_ecdh; } else { @@ -3230,6 +3239,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) return ERR_CAST(tfm_ecdh); } + smp->local_oob = false; smp->tfm_aes = tfm_aes; smp->tfm_cmac = tfm_cmac; smp->tfm_ecdh = tfm_ecdh; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 502f66349530..a56ed7f2a3a3 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -584,7 +584,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, unsigned long now = jiffies; /* fastpath: update of existing entry */ - if (unlikely(source != fdb->dst)) { + if (unlikely(source != fdb->dst && !fdb->is_sticky)) { fdb->dst = source; fdb_modified = true; /* Take over HW learned entry */ @@ -656,6 +656,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_flags |= NTF_OFFLOADED; if (fdb->added_by_external_learn) ndm->ndm_flags |= NTF_EXT_LEARNED; + if (fdb->is_sticky) + ndm->ndm_flags |= NTF_STICKY; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) goto nla_put_failure; @@ -772,8 +774,10 @@ skip: /* Update (create or replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, - const __u8 *addr, __u16 state, __u16 flags, __u16 vid) + const u8 *addr, u16 state, u16 flags, u16 vid, + u8 ndm_flags) { + u8 is_sticky = !!(ndm_flags & NTF_STICKY); struct net_bridge_fdb_entry *fdb; bool modified = false; @@ -789,6 +793,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, return -EINVAL; } + if (is_sticky && (state & NUD_PERMANENT)) + return -EINVAL; + fdb = br_fdb_find(br, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) @@ -832,6 +839,12 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, modified = true; } + + if (is_sticky != fdb->is_sticky) { + fdb->is_sticky = is_sticky; + modified = true; + } + fdb->added_by_user = 1; fdb->used = jiffies; @@ -865,7 +878,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, } else { spin_lock_bh(&br->hash_lock); err = fdb_add_entry(br, p, addr, ndm->ndm_state, - nlh_flags, vid); + nlh_flags, vid, ndm->ndm_flags); spin_unlock_bh(&br->hash_lock); } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 11ed2029985f..d21035a17f4c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -181,6 +181,7 @@ struct net_bridge_fdb_entry { struct hlist_node fdb_node; unsigned char is_local:1, is_static:1, + is_sticky:1, added_by_user:1, added_by_external_learn:1, offloaded:1; diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c index b82440e1fcb4..a931a71ef6df 100644 --- a/net/caif/cfrfml.c +++ b/net/caif/cfrfml.c @@ -264,9 +264,6 @@ static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt) frontpkt = rearpkt; rearpkt = NULL; - err = -ENOMEM; - if (frontpkt == NULL) - goto out; err = -EPROTO; if (cfpkt_add_head(frontpkt, head, 6) < 0) goto out; diff --git a/net/core/dev.c b/net/core/dev.c index ca78dc5a79a3..0b2d777e5b9e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3231,7 +3231,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de while (skb) { struct sk_buff *next = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); rc = xmit_one(skb, dev, txq, next != NULL); if (unlikely(!dev_xmit_complete(rc))) { skb->next = next; @@ -3331,7 +3331,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d for (; skb != NULL; skb = next) { next = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); /* in case skb wont be segmented, point to itself */ skb->prev = skb; @@ -5295,8 +5295,7 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; - list_del(&skb->list); - skb->next = NULL; + skb_list_del_init(skb); napi_gro_complete(skb); napi->gro_hash[index].count--; } @@ -5481,8 +5480,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; if (pp) { - list_del(&pp->list); - pp->next = NULL; + skb_list_del_init(pp); napi_gro_complete(pp); napi->gro_hash[hash].count--; } diff --git a/net/core/devlink.c b/net/core/devlink.c index 65fc366a78a4..8c0ed225e280 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2592,7 +2592,7 @@ send_done: if (!nlh) { err = devlink_dpipe_send_and_alloc_skb(&skb, info); if (err) - goto err_skb_send_alloc; + return err; goto send_done; } return genlmsg_reply(skb, info); @@ -2600,7 +2600,6 @@ send_done: nla_put_failure: err = -EMSGSIZE; err_resource_put: -err_skb_send_alloc: nlmsg_free(skb); return err; } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 9d4e56d97080..96afc55aa61e 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -2536,6 +2536,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPHYSTATS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: + case ETHTOOL_GUFO: case ETHTOOL_GGSO: case ETHTOOL_GGRO: case ETHTOOL_GFLAGS: diff --git a/net/core/filter.c b/net/core/filter.c index 9cc76f134ddb..72db8afb7cb6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2344,7 +2344,8 @@ BPF_CALL_4(bpf_msg_pull_data, if (unlikely(bytes_sg_total > copy)) return -EINVAL; - page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy)); + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, + get_order(copy)); if (unlikely(!page)) return -ENOMEM; p = page_address(page); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 738c7562e1e0..676f3ad629f9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -423,8 +423,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, offset += sizeof(struct gre_base_hdr); if (hdr->flags & GRE_CSUM) - offset += sizeof(((struct gre_full_hdr *) 0)->csum) + - sizeof(((struct gre_full_hdr *) 0)->reserved1); + offset += FIELD_SIZEOF(struct gre_full_hdr, csum) + + FIELD_SIZEOF(struct gre_full_hdr, reserved1); if (hdr->flags & GRE_KEY) { const __be32 *keyid; @@ -446,11 +446,11 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, else key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; } - offset += sizeof(((struct gre_full_hdr *) 0)->key); + offset += FIELD_SIZEOF(struct gre_full_hdr, key); } if (hdr->flags & GRE_SEQ) - offset += sizeof(((struct pptp_gre_header *) 0)->seq); + offset += FIELD_SIZEOF(struct pptp_gre_header, seq); if (gre_ver == 0) { if (*p_proto == htons(ETH_P_TEB)) { @@ -477,7 +477,7 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, u8 *ppp_hdr; if (hdr->flags & GRE_ACK) - offset += sizeof(((struct pptp_gre_header *) 0)->ack); + offset += FIELD_SIZEOF(struct pptp_gre_header, ack); ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_ppp_hdr), diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 188d693cb251..65a2e820364f 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -162,30 +162,18 @@ __gnet_stats_copy_basic(const seqcount_t *running, } EXPORT_SYMBOL(__gnet_stats_copy_basic); -/** - * gnet_stats_copy_basic - copy basic statistics into statistic TLV - * @running: seqcount_t pointer - * @d: dumping handle - * @cpu: copy statistic per cpu - * @b: basic statistics - * - * Appends the basic statistics to the top level TLV created by - * gnet_stats_start_copy(). - * - * Returns 0 on success or -1 with the statistic lock released - * if the room in the socket buffer was not sufficient. - */ int -gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b) +___gnet_stats_copy_basic(const seqcount_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b, + int type) { struct gnet_stats_basic_packed bstats = {0}; __gnet_stats_copy_basic(running, &bstats, cpu, b); - if (d->compat_tc_stats) { + if (d->compat_tc_stats && type == TCA_STATS_BASIC) { d->tc_stats.bytes = bstats.bytes; d->tc_stats.packets = bstats.packets; } @@ -196,14 +184,61 @@ gnet_stats_copy_basic(const seqcount_t *running, memset(&sb, 0, sizeof(sb)); sb.bytes = bstats.bytes; sb.packets = bstats.packets; - return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb), + return gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); } return 0; } + +/** + * gnet_stats_copy_basic - copy basic statistics into statistic TLV + * @running: seqcount_t pointer + * @d: dumping handle + * @cpu: copy statistic per cpu + * @b: basic statistics + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_basic(const seqcount_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +{ + return ___gnet_stats_copy_basic(running, d, cpu, b, + TCA_STATS_BASIC); +} EXPORT_SYMBOL(gnet_stats_copy_basic); /** + * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV + * @running: seqcount_t pointer + * @d: dumping handle + * @cpu: copy statistic per cpu + * @b: basic statistics + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_basic_hw(const seqcount_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +{ + return ___gnet_stats_copy_basic(running, d, cpu, b, + TCA_STATS_BASIC_HW); +} +EXPORT_SYMBOL(gnet_stats_copy_basic_hw); + +/** * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV * @d: dumping handle * @rate_est: rate estimator diff --git a/net/core/link_watch.c b/net/core/link_watch.c index e38e641e98d5..7f51efb2b3ab 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -155,7 +155,7 @@ static void linkwatch_do_dev(struct net_device *dev) clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); rfc2863_policy(dev); - if (dev->flags & IFF_UP) { + if (dev->flags & IFF_UP && netif_device_present(dev)) { if (netif_carrier_ok(dev)) dev_activate(dev); else diff --git a/net/core/neighbour.c b/net/core/neighbour.c index aa19d86937af..20e0d3308148 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1180,6 +1180,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, lladdr = neigh->ha; } + /* Update confirmed timestamp for neighbour entry after we + * received ARP packet even if it doesn't change IP to MAC binding. + */ + if (new & NUD_CONNECTED) + neigh->confirmed = jiffies; + /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. */ @@ -1201,15 +1207,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } } - /* Update timestamps only once we know we will make a change to the + /* Update timestamp only once we know we will make a change to the * neighbour entry. Otherwise we risk to move the locktime window with * noop updates and ignore relevant ARP updates. */ - if (new != old || lladdr != neigh->ha) { - if (new & NUD_CONNECTED) - neigh->confirmed = jiffies; + if (new != old || lladdr != neigh->ha) neigh->updated = jiffies; - } if (new != old) { neigh_del_timer(neigh); @@ -1277,11 +1280,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh->arp_queue_len_bytes = 0; } out: - if (update_isrouter) { - neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ? - (neigh->flags | NTF_ROUTER) : - (neigh->flags & ~NTF_ROUTER); - } + if (update_isrouter) + neigh_update_is_router(neigh, flags, ¬ify); write_unlock_bh(&neigh->lock); if (notify) @@ -1709,7 +1709,8 @@ out: static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { - int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; + int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_OVERRIDE_ISROUTER; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; @@ -1784,12 +1785,16 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) - flags &= ~NEIGH_UPDATE_F_OVERRIDE; + flags &= ~(NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_OVERRIDE_ISROUTER); } if (ndm->ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; + if (ndm->ndm_flags & NTF_ROUTER) + flags |= NEIGH_UPDATE_F_ISROUTER; + if (ndm->ndm_flags & NTF_USE) { neigh_event_send(neigh, NULL); err = 0; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 57557a6a950c..3219a2932463 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -187,16 +187,16 @@ static void poll_napi(struct net_device *dev) } } -static void netpoll_poll_dev(struct net_device *dev) +void netpoll_poll_dev(struct net_device *dev) { - const struct net_device_ops *ops; struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); + const struct net_device_ops *ops; /* Don't do any rx activity if the dev_lock mutex is held * the dev_open/close paths use this to block netpoll activity * while changing device state */ - if (down_trylock(&ni->dev_lock)) + if (!ni || down_trylock(&ni->dev_lock)) return; if (!netif_running(dev)) { @@ -205,13 +205,8 @@ static void netpoll_poll_dev(struct net_device *dev) } ops = dev->netdev_ops; - if (!ops->ndo_poll_controller) { - up(&ni->dev_lock); - return; - } - - /* Process pending work on NIC */ - ops->ndo_poll_controller(dev); + if (ops->ndo_poll_controller) + ops->ndo_poll_controller(dev); poll_napi(dev); @@ -219,6 +214,7 @@ static void netpoll_poll_dev(struct net_device *dev) zap_completion_queue(); } +EXPORT_SYMBOL(netpoll_poll_dev); void netpoll_poll_disable(struct net_device *dev) { @@ -613,8 +609,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) strlcpy(np->dev_name, ndev->name, IFNAMSIZ); INIT_WORK(&np->cleanup_work, netpoll_async_cleanup); - if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) || - !ndev->netdev_ops->ndo_poll_controller) { + if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", np->dev_name); err = -ENOTSUPP; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7f6938405fa1..6ac919847ce6 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3426,7 +3426,7 @@ xmit_more: net_info_ratelimited("%s xmit error: %d\n", pkt_dev->odevname, ret); pkt_dev->errors++; - /* fallthru */ + /* fall through */ case NETDEV_TX_BUSY: /* Retry it next time */ refcount_dec(&(pkt_dev->skb->users)); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 60c928894a78..35162e1b06ad 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -130,6 +130,12 @@ int rtnl_is_locked(void) } EXPORT_SYMBOL(rtnl_is_locked); +bool refcount_dec_and_rtnl_lock(refcount_t *r) +{ + return refcount_dec_and_mutex_lock(r, &rtnl_mutex); +} +EXPORT_SYMBOL(refcount_dec_and_rtnl_lock); + #ifdef CONFIG_PROVE_LOCKING bool lockdep_rtnl_is_held(void) { @@ -1016,7 +1022,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + nla_total_size(1) /* IFLA_PROTO_DOWN */ - + nla_total_size(4) /* IFLA_IF_NETNSID */ + + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + nla_total_size(4) /* IFLA_MIN_MTU */ @@ -1598,7 +1604,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = change; - if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_IF_NETNSID, tgt_netnsid)) + if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; if (nla_put_string(skb, IFLA_IFNAME, dev->name) || @@ -1737,7 +1743,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_XDP] = { .type = NLA_NESTED }, [IFLA_EVENT] = { .type = NLA_U32 }, [IFLA_GROUP] = { .type = NLA_U32 }, - [IFLA_IF_NETNSID] = { .type = NLA_S32 }, + [IFLA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, [IFLA_MIN_MTU] = { .type = NLA_U32 }, @@ -1845,7 +1851,15 @@ static bool link_dump_filtered(struct net_device *dev, return false; } -static struct net *get_target_net(struct sock *sk, int netnsid) +/** + * rtnl_get_net_ns_capable - Get netns if sufficiently privileged. + * @sk: netlink socket + * @netnsid: network namespace identifier + * + * Returns the network namespace identified by netnsid on success or an error + * pointer on failure. + */ +struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid) { struct net *net; @@ -1862,6 +1876,7 @@ static struct net *get_target_net(struct sock *sk, int netnsid) } return net; } +EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable); static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { @@ -1895,9 +1910,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) { - if (tb[IFLA_IF_NETNSID]) { - netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); - tgt_net = get_target_net(skb->sk, netnsid); + if (tb[IFLA_TARGET_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); + tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); if (IS_ERR(tgt_net)) { tgt_net = net; netnsid = -1; @@ -1984,7 +1999,7 @@ EXPORT_SYMBOL(rtnl_link_get_net); * * 1. IFLA_NET_NS_PID * 2. IFLA_NET_NS_FD - * 3. IFLA_IF_NETNSID + * 3. IFLA_TARGET_NETNSID */ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, struct nlattr *tb[]) @@ -1994,10 +2009,10 @@ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) return rtnl_link_get_net(src_net, tb); - if (!tb[IFLA_IF_NETNSID]) + if (!tb[IFLA_TARGET_NETNSID]) return get_net(src_net); - net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_IF_NETNSID])); + net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID])); if (!net) return ERR_PTR(-EINVAL); @@ -2038,13 +2053,13 @@ static int rtnl_ensure_unique_netns(struct nlattr *tb[], return -EOPNOTSUPP; } - if (tb[IFLA_IF_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) + if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; - if (tb[IFLA_NET_NS_PID] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_FD])) + if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; - if (tb[IFLA_NET_NS_FD] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_PID])) + if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID])) goto invalid_attr; return 0; @@ -2320,7 +2335,7 @@ static int do_setlink(const struct sk_buff *skb, if (err < 0) return err; - if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) { + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev), tb, CAP_NET_ADMIN); if (IS_ERR(net)) { @@ -2763,9 +2778,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - if (tb[IFLA_IF_NETNSID]) { - netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); - tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid); + if (tb[IFLA_TARGET_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); + tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } @@ -2810,7 +2825,7 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { - __dev_notify_flags(dev, old_flags, 0U); + __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags)); } else { dev->rtnl_link_state = RTNL_LINK_INITIALIZED; __dev_notify_flags(dev, old_flags, ~0U); @@ -3173,9 +3188,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - if (tb[IFLA_IF_NETNSID]) { - netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); - tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid); + if (tb[IFLA_TARGET_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); + tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } @@ -3260,13 +3275,13 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) { int idx; int s_idx = cb->family; + int type = cb->nlh->nlmsg_type - RTM_BASE; if (s_idx == 0) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { struct rtnl_link **tab; - int type = cb->nlh->nlmsg_type-RTM_BASE; struct rtnl_link *link; rtnl_dumpit_func dumpit; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c996c09d095f..b2c807f67aba 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -939,9 +939,6 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) WARN_ON_ONCE(!in_task()); - if (!sock_flag(sk, SOCK_ZEROCOPY)) - return NULL; - skb = sock_omalloc(sk, 0, GFP_KERNEL); if (!skb) return NULL; diff --git a/net/core/sock.c b/net/core/sock.c index 3730eb855095..8537b6ca72c5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2332,7 +2332,7 @@ static void __release_sock(struct sock *sk) next = skb->next; prefetch(next); WARN_ON_ONCE(skb_dst_is_noref(skb)); - skb->next = NULL; + skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); cond_resched(); diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index bfd43e8f2c06..d0b3e69c6b39 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -1363,7 +1363,7 @@ static int dn_dev_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-8s %1s %04u %04u %04lu %04lu" " %04hu %03d %02x %-10s %-7s %-7s\n", - dev->name ? dev->name : "???", + dev->name, dn_type2asc(dn_db->parms.mode), 0, 0, dn_db->t3, dn_db->parms.t3, diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 4183e4ba27a5..48c41918fb35 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -38,6 +38,9 @@ config NET_DSA_TAG_DSA config NET_DSA_TAG_EDSA bool +config NET_DSA_TAG_GSWIP + bool + config NET_DSA_TAG_KSZ bool diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 9e4d3536f977..6e721f7a2947 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -9,6 +9,7 @@ dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_BRCM_PREPEND) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o +dsa_core-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o dsa_core-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o dsa_core-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o dsa_core-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 9f3209ff7ffd..a69c1790bbfc 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -52,6 +52,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { #ifdef CONFIG_NET_DSA_TAG_EDSA [DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops, #endif +#ifdef CONFIG_NET_DSA_TAG_GSWIP + [DSA_TAG_PROTO_GSWIP] = &gswip_netdev_ops, +#endif #ifdef CONFIG_NET_DSA_TAG_KSZ [DSA_TAG_PROTO_KSZ] = &ksz_netdev_ops, #endif @@ -70,6 +73,52 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { [DSA_TAG_PROTO_NONE] = &none_ops, }; +const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) +{ + const char *protocol_name[DSA_TAG_LAST] = { +#ifdef CONFIG_NET_DSA_TAG_BRCM + [DSA_TAG_PROTO_BRCM] = "brcm", +#endif +#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND + [DSA_TAG_PROTO_BRCM_PREPEND] = "brcm-prepend", +#endif +#ifdef CONFIG_NET_DSA_TAG_DSA + [DSA_TAG_PROTO_DSA] = "dsa", +#endif +#ifdef CONFIG_NET_DSA_TAG_EDSA + [DSA_TAG_PROTO_EDSA] = "edsa", +#endif +#ifdef CONFIG_NET_DSA_TAG_GSWIP + [DSA_TAG_PROTO_GSWIP] = "gswip", +#endif +#ifdef CONFIG_NET_DSA_TAG_KSZ + [DSA_TAG_PROTO_KSZ] = "ksz", +#endif +#ifdef CONFIG_NET_DSA_TAG_LAN9303 + [DSA_TAG_PROTO_LAN9303] = "lan9303", +#endif +#ifdef CONFIG_NET_DSA_TAG_MTK + [DSA_TAG_PROTO_MTK] = "mtk", +#endif +#ifdef CONFIG_NET_DSA_TAG_QCA + [DSA_TAG_PROTO_QCA] = "qca", +#endif +#ifdef CONFIG_NET_DSA_TAG_TRAILER + [DSA_TAG_PROTO_TRAILER] = "trailer", +#endif + [DSA_TAG_PROTO_NONE] = "none", + }; + unsigned int i; + + BUILD_BUG_ON(ARRAY_SIZE(protocol_name) != DSA_TAG_LAST); + + for (i = 0; i < ARRAY_SIZE(dsa_device_ops); i++) + if (ops == dsa_device_ops[i]) + return protocol_name[i]; + + return protocol_name[DSA_TAG_PROTO_NONE]; +}; + const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) { const struct dsa_device_ops *ops; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 3964c6f7a7c0..9e4fd04ab53c 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -86,6 +86,7 @@ struct dsa_slave_priv { /* dsa.c */ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); bool dsa_schedule_work(struct work_struct *work); +const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); /* legacy.c */ #if IS_ENABLED(CONFIG_NET_DSA_LEGACY) @@ -205,6 +206,9 @@ extern const struct dsa_device_ops dsa_netdev_ops; /* tag_edsa.c */ extern const struct dsa_device_ops edsa_netdev_ops; +/* tag_gswip.c */ +extern const struct dsa_device_ops gswip_netdev_ops; + /* tag_ksz.c */ extern const struct dsa_device_ops ksz_netdev_ops; diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 42a7b85b84e1..8aa92b09db76 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -392,8 +392,7 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) } /* Drop our reference to the MDIO bus device */ - if (pd->chip[i].host_dev) - put_device(pd->chip[i].host_dev); + put_device(pd->chip[i].host_dev); } kfree(pd->chip); } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 1c45c1d6d241..3f840b6eea69 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1058,6 +1058,27 @@ static struct device_type dsa_type = { .name = "dsa", }; +static ssize_t tagging_show(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(d); + struct dsa_port *dp = dsa_slave_to_port(dev); + + return sprintf(buf, "%s\n", + dsa_tag_protocol_to_str(dp->cpu_dp->tag_ops)); +} +static DEVICE_ATTR_RO(tagging); + +static struct attribute *dsa_slave_attrs[] = { + &dev_attr_tagging.attr, + NULL +}; + +static const struct attribute_group dsa_group = { + .name = "dsa", + .attrs = dsa_slave_attrs, +}; + static void dsa_slave_phylink_validate(struct net_device *dev, unsigned long *supported, struct phylink_link_state *state) @@ -1353,8 +1374,14 @@ int dsa_slave_create(struct dsa_port *port) goto out_phy; } + ret = sysfs_create_group(&slave_dev->dev.kobj, &dsa_group); + if (ret) + goto out_unreg; + return 0; +out_unreg: + unregister_netdev(slave_dev); out_phy: rtnl_lock(); phylink_disconnect_phy(p->dp->pl); @@ -1378,6 +1405,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) rtnl_unlock(); dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); + sysfs_remove_group(&slave_dev->dev.kobj, &dsa_group); unregister_netdev(slave_dev); phylink_destroy(dp->pl); free_percpu(p->stats64); diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c new file mode 100644 index 000000000000..49e9b73f1be3 --- /dev/null +++ b/net/dsa/tag_gswip.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel / Lantiq GSWIP V2.0 PMAC tag support + * + * Copyright (C) 2017 - 2018 Hauke Mehrtens <hauke@hauke-m.de> + */ + +#include <linux/bitops.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <net/dsa.h> + +#include "dsa_priv.h" + +#define GSWIP_TX_HEADER_LEN 4 + +/* special tag in TX path header */ +/* Byte 0 */ +#define GSWIP_TX_SLPID_SHIFT 0 /* source port ID */ +#define GSWIP_TX_SLPID_CPU 2 +#define GSWIP_TX_SLPID_APP1 3 +#define GSWIP_TX_SLPID_APP2 4 +#define GSWIP_TX_SLPID_APP3 5 +#define GSWIP_TX_SLPID_APP4 6 +#define GSWIP_TX_SLPID_APP5 7 + +/* Byte 1 */ +#define GSWIP_TX_CRCGEN_DIS BIT(7) +#define GSWIP_TX_DPID_SHIFT 0 /* destination group ID */ +#define GSWIP_TX_DPID_ELAN 0 +#define GSWIP_TX_DPID_EWAN 1 +#define GSWIP_TX_DPID_CPU 2 +#define GSWIP_TX_DPID_APP1 3 +#define GSWIP_TX_DPID_APP2 4 +#define GSWIP_TX_DPID_APP3 5 +#define GSWIP_TX_DPID_APP4 6 +#define GSWIP_TX_DPID_APP5 7 + +/* Byte 2 */ +#define GSWIP_TX_PORT_MAP_EN BIT(7) +#define GSWIP_TX_PORT_MAP_SEL BIT(6) +#define GSWIP_TX_LRN_DIS BIT(5) +#define GSWIP_TX_CLASS_EN BIT(4) +#define GSWIP_TX_CLASS_SHIFT 0 +#define GSWIP_TX_CLASS_MASK GENMASK(3, 0) + +/* Byte 3 */ +#define GSWIP_TX_DPID_EN BIT(0) +#define GSWIP_TX_PORT_MAP_SHIFT 1 +#define GSWIP_TX_PORT_MAP_MASK GENMASK(6, 1) + +#define GSWIP_RX_HEADER_LEN 8 + +/* special tag in RX path header */ +/* Byte 7 */ +#define GSWIP_RX_SPPID_SHIFT 4 +#define GSWIP_RX_SPPID_MASK GENMASK(6, 4) + +static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + int err; + u8 *gswip_tag; + + err = skb_cow_head(skb, GSWIP_TX_HEADER_LEN); + if (err) + return NULL; + + skb_push(skb, GSWIP_TX_HEADER_LEN); + + gswip_tag = skb->data; + gswip_tag[0] = GSWIP_TX_SLPID_CPU; + gswip_tag[1] = GSWIP_TX_DPID_ELAN; + gswip_tag[2] = GSWIP_TX_PORT_MAP_EN | GSWIP_TX_PORT_MAP_SEL; + gswip_tag[3] = BIT(dp->index + GSWIP_TX_PORT_MAP_SHIFT) & GSWIP_TX_PORT_MAP_MASK; + gswip_tag[3] |= GSWIP_TX_DPID_EN; + + return skb; +} + +static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt) +{ + int port; + u8 *gswip_tag; + + if (unlikely(!pskb_may_pull(skb, GSWIP_RX_HEADER_LEN))) + return NULL; + + gswip_tag = skb->data - ETH_HLEN; + + /* Get source port information */ + port = (gswip_tag[7] & GSWIP_RX_SPPID_MASK) >> GSWIP_RX_SPPID_SHIFT; + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) + return NULL; + + /* remove GSWIP tag */ + skb_pull_rcsum(skb, GSWIP_RX_HEADER_LEN); + + return skb; +} + +const struct dsa_device_ops gswip_netdev_ops = { + .xmit = gswip_tag_xmit, + .rcv = gswip_tag_rcv, +}; diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index e7857a8ac86d..d14226ecfde4 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -260,7 +260,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, } sub_frag_mem_limit(fq->q.net, sum_truesize); - head->next = NULL; + skb_mark_not_on_list(head); head->dev = ldev; head->tstamp = fq->q.stamp; @@ -463,7 +463,6 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) table[0].data = &ieee802154_lowpan->frags.high_thresh; table[0].extra1 = &ieee802154_lowpan->frags.low_thresh; - table[0].extra2 = &init_net.ieee802154_lowpan.frags.high_thresh; table[1].data = &ieee802154_lowpan->frags.low_thresh; table[1].extra2 = &ieee802154_lowpan->frags.high_thresh; table[2].data = &ieee802154_lowpan->frags.timeout; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 20fda8fb8ffd..1fbe2f815474 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1377,6 +1377,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (encap) skb_reset_inner_headers(skb); skb->network_header = (u8 *)iph - skb->head; + skb_reset_mac_len(skb); } while ((skb = skb->next)); out: diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 82178cc69c96..777fa3b7fb13 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1512,7 +1512,7 @@ static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def, * * Description: * Parse the packet's IP header looking for a CIPSO option. Returns a pointer - * to the start of the CIPSO option on success, NULL if one if not found. + * to the start of the CIPSO option on success, NULL if one is not found. * */ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) @@ -1522,10 +1522,8 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) int optlen; int taglen; - for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) { + for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) { switch (optptr[0]) { - case IPOPT_CIPSO: - return optptr; case IPOPT_END: return NULL; case IPOPT_NOOP: @@ -1534,6 +1532,11 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) default: taglen = optptr[1]; } + if (!taglen || taglen > optlen) + return NULL; + if (optptr[0] == IPOPT_CIPSO) + return optptr; + optlen -= taglen; optptr += taglen; } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ea4bd8a52422..44d931a3cd50 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -100,6 +100,15 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, [IFA_FLAGS] = { .type = NLA_U32 }, [IFA_RT_PRIORITY] = { .type = NLA_U32 }, + [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, +}; + +struct inet_fill_args { + u32 portid; + u32 seq; + int event; + unsigned int flags; + int netnsid; }; #define IN4_ADDR_HSIZE_SHIFT 8 @@ -1584,13 +1593,14 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, } static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, - u32 portid, u32 seq, int event, unsigned int flags) + struct inet_fill_args *args) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; u32 preferred, valid; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm), + args->flags); if (!nlh) return -EMSGSIZE; @@ -1601,6 +1611,10 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; + if (args->netnsid >= 0 && + nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) + goto nla_put_failure; + if (!(ifm->ifa_flags & IFA_F_PERMANENT)) { preferred = ifa->ifa_preferred_lft; valid = ifa->ifa_valid_lft; @@ -1647,7 +1661,16 @@ nla_put_failure: static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { + struct inet_fill_args fillargs = { + .portid = NETLINK_CB(cb->skb).portid, + .seq = cb->nlh->nlmsg_seq, + .event = RTM_NEWADDR, + .flags = NLM_F_MULTI, + .netnsid = -1, + }; struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFA_MAX+1]; + struct net *tgt_net = net; int h, s_h; int idx, s_idx; int ip_idx, s_ip_idx; @@ -1660,12 +1683,24 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) s_idx = idx = cb->args[1]; s_ip_idx = ip_idx = cb->args[2]; + if (nlmsg_parse(cb->nlh, sizeof(struct ifaddrmsg), tb, IFA_MAX, + ifa_ipv4_policy, NULL) >= 0) { + if (tb[IFA_TARGET_NETNSID]) { + fillargs.netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]); + + tgt_net = rtnl_get_net_ns_capable(skb->sk, + fillargs.netnsid); + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); + } + } + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; - head = &net->dev_index_head[h]; + head = &tgt_net->dev_index_head[h]; rcu_read_lock(); - cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^ - net->dev_base_seq; + cb->seq = atomic_read(&tgt_net->ipv4.dev_addr_genid) ^ + tgt_net->dev_base_seq; hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; @@ -1679,10 +1714,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) ifa = ifa->ifa_next, ip_idx++) { if (ip_idx < s_ip_idx) continue; - if (inet_fill_ifaddr(skb, ifa, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWADDR, NLM_F_MULTI) < 0) { + if (inet_fill_ifaddr(skb, ifa, &fillargs) < 0) { rcu_read_unlock(); goto done; } @@ -1698,6 +1730,8 @@ done: cb->args[0] = h; cb->args[1] = idx; cb->args[2] = ip_idx; + if (fillargs.netnsid >= 0) + put_net(tgt_net); return skb->len; } @@ -1705,8 +1739,14 @@ done: static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid) { + struct inet_fill_args fillargs = { + .portid = portid, + .seq = nlh ? nlh->nlmsg_seq : 0, + .event = event, + .flags = 0, + .netnsid = -1, + }; struct sk_buff *skb; - u32 seq = nlh ? nlh->nlmsg_seq : 0; int err = -ENOBUFS; struct net *net; @@ -1715,7 +1755,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, if (!skb) goto errout; - err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0); + err = inet_fill_ifaddr(skb, ifa, &fillargs); if (err < 0) { /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 2998b0e47d4b..30e2bcc3ef2a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -315,6 +315,32 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } +bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) +{ + bool dev_match = false; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + int ret; + + for (ret = 0; ret < fi->fib_nhs; ret++) { + struct fib_nh *nh = &fi->fib_nh[ret]; + + if (nh->nh_dev == dev) { + dev_match = true; + break; + } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { + dev_match = true; + break; + } + } +#else + if (fi->fib_nh[0].nh_dev == dev) + dev_match = true; +#endif + + return dev_match; +} +EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev); + /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. @@ -361,24 +387,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; fib_combine_itag(itag, &res); - dev_match = false; - -#ifdef CONFIG_IP_ROUTE_MULTIPATH - for (ret = 0; ret < res.fi->fib_nhs; ret++) { - struct fib_nh *nh = &res.fi->fib_nh[ret]; - if (nh->nh_dev == dev) { - dev_match = true; - break; - } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { - dev_match = true; - break; - } - } -#else - if (FIB_RES_DEV(res) == dev) - dev_match = true; -#endif + dev_match = fib_info_nh_uses_dev(res.fi, dev); if (dev_match) { ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; return ret; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index b798862b6be5..7efe740c06eb 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -86,13 +86,14 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, options = (__be32 *)(greh + 1); if (greh->flags & GRE_CSUM) { - if (skb_checksum_simple_validate(skb)) { + if (!skb_checksum_simple_validate(skb)) { + skb_checksum_try_convert(skb, IPPROTO_GRE, 0, + null_compute_pseudo); + } else if (csum_err) { *csum_err = true; return -EINVAL; } - skb_checksum_try_convert(skb, IPPROTO_GRE, 0, - null_compute_pseudo); options++; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 330f62353b11..9b0158fa431f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -260,8 +260,7 @@ out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); - if (head) - kfree_skb(head); + kfree_skb(head); ipq_put(qp); } @@ -602,6 +601,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, nextp = &fp->next; fp->prev = NULL; memset(&fp->rbnode, 0, sizeof(fp->rbnode)); + fp->sk = NULL; head->data_len += fp->len; head->len += fp->len; if (head->ip_summed != fp->ip_summed) @@ -623,7 +623,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, sub_frag_mem_limit(qp->q.net, head->truesize); *nextp = NULL; - head->next = NULL; + skb_mark_not_on_list(head); head->prev = NULL; head->dev = dev; head->tstamp = qp->q.stamp; @@ -822,7 +822,6 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) table[0].data = &net->ipv4.frags.high_thresh; table[0].extra1 = &net->ipv4.frags.low_thresh; - table[0].extra2 = &init_net.ipv4.frags.high_thresh; table[1].data = &net->ipv4.frags.low_thresh; table[1].extra2 = &net->ipv4.frags.high_thresh; table[2].data = &net->ipv4.frags.timeout; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index ae714aecc31c..c3385a84f8ff 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -178,6 +178,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info, if (tpi->proto == htons(ETH_P_TEB)) itn = net_generic(net, gre_tap_net_id); + else if (tpi->proto == htons(ETH_P_ERSPAN) || + tpi->proto == htons(ETH_P_ERSPAN2)) + itn = net_generic(net, erspan_net_id); else itn = net_generic(net, ipgre_net_id); @@ -229,13 +232,10 @@ static void gre_err(struct sk_buff *skb, u32 info) const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct tnl_ptk_info tpi; - bool csum_err = false; - if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), - iph->ihl * 4) < 0) { - if (!csum_err) /* ignore csum errors. */ - return; - } + if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP), + iph->ihl * 4) < 0) + return; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, @@ -328,6 +328,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; } + return PACKET_REJECT; + drop: kfree_skb(skb); return PACKET_RCVD; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 3196cf58f418..35a786c0aaa0 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -531,11 +531,7 @@ static void ip_sublist_rcv_finish(struct list_head *head) struct sk_buff *skb, *next; list_for_each_entry_safe(skb, next, head, list) { - list_del(&skb->list); - /* Handle ip{6}_forward case, as sch_direct_xmit have - * another kind of SKB-list usage (see validate_xmit_skb_list) - */ - skb->next = NULL; + skb_list_del_init(skb); dst_input(skb); } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9c4e72e9c60a..c09219e7f230 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -278,7 +278,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *nskb = segs->next; int err; - segs->next = NULL; + skb_mark_not_on_list(segs); err = ip_fragment(net, sk, segs, mtu, ip_finish_output2); if (err && ret == 0) @@ -684,7 +684,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, skb = frag; frag = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); } if (err == 0) { diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c4f5602308ed..284a22154b4e 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -627,6 +627,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); + unsigned int inner_nhdr_len = 0; const struct iphdr *inner_iph; struct flowi4 fl4; u8 tos, ttl; @@ -636,6 +637,14 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, __be32 dst; bool connected; + /* ensure we can access the inner net header, for several users below */ + if (skb->protocol == htons(ETH_P_IP)) + inner_nhdr_len = sizeof(struct iphdr); + else if (skb->protocol == htons(ETH_P_IPV6)) + inner_nhdr_len = sizeof(struct ipv6hdr); + if (unlikely(!pskb_may_pull(skb, inner_nhdr_len))) + goto tx_error; + inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index d9504adc47b3..184bf2e0a1ed 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -106,6 +106,10 @@ config NF_NAT_IPV4 if NF_NAT_IPV4 +config NF_NAT_MASQUERADE_IPV4 + bool + +if NF_TABLES config NFT_CHAIN_NAT_IPV4 depends on NF_TABLES_IPV4 tristate "IPv4 nf_tables nat chain support" @@ -115,9 +119,6 @@ config NFT_CHAIN_NAT_IPV4 packet transformations such as the source, destination address and source and destination ports. -config NF_NAT_MASQUERADE_IPV4 - bool - config NFT_MASQ_IPV4 tristate "IPv4 masquerading support for nf_tables" depends on NF_TABLES_IPV4 @@ -135,6 +136,7 @@ config NFT_REDIR_IPV4 help This is the expression that provides IPv4 redirect support for nf_tables. +endif # NF_TABLES config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 12843c9ef142..0b10d8812828 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -36,7 +36,6 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4, const struct net_device *dev, u8 flags) { struct fib_result res; - bool dev_match; int ret __maybe_unused; if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) @@ -46,21 +45,7 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4, if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL)) return false; } - dev_match = false; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - for (ret = 0; ret < res.fi->fib_nhs; ret++) { - struct fib_nh *nh = &res.fi->fib_nh[ret]; - - if (nh->nh_dev == dev) { - dev_match = true; - break; - } - } -#else - if (FIB_RES_DEV(res) == dev) - dev_match = true; -#endif - return dev_match || flags & XT_RPFILTER_LOOSE; + return fib_info_nh_uses_dev(res.fi, dev) || flags & XT_RPFILTER_LOOSE; } static bool diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index e50976e3c213..94eb25bc8d7e 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -76,10 +76,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, .flowi4_iif = LOOPBACK_IFINDEX, }; const struct net_device *oif; - struct net_device *found; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - int i; -#endif + const struct net_device *found; /* * Do not set flowi4_oif, it restricts results (for example, asking @@ -146,25 +143,13 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (!oif) { found = FIB_RES_DEV(res); - goto ok; - } - -#ifdef CONFIG_IP_ROUTE_MULTIPATH - for (i = 0; i < res.fi->fib_nhs; i++) { - struct fib_nh *nh = &res.fi->fib_nh[i]; + } else { + if (!fib_info_nh_uses_dev(res.fi, oif)) + return; - if (nh->nh_dev == oif) { - found = nh->nh_dev; - goto ok; - } + found = oif; } - return; -#else - found = FIB_RES_DEV(res); - if (found != oif) - return; -#endif -ok: + switch (priv->result) { case NFT_FIB_RESULT_OIF: *dest = found->ifindex; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index c3387dfd725b..606f868d9f3f 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -88,7 +88,7 @@ u64 cookie_init_timestamp(struct request_sock *req) ts <<= TSBITS; ts |= options; } - return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ); + return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8c4235c098fd..69c236943f56 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1185,7 +1185,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) flags = msg->msg_flags; - if (flags & MSG_ZEROCOPY && size) { + if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) { if (sk->sk_state != TCP_ESTABLISHED) { err = -EINVAL; goto out_err; @@ -1295,7 +1295,7 @@ new_segment: copy = size_goal; /* All packets are restored as if they have - * already been sent. skb_mstamp isn't set to + * already been sent. skb_mstamp_ns isn't set to * avoid wrong rtt estimation. */ if (tp->repair) diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 02ff2dde9609..a5786e3e2c16 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -128,6 +128,9 @@ static const u32 bbr_probe_rtt_mode_ms = 200; /* Skip TSO below the following bandwidth (bits/sec): */ static const int bbr_min_tso_rate = 1200000; +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */ +static const int bbr_pacing_marging_percent = 1; + /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain * that will allow a smoothly increasing pacing rate that will double each RTT * and send the same number of packets per RTT that an un-paced, slow-starting @@ -208,12 +211,10 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) { unsigned int mss = tcp_sk(sk)->mss_cache; - if (!tcp_needs_internal_pacing(sk)) - mss = tcp_mss_to_mtu(sk, mss); rate *= mss; rate *= gain; rate >>= BBR_SCALE; - rate *= USEC_PER_SEC; + rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_marging_percent); return rate >> BW_SCALE; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 62508a2f9b21..d703a0b3b6a2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1305,7 +1305,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, - skb->skb_mstamp); + tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); if (skb == tp->lost_skb_hint) @@ -1580,7 +1580,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), - skb->skb_mstamp); + tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) list_del_init(&skb->tcp_tsorted_anchor); @@ -3103,7 +3103,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; } else if (!(sacked & TCPCB_SACKED_ACKED)) { - last_ackt = skb->skb_mstamp; + last_ackt = tcp_skb_timestamp_us(skb); WARN_ON_ONCE(last_ackt == 0); if (!first_ackt) first_ackt = last_ackt; @@ -3121,7 +3121,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, - skb->skb_mstamp); + tcp_skb_timestamp_us(skb)); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3215,7 +3215,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } } else if (skb && rtt_update && sack_rtt_us >= 0 && - sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { + sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, + tcp_skb_timestamp_us(skb))) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -6380,8 +6381,8 @@ static bool tcp_syn_flood_action(const struct sock *sk, if (!queue->synflood_warned && net->ipv4.sysctl_tcp_syncookies != 2 && xchg(&queue->synflood_warned, 1) == 0) - pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", - proto, ntohs(tcp_hdr(skb)->dest), msg); + net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", + proto, ntohs(tcp_hdr(skb)->dest), msg); return want_cookie; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 09547ef9c4c6..1f2496e8620d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -544,7 +544,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) BUG_ON(!skb); tcp_mstamp_refresh(tp); - delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp); + delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 597dbd749f05..fe7855b090e4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -45,6 +45,22 @@ #include <trace/events/tcp.h> +/* Refresh clocks of a TCP socket, + * ensuring monotically increasing values. + */ +void tcp_mstamp_refresh(struct tcp_sock *tp) +{ + u64 val = tcp_clock_ns(); + + /* departure time for next data packet */ + if (val > tp->tcp_wstamp_ns) + tp->tcp_wstamp_ns = val; + + val = div_u64(val, NSEC_PER_USEC); + if (val > tp->tcp_mstamp) + tp->tcp_mstamp = val; +} + static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -977,28 +993,34 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) +static void tcp_internal_pacing(struct sock *sk) { - u64 len_ns; - u32 rate; - if (!tcp_needs_internal_pacing(sk)) return; - rate = sk->sk_pacing_rate; - if (!rate || rate == ~0U) - return; - - len_ns = (u64)skb->len * NSEC_PER_SEC; - do_div(len_ns, rate); hrtimer_start(&tcp_sk(sk)->pacing_timer, - ktime_add_ns(ktime_get(), len_ns), + ns_to_ktime(tcp_sk(sk)->tcp_wstamp_ns), HRTIMER_MODE_ABS_PINNED_SOFT); sock_hold(sk); } -static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) +static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb) { - skb->skb_mstamp = tp->tcp_mstamp; + struct tcp_sock *tp = tcp_sk(sk); + + skb->skb_mstamp_ns = tp->tcp_wstamp_ns; + if (sk->sk_pacing_status != SK_PACING_NONE) { + u32 rate = sk->sk_pacing_rate; + + /* Original sch_fq does not pace first 10 MSS + * Note that tp->data_segs_out overflows after 2^32 packets, + * this is a minor annoyance. + */ + if (rate != ~0U && rate && tp->data_segs_out >= 10) { + tp->tcp_wstamp_ns += div_u64((u64)skb->len * NSEC_PER_SEC, rate); + + tcp_internal_pacing(sk); + } + } list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } @@ -1045,7 +1067,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, if (unlikely(!skb)) return -ENOBUFS; } - skb->skb_mstamp = tp->tcp_mstamp; + skb->skb_mstamp_ns = tp->tcp_wstamp_ns; inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); @@ -1137,7 +1159,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); tp->bytes_sent += skb->len - tcp_header_size; - tcp_internal_pacing(sk, skb); } if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) @@ -1149,8 +1170,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); - /* Our usage of tstamp should remain private */ - skb->tstamp = 0; + /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */ /* Cleanup our debris for IP stacks */ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), @@ -1163,7 +1183,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, err = net_xmit_eval(err); } if (!err && oskb) { - tcp_update_skb_after_send(tp, oskb); + tcp_update_skb_after_send(sk, oskb); tcp_rate_skb_sent(sk, oskb); } return err; @@ -1966,7 +1986,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, head = tcp_rtx_queue_head(sk); if (!head) goto send_now; - age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); + age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head)); /* If next ACK is likely to come too late (half srtt), do not defer */ if (age < (tp->srtt_us >> 4)) goto send_now; @@ -2312,7 +2332,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - tcp_update_skb_after_send(tp, skb); + tcp_update_skb_after_send(sk, skb); goto repair; /* Skip network transmission */ } @@ -2887,7 +2907,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) } tcp_skb_tsorted_restore(skb); if (!err) { - tcp_update_skb_after_send(tp, skb); + tcp_update_skb_after_send(sk, skb); tcp_rate_skb_sent(sk, skb); } } else { @@ -3205,10 +3225,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - skb->skb_mstamp = cookie_init_timestamp(req); + skb->skb_mstamp_ns = cookie_init_timestamp(req); else #endif - skb->skb_mstamp = tcp_clock_us(); + skb->skb_mstamp_ns = tcp_clock_ns(); #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); @@ -3424,7 +3444,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - syn->skb_mstamp = syn_data->skb_mstamp; + syn->skb_mstamp_ns = syn_data->skb_mstamp_ns; /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 4dff40dad4dc..baed2186c7c6 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -55,8 +55,10 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) * bandwidth estimate. */ if (!tp->packets_out) { - tp->first_tx_mstamp = skb->skb_mstamp; - tp->delivered_mstamp = skb->skb_mstamp; + u64 tstamp_us = tcp_skb_timestamp_us(skb); + + tp->first_tx_mstamp = tstamp_us; + tp->delivered_mstamp = tstamp_us; } TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; @@ -88,13 +90,12 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tcp_skb_timestamp_us(skb); /* Find the duration of the "send phase" of this window: */ - rs->interval_us = tcp_stamp_us_delta( - skb->skb_mstamp, - scb->tx.first_tx_mstamp); + rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, + scb->tx.first_tx_mstamp); - /* Record send time of most recently ACKed packet: */ - tp->first_tx_mstamp = skb->skb_mstamp; } /* Mark off the skb delivered once it's sacked to avoid being * used again when it's cumulatively acked. For acked packets diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index c81aadff769b..fdb715bdd2d1 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -50,7 +50,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk) s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd) { return tp->rack.rtt_us + reo_wnd - - tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); + tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb)); } /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): @@ -91,7 +91,8 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) !(scb->sacked & TCPCB_SACKED_RETRANS)) continue; - if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, + if (!tcp_rack_sent_after(tp->rack.mstamp, + tcp_skb_timestamp_us(skb), tp->rack.end_seq, scb->end_seq)) break; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 7fdf222a0bdf..4f661e178da8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -360,7 +360,7 @@ static void tcp_probe_timer(struct sock *sk) */ start_ts = tcp_skb_timestamp(skb); if (!start_ts) - skb->skb_mstamp = tp->tcp_mstamp; + skb->skb_mstamp_ns = tp->tcp_wstamp_ns; else if (icsk->icsk_user_timeout && (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) goto abort; @@ -758,7 +758,7 @@ void tcp_init_xmit_timers(struct sock *sk) { inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); - hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, + hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_TAI, HRTIMER_MODE_ABS_PINNED_SOFT); tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f4e35b2ff8b8..7d69dd6fa7e8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2124,6 +2124,28 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, inet_compute_pseudo); } +/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and + * return code conversion for ip layer consumption + */ +static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, + struct udphdr *uh) +{ + int ret; + + if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) + skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + inet_compute_pseudo); + + ret = udp_queue_rcv_skb(sk, skb); + + /* a return value > 0 means to resubmit the input, but + * it wants the return to be -protocol, or 0 + */ + if (ret > 0) + return -ret; + return 0; +} + /* * All we need to do is get the socket, and then do a checksum. */ @@ -2170,14 +2192,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (unlikely(sk->sk_rx_dst != dst)) udp_sk_rx_dst_set(sk, dst); - ret = udp_queue_rcv_skb(sk, skb); + ret = udp_unicast_rcv_skb(sk, skb, uh); sock_put(sk); - /* a return value > 0 means to resubmit the input, but - * it wants the return to be -protocol, or 0 - */ - if (ret > 0) - return -ret; - return 0; + return ret; } if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) @@ -2185,22 +2202,8 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, saddr, daddr, udptable, proto); sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); - if (sk) { - int ret; - - if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) - skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, - inet_compute_pseudo); - - ret = udp_queue_rcv_skb(sk, skb); - - /* a return value > 0 means to resubmit the input, but - * it wants the return to be -protocol, or 0 - */ - if (ret > 0) - return -ret; - return 0; - } + if (sk) + return udp_unicast_rcv_skb(sk, skb, uh); if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d51a8c0b3372..a9a317322388 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -997,6 +997,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, if (addr_type == IPV6_ADDR_ANY || addr_type & IPV6_ADDR_MULTICAST || (!(idev->dev->flags & IFF_LOOPBACK) && + !netif_is_l3_master(idev->dev) && addr_type & IPV6_ADDR_LOOPBACK)) return ERR_PTR(-EADDRNOTAVAIL); @@ -4201,7 +4202,6 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos) p++; continue; } - state->offset++; return ifa; } @@ -4225,13 +4225,12 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, return ifa; } + state->offset = 0; while (++state->bucket < IN6_ADDR_HSIZE) { - state->offset = 0; hlist_for_each_entry_rcu(ifa, &inet6_addr_lst[state->bucket], addr_lst) { if (!net_eq(dev_net(ifa->idev->dev), net)) continue; - state->offset++; return ifa; } } @@ -4491,6 +4490,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = { [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, [IFA_FLAGS] = { .len = sizeof(u32) }, [IFA_RT_PRIORITY] = { .len = sizeof(u32) }, + [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, }; static int @@ -4793,19 +4793,32 @@ static inline int inet6_ifaddr_msgsize(void) + nla_total_size(4) /* IFA_RT_PRIORITY */; } +struct inet6_fill_args { + u32 portid; + u32 seq; + int event; + unsigned int flags; + int netnsid; +}; + static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, - u32 portid, u32 seq, int event, unsigned int flags) + struct inet6_fill_args *args) { struct nlmsghdr *nlh; u32 preferred, valid; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->event, + sizeof(struct ifaddrmsg), args->flags); if (!nlh) return -EMSGSIZE; put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope), ifa->idev->dev->ifindex); + if (args->netnsid >= 0 && + nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) + goto error; + if (!((ifa->flags&IFA_F_PERMANENT) && (ifa->prefered_lft == INFINITY_LIFE_TIME))) { preferred = ifa->prefered_lft; @@ -4855,7 +4868,7 @@ error: } static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, - u32 portid, u32 seq, int event, u16 flags) + struct inet6_fill_args *args) { struct nlmsghdr *nlh; u8 scope = RT_SCOPE_UNIVERSE; @@ -4864,10 +4877,15 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->event, + sizeof(struct ifaddrmsg), args->flags); if (!nlh) return -EMSGSIZE; + if (args->netnsid >= 0 && + nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) + return -EMSGSIZE; + put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 || put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp, @@ -4881,7 +4899,7 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, } static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, - u32 portid, u32 seq, int event, unsigned int flags) + struct inet6_fill_args *args) { struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt); int ifindex = dev ? dev->ifindex : 1; @@ -4891,10 +4909,15 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->event, + sizeof(struct ifaddrmsg), args->flags); if (!nlh) return -EMSGSIZE; + if (args->netnsid >= 0 && + nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) + return -EMSGSIZE; + put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 || put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp, @@ -4916,8 +4939,14 @@ enum addr_type_t { /* called with rcu_read_lock() */ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, struct netlink_callback *cb, enum addr_type_t type, - int s_ip_idx, int *p_ip_idx) + int s_ip_idx, int *p_ip_idx, int netnsid) { + struct inet6_fill_args fillargs = { + .portid = NETLINK_CB(cb->skb).portid, + .seq = cb->nlh->nlmsg_seq, + .flags = NLM_F_MULTI, + .netnsid = netnsid, + }; struct ifmcaddr6 *ifmca; struct ifacaddr6 *ifaca; int err = 1; @@ -4927,16 +4956,13 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, switch (type) { case UNICAST_ADDR: { struct inet6_ifaddr *ifa; + fillargs.event = RTM_NEWADDR; /* unicast address incl. temp addr */ list_for_each_entry(ifa, &idev->addr_list, if_list) { if (++ip_idx < s_ip_idx) continue; - err = inet6_fill_ifaddr(skb, ifa, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWADDR, - NLM_F_MULTI); + err = inet6_fill_ifaddr(skb, ifa, &fillargs); if (err < 0) break; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -4944,31 +4970,26 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, break; } case MULTICAST_ADDR: + fillargs.event = RTM_GETMULTICAST; + /* multicast address */ for (ifmca = idev->mc_list; ifmca; ifmca = ifmca->next, ip_idx++) { if (ip_idx < s_ip_idx) continue; - err = inet6_fill_ifmcaddr(skb, ifmca, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_GETMULTICAST, - NLM_F_MULTI); + err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs); if (err < 0) break; } break; case ANYCAST_ADDR: + fillargs.event = RTM_GETANYCAST; /* anycast address */ for (ifaca = idev->ac_list; ifaca; ifaca = ifaca->aca_next, ip_idx++) { if (ip_idx < s_ip_idx) continue; - err = inet6_fill_ifacaddr(skb, ifaca, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_GETANYCAST, - NLM_F_MULTI); + err = inet6_fill_ifacaddr(skb, ifaca, &fillargs); if (err < 0) break; } @@ -4985,6 +5006,9 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, enum addr_type_t type) { struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFA_MAX+1]; + struct net *tgt_net = net; + int netnsid = -1; int h, s_h; int idx, ip_idx; int s_idx, s_ip_idx; @@ -4996,11 +5020,22 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, s_idx = idx = cb->args[1]; s_ip_idx = ip_idx = cb->args[2]; + if (nlmsg_parse(cb->nlh, sizeof(struct ifaddrmsg), tb, IFA_MAX, + ifa_ipv6_policy, NULL) >= 0) { + if (tb[IFA_TARGET_NETNSID]) { + netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]); + + tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); + } + } + rcu_read_lock(); - cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ net->dev_base_seq; + cb->seq = atomic_read(&tgt_net->ipv6.dev_addr_genid) ^ tgt_net->dev_base_seq; for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; - head = &net->dev_index_head[h]; + head = &tgt_net->dev_index_head[h]; hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; @@ -5012,7 +5047,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, goto cont; if (in6_dump_addrs(idev, skb, cb, type, - s_ip_idx, &ip_idx) < 0) + s_ip_idx, &ip_idx, netnsid) < 0) goto done; cont: idx++; @@ -5023,6 +5058,8 @@ done: cb->args[0] = h; cb->args[1] = idx; cb->args[2] = ip_idx; + if (netnsid >= 0) + put_net(tgt_net); return skb->len; } @@ -5053,6 +5090,14 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); + struct inet6_fill_args fillargs = { + .portid = NETLINK_CB(in_skb).portid, + .seq = nlh->nlmsg_seq, + .event = RTM_NEWADDR, + .flags = 0, + .netnsid = -1, + }; + struct net *tgt_net = net; struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; struct in6_addr *addr = NULL, *peer; @@ -5066,15 +5111,24 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (tb[IFA_TARGET_NETNSID]) { + fillargs.netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]); + + tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(in_skb).sk, + fillargs.netnsid); + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); + } + addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer); if (!addr) return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifa_index) - dev = dev_get_by_index(net, ifm->ifa_index); + dev = dev_get_by_index(tgt_net, ifm->ifa_index); - ifa = ipv6_get_ifaddr(net, addr, dev, 1); + ifa = ipv6_get_ifaddr(tgt_net, addr, dev, 1); if (!ifa) { err = -EADDRNOTAVAIL; goto errout; @@ -5086,20 +5140,22 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout_ifa; } - err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, RTM_NEWADDR, 0); + err = inet6_fill_ifaddr(skb, ifa, &fillargs); if (err < 0) { /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout_ifa; } - err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); + err = rtnl_unicast(skb, tgt_net, NETLINK_CB(in_skb).portid); errout_ifa: in6_ifa_put(ifa); errout: if (dev) dev_put(dev); + if (fillargs.netnsid >= 0) + put_net(tgt_net); + return err; } @@ -5107,13 +5163,20 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) { struct sk_buff *skb; struct net *net = dev_net(ifa->idev->dev); + struct inet6_fill_args fillargs = { + .portid = 0, + .seq = 0, + .event = event, + .flags = 0, + .netnsid = -1, + }; int err = -ENOBUFS; skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); if (!skb) goto errout; - err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0); + err = inet6_fill_ifaddr(skb, ifa, &fillargs); if (err < 0) { /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */ WARN_ON(err == -EMSGSIZE); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 9a4261e50272..e9c8cfdf4b4c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -209,6 +209,7 @@ lookup_protocol: np->hop_limit = -1; np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; + np->mc_all = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; np->repflow = net->ipv6.sysctl.flowlabel_reflect; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; @@ -467,12 +468,10 @@ void inet6_destroy_sock(struct sock *sk) /* Release rx options */ skb = xchg(&np->pktoptions, NULL); - if (skb) - kfree_skb(skb); + kfree_skb(skb); skb = xchg(&np->rxpmtu, NULL); - if (skb) - kfree_skb(skb); + kfree_skb(skb); /* Free flowlabels */ fl6_free_socklist(sk); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index e493b041d4ac..515adbdba1d2 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -427,35 +427,17 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); - const struct gre_base_hdr *greh; const struct ipv6hdr *ipv6h; - int grehlen = sizeof(*greh); + struct tnl_ptk_info tpi; struct ip6_tnl *t; - int key_off = 0; - __be16 flags; - __be32 key; - if (!pskb_may_pull(skb, offset + grehlen)) - return; - greh = (const struct gre_base_hdr *)(skb->data + offset); - flags = greh->flags; - if (flags & (GRE_VERSION | GRE_ROUTING)) + if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6), + offset) < 0) return; - if (flags & GRE_CSUM) - grehlen += 4; - if (flags & GRE_KEY) { - key_off = grehlen + offset; - grehlen += 4; - } - if (!pskb_may_pull(skb, offset + grehlen)) - return; ipv6h = (const struct ipv6hdr *)skb->data; - greh = (const struct gre_base_hdr *)(skb->data + offset); - key = key_off ? *(__be32 *)(skb->data + key_off) : 0; - t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr, - key, greh->protocol); + tpi.key, tpi.proto); if (!t) return; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 6242682be876..96577e742afd 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -178,7 +178,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, */ if ((ipv6_addr_loopback(&hdr->saddr) || ipv6_addr_loopback(&hdr->daddr)) && - !(dev->flags & IFF_LOOPBACK)) + !(dev->flags & IFF_LOOPBACK) && + !netif_is_l3_master(dev)) goto err; /* RFC4291 Errata ID: 3480 diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 37ff4805b20c..c7e495f12011 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -115,6 +115,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, payload_len = skb->len - nhoff - sizeof(*ipv6h); ipv6h->payload_len = htons(payload_len); skb->network_header = (u8 *)ipv6h - skb->head; + skb_reset_mac_len(skb); if (udpfrag) { int err = ip6_find_1stfragopt(skb, &prevhdr); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 16f200f06500..89e0d5118afe 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -219,12 +219,10 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, kfree_skb(skb); return -ENOBUFS; } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); consume_skb(skb); skb = skb2; - /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically, - * it is safe to call in our context (socket lock not held) - */ - skb_set_owner_w(skb, (struct sock *)sk); } if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); @@ -727,7 +725,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, skb = frag; frag = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); } kfree(tmp_hdr); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 419960b0ba16..a0b6932c3afd 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1234,7 +1234,7 @@ static inline int ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - const struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph; int encap_limit = -1; struct flowi6 fl6; __u8 dsfield; @@ -1242,6 +1242,11 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; + /* ensure we can access the full inner ip header */ + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return -1; + + iph = ip_hdr(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); tproto = READ_ONCE(t->parms.proto); @@ -1306,7 +1311,7 @@ static inline int ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ipv6hdr *ipv6h; int encap_limit = -1; __u16 offset; struct flowi6 fl6; @@ -1315,6 +1320,10 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; + if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) + return -1; + + ipv6h = ipv6_hdr(skb); tproto = READ_ONCE(t->parms.proto); if ((tproto != IPPROTO_IPV6 && tproto != 0) || ip6_tnl_addr_conflict(t, ipv6h)) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index c0cac9cc3a28..381ce38940ae 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -674,6 +674,13 @@ done: retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); break; } + case IPV6_MULTICAST_ALL: + if (optlen < sizeof(int)) + goto e_inval; + np->mc_all = valbool; + retv = 0; + break; + case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: { @@ -1266,6 +1273,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->mcast_oif; break; + case IPV6_MULTICAST_ALL: + val = np->mc_all; + break; + case IPV6_UNICAST_IF: val = (__force int)htonl((__u32) np->ucast_oif); break; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 4ae54aaca373..6895e1dc0b03 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -636,7 +636,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, } if (!mc) { rcu_read_unlock(); - return true; + return np->mc_all; } read_lock(&mc->sflock); psl = mc->sflist; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 2a14d8b65924..b8ac369f98ad 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -445,11 +445,12 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; + fp->sk = NULL; } sub_frag_mem_limit(fq->q.net, head->truesize); head->ignore_df = 1; - head->next = NULL; + skb_mark_not_on_list(head); head->dev = dev; head->tstamp = fq->q.stamp; ipv6_hdr(head)->payload_len = htons(payload_len); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5c5b4f79296e..5c3c92713096 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -145,7 +145,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, */ if (end < fq->q.len || ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) - goto err; + goto discard_fq; fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { @@ -162,20 +162,20 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ if (fq->q.flags & INET_FRAG_LAST_IN) - goto err; + goto discard_fq; fq->q.len = end; } } if (end == offset) - goto err; + goto discard_fq; /* Point into the IP datagram 'data' part. */ if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) - goto err; + goto discard_fq; if (pskb_trim_rcsum(skb, end - offset)) - goto err; + goto discard_fq; /* Find out which fragments are in front and at the back of us * in the chain of fragments so far. We must know where to put @@ -388,7 +388,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, } sub_frag_mem_limit(fq->q.net, sum_truesize); - head->next = NULL; + skb_mark_not_on_list(head); head->dev = dev; head->tstamp = fq->q.stamp; ipv6_hdr(head)->payload_len = htons(payload_len); @@ -418,6 +418,7 @@ out_fail: rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); rcu_read_unlock(); + inet_frag_kill(&fq->q); return -1; } @@ -553,7 +554,6 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) table[0].data = &net->ipv6.frags.high_thresh; table[0].extra1 = &net->ipv6.frags.low_thresh; - table[0].extra2 = &init_net.ipv6.frags.high_thresh; table[1].data = &net->ipv6.frags.low_thresh; table[1].extra2 = &net->ipv6.frags.high_thresh; table[2].data = &net->ipv6.frags.timeout; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0fa62acc923c..d28f83e01593 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -364,11 +364,14 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rt6_info *rt = (struct rt6_info *)dst; struct fib6_info *from; struct inet6_dev *idev; - dst_destroy_metrics_generic(dst); + if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) + kfree(p); + rt6_uncached_list_del(rt); idev = rt->rt6i_idev; @@ -946,8 +949,6 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) { - rt->dst.flags |= fib6_info_dst_flags(ort); - if (ort->fib6_flags & RTF_REJECT) { ip6_rt_init_dst_reject(rt, ort); return; @@ -978,6 +979,10 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) rt->rt6i_flags &= ~RTF_EXPIRES; rcu_assign_pointer(rt->from, from); dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); + if (from->fib6_metrics != &dst_default_metrics) { + rt->dst._metrics |= DST_METRICS_REFCOUNTED; + refcount_inc(&from->fib6_metrics->refcnt); + } } /* Caller must already hold reference to @ort */ @@ -995,7 +1000,6 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) #ifdef CONFIG_IPV6_SUBTREES rt->rt6i_src = ort->fib6_src; #endif - rt->rt6i_prefsrc = ort->fib6_prefsrc; } static struct fib6_node* fib6_backtrack(struct fib6_node *fn, @@ -1449,11 +1453,6 @@ static int rt6_insert_exception(struct rt6_info *nrt, if (ort->fib6_src.plen) src_key = &nrt->rt6i_src.addr; #endif - - /* Update rt6i_prefsrc as it could be changed - * in rt6_remove_prefsrc() - */ - nrt->rt6i_prefsrc = ort->fib6_prefsrc; /* rt6_mtu_change() might lower mtu on ort. * Only insert this exception route if its mtu * is less than ort's mtu value. @@ -1635,25 +1634,6 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) rcu_read_unlock(); } -static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt) -{ - struct rt6_exception_bucket *bucket; - struct rt6_exception *rt6_ex; - int i; - - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - - if (bucket) { - for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { - hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { - rt6_ex->rt6i->rt6i_prefsrc.plen = 0; - } - bucket++; - } - } -} - static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, struct rt6_info *rt, int mtu) { @@ -2098,7 +2078,8 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, { bool any_src; - if (rt6_need_strict(&fl6->daddr)) { + if (ipv6_addr_type(&fl6->daddr) & + (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; dst = l3mdev_link_scope_lookup(net, fl6); @@ -3793,8 +3774,6 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->fib6_prefsrc.plen = 0; - /* need to update cache as well */ - rt6_exceptions_remove_prefsrc(rt); spin_unlock_bh(&rt6_exception_lock); } return 0; @@ -4668,20 +4647,31 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, int iif, int type, u32 portid, u32 seq, unsigned int flags) { - struct rtmsg *rtm; + struct rt6_info *rt6 = (struct rt6_info *)dst; + struct rt6key *rt6_dst, *rt6_src; + u32 *pmetrics, table, rt6_flags; struct nlmsghdr *nlh; + struct rtmsg *rtm; long expires = 0; - u32 *pmetrics; - u32 table; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; + if (rt6) { + rt6_dst = &rt6->rt6i_dst; + rt6_src = &rt6->rt6i_src; + rt6_flags = rt6->rt6i_flags; + } else { + rt6_dst = &rt->fib6_dst; + rt6_src = &rt->fib6_src; + rt6_flags = rt->fib6_flags; + } + rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET6; - rtm->rtm_dst_len = rt->fib6_dst.plen; - rtm->rtm_src_len = rt->fib6_src.plen; + rtm->rtm_dst_len = rt6_dst->plen; + rtm->rtm_src_len = rt6_src->plen; rtm->rtm_tos = 0; if (rt->fib6_table) table = rt->fib6_table->tb6_id; @@ -4696,7 +4686,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->fib6_protocol; - if (rt->fib6_flags & RTF_CACHE) + if (rt6_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; if (dest) { @@ -4704,7 +4694,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) - if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) + if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) goto nla_put_failure; #ifdef CONFIG_IPV6_SUBTREES if (src) { @@ -4712,12 +4702,12 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; rtm->rtm_src_len = 128; } else if (rtm->rtm_src_len && - nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) + nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) goto nla_put_failure; #endif if (iif) { #ifdef CONFIG_IPV6_MROUTE - if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { + if (ipv6_addr_is_multicast(&rt6_dst->addr)) { int err = ip6mr_get_route(net, skb, rtm, portid); if (err == 0) @@ -4752,7 +4742,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, /* For multipath routes, walk the siblings list and add * each as a nexthop within RTA_MULTIPATH. */ - if (rt->fib6_nsiblings) { + if (rt6) { + if (rt6_flags & RTF_GATEWAY && + nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) + goto nla_put_failure; + + if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) + goto nla_put_failure; + } else if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; struct nlattr *mp; @@ -4775,7 +4772,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; } - if (rt->fib6_flags & RTF_EXPIRES) { + if (rt6_flags & RTF_EXPIRES) { expires = dst ? dst->expires : rt->expires; expires -= jiffies; } @@ -4783,7 +4780,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) goto nla_put_failure; - if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) + if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) goto nla_put_failure; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 83f4c77c79d8..28c4aa5078fc 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -752,6 +752,28 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) } } +/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and + * return code conversion for ip layer consumption + */ +static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, + struct udphdr *uh) +{ + int ret; + + if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) + skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + ip6_compute_pseudo); + + ret = udpv6_queue_rcv_skb(sk, skb); + + /* a return value > 0 means to resubmit the input, but + * it wants the return to be -protocol, or 0 + */ + if (ret > 0) + return -ret; + return 0; +} + int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { @@ -803,13 +825,14 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (unlikely(sk->sk_rx_dst != dst)) udp6_sk_rx_dst_set(sk, dst); - ret = udpv6_queue_rcv_skb(sk, skb); - sock_put(sk); + if (!uh->check && !udp_sk(sk)->no_check6_rx) { + sock_put(sk); + goto report_csum_error; + } - /* a return value > 0 means to resubmit the input */ - if (ret > 0) - return ret; - return 0; + ret = udp6_unicast_rcv_skb(sk, skb, uh); + sock_put(sk); + return ret; } /* @@ -822,30 +845,13 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, /* Unicast */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) { - int ret; - - if (!uh->check && !udp_sk(sk)->no_check6_rx) { - udp6_csum_zero_error(skb); - goto csum_error; - } - - if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) - skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, - ip6_compute_pseudo); - - ret = udpv6_queue_rcv_skb(sk, skb); - - /* a return value > 0 means to resubmit the input */ - if (ret > 0) - return ret; - - return 0; + if (!uh->check && !udp_sk(sk)->no_check6_rx) + goto report_csum_error; + return udp6_unicast_rcv_skb(sk, skb, uh); } - if (!uh->check) { - udp6_csum_zero_error(skb); - goto csum_error; - } + if (!uh->check) + goto report_csum_error; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard; @@ -866,6 +872,9 @@ short_packet: ulen, skb->len, daddr, ntohs(uh->dest)); goto discard; + +report_csum_error: + udp6_csum_zero_error(skb); csum_error: __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); discard: diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index a21d8ed0a325..5b68ee908107 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -48,7 +48,7 @@ static struct iucv_interface *pr_iucv; static const u8 iprm_shutdown[8] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}; -#define TRGCLS_SIZE (sizeof(((struct iucv_message *)0)->class)) +#define TRGCLS_SIZE FIELD_SIZEOF(struct iucv_message, class) #define __iucv_sock_wait(sk, condition, timeo, ret) \ do { \ @@ -351,20 +351,28 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, memcpy(&phs_hdr->iucv_hdr, imsg, sizeof(struct iucv_message)); skb->dev = iucv->hs_dev; - if (!skb->dev) - return -ENODEV; - if (!(skb->dev->flags & IFF_UP) || !netif_carrier_ok(skb->dev)) - return -ENETDOWN; + if (!skb->dev) { + err = -ENODEV; + goto err_free; + } + if (!(skb->dev->flags & IFF_UP) || !netif_carrier_ok(skb->dev)) { + err = -ENETDOWN; + goto err_free; + } if (skb->len > skb->dev->mtu) { - if (sock->sk_type == SOCK_SEQPACKET) - return -EMSGSIZE; - else - skb_trim(skb, skb->dev->mtu); + if (sock->sk_type == SOCK_SEQPACKET) { + err = -EMSGSIZE; + goto err_free; + } + skb_trim(skb, skb->dev->mtu); } skb->protocol = cpu_to_be16(ETH_P_AF_IUCV); nskb = skb_clone(skb, GFP_ATOMIC); - if (!nskb) - return -ENOMEM; + if (!nskb) { + err = -ENOMEM; + goto err_free; + } + skb_queue_tail(&iucv->send_skb_q, nskb); err = dev_queue_xmit(skb); if (net_xmit_eval(err)) { @@ -375,6 +383,10 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, WARN_ON(atomic_read(&iucv->msg_recv) < 0); } return net_xmit_eval(err); + +err_free: + kfree_skb(skb); + return err; } static struct sock *__iucv_get_sock_by_name(char *nm) @@ -1167,7 +1179,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, err = afiucv_hs_send(&txmsg, sk, skb, 0); if (err) { atomic_dec(&iucv->msg_sent); - goto fail; + goto out; } } else { /* Classic VM IUCV transport */ skb_queue_tail(&iucv->send_skb_q, skb); @@ -2155,8 +2167,8 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, struct sock *sk; struct iucv_sock *iucv; struct af_iucv_trans_hdr *trans_hdr; + int err = NET_RX_SUCCESS; char nullstring[8]; - int err = 0; if (skb->len < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr))) { WARN_ONCE(1, "AF_IUCV too short skb, len=%d, min=%d", @@ -2254,7 +2266,7 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, err = afiucv_hs_callback_rx(sk, skb); break; default: - ; + kfree_skb(skb); } return err; diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 8f7ef167c45a..eb502c6290c2 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -1874,7 +1874,7 @@ static void iucv_pm_complete(struct device *dev) * Returns 0 if there are still iucv pathes defined * 1 if there are no iucv pathes defined */ -int iucv_path_table_empty(void) +static int iucv_path_table_empty(void) { int i; diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c index 260b3dc1b4a2..64d4bef04e73 100644 --- a/net/llc/llc_core.c +++ b/net/llc/llc_core.c @@ -127,9 +127,7 @@ void llc_sap_close(struct llc_sap *sap) list_del_rcu(&sap->node); spin_unlock_bh(&llc_sap_list_lock); - synchronize_rcu(); - - kfree(sap); + kfree_rcu(sap, rcu); } static struct packet_type llc_packet_type __read_mostly = { diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index c6bfd4019d44..a0ca27aeb732 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2077,6 +2077,7 @@ ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata, idx = sdata->fragment_next; for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) { struct ieee80211_hdr *f_hdr; + struct sk_buff *f_skb; idx--; if (idx < 0) @@ -2088,7 +2089,8 @@ ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata, entry->last_frag + 1 != frag) continue; - f_hdr = (struct ieee80211_hdr *)entry->skb_list.next->data; + f_skb = __skb_peek(&entry->skb_list); + f_hdr = (struct ieee80211_hdr *) f_skb->data; /* * Check ftype and addresses are equal, else check next fragment diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 7a4de6d618b1..8fbe6cdbe255 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1533,10 +1533,14 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, unsigned int flags; if (event == NETDEV_REGISTER) { - /* For now just support Ethernet, IPGRE, SIT and IPIP devices */ + + /* For now just support Ethernet, IPGRE, IP6GRE, SIT and + * IPIP devices + */ if (dev->type == ARPHRD_ETHER || dev->type == ARPHRD_LOOPBACK || dev->type == ARPHRD_IPGRE || + dev->type == ARPHRD_IP6GRE || dev->type == ARPHRD_SIT || dev->type == ARPHRD_TUNNEL) { mdev = mpls_add_dev(dev); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 71709c104081..f61c306de1d0 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -771,13 +771,13 @@ config NETFILTER_XT_TARGET_CHECKSUM depends on NETFILTER_ADVANCED ---help--- This option adds a `CHECKSUM' target, which can be used in the iptables mangle - table. + table to work around buggy DHCP clients in virtualized environments. - You can use this target to compute and fill in the checksum in - a packet that lacks a checksum. This is particularly useful, - if you need to work around old applications such as dhcp clients, - that do not work well with checksum offloads, but don't want to disable - checksum offload in your device. + Some old DHCP clients drop packets because they are not aware + that the checksum would normally be offloaded to hardware and + thus should be considered valid. + This target can be used to fill in the checksum using iptables + when such packets are sent via a virtual network device. To compile it as a module, choose M here. If unsure, say N. diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 9f14b0df6960..51c5d7eec0a3 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -776,9 +776,26 @@ static const struct nf_hook_ops ipv6_conntrack_ops[] = { }; #endif +static int nf_ct_tcp_fixup(struct nf_conn *ct, void *_nfproto) +{ + u8 nfproto = (unsigned long)_nfproto; + + if (nf_ct_l3num(ct) != nfproto) + return 0; + + if (nf_ct_protonum(ct) == IPPROTO_TCP && + ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED) { + ct->proto.tcp.seen[0].td_maxwin = 0; + ct->proto.tcp.seen[1].td_maxwin = 0; + } + + return 0; +} + static int nf_ct_netns_do_get(struct net *net, u8 nfproto) { struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + bool fixup_needed = false; int err = 0; mutex_lock(&nf_ct_proto_mutex); @@ -798,6 +815,8 @@ static int nf_ct_netns_do_get(struct net *net, u8 nfproto) ARRAY_SIZE(ipv4_conntrack_ops)); if (err) cnet->users4 = 0; + else + fixup_needed = true; break; #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: @@ -814,6 +833,8 @@ static int nf_ct_netns_do_get(struct net *net, u8 nfproto) ARRAY_SIZE(ipv6_conntrack_ops)); if (err) cnet->users6 = 0; + else + fixup_needed = true; break; #endif default: @@ -822,6 +843,11 @@ static int nf_ct_netns_do_get(struct net *net, u8 nfproto) } out_unlock: mutex_unlock(&nf_ct_proto_mutex); + + if (fixup_needed) + nf_ct_iterate_cleanup_net(net, nf_ct_tcp_fixup, + (void *)(unsigned long)nfproto, 0, 0); + return err; } diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 8c58f96b59e7..f3f91ed2c21a 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -675,7 +675,7 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) } #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> @@ -697,6 +697,8 @@ static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; } } + + timeouts[CTA_TIMEOUT_DCCP_UNSPEC] = timeouts[CTA_TIMEOUT_DCCP_REQUEST]; return 0; } @@ -726,7 +728,7 @@ dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = { [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 }, [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 }, }; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_SYSCTL /* template, data assigned later */ @@ -827,6 +829,11 @@ static int dccp_init_net(struct net *net, u_int16_t proto) dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; + + /* timeouts[0] is unused, make it same as SYN_SENT so + * ->timeouts[0] contains 'new' timeout, like udp or icmp. + */ + dn->dccp_timeout[CT_DCCP_NONE] = dn->dccp_timeout[CT_DCCP_REQUEST]; } return dccp_kmemdup_sysctl_table(net, pn, dn); @@ -856,7 +863,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = { .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = dccp_timeout_nlattr_to_obj, .obj_to_nlattr = dccp_timeout_obj_to_nlattr, @@ -864,7 +871,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = { .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, .nla_policy = dccp_timeout_nla_policy, }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = dccp_init_net, .get_net_proto = dccp_get_net_proto, }; @@ -889,7 +896,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = { .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = dccp_timeout_nlattr_to_obj, .obj_to_nlattr = dccp_timeout_obj_to_nlattr, @@ -897,7 +904,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = { .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, .nla_policy = dccp_timeout_nla_policy, }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = dccp_init_net, .get_net_proto = dccp_get_net_proto, }; diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index ac4a0b296dcd..1df3244ecd07 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -70,7 +70,7 @@ static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, return ret; } -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> @@ -113,7 +113,7 @@ static const struct nla_policy generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = { [CTA_TIMEOUT_GENERIC_TIMEOUT] = { .type = NLA_U32 }, }; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_SYSCTL static struct ctl_table generic_sysctl_table[] = { @@ -164,7 +164,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = .pkt_to_tuple = generic_pkt_to_tuple, .packet = generic_packet, .new = generic_new, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = generic_timeout_nlattr_to_obj, .obj_to_nlattr = generic_timeout_obj_to_nlattr, @@ -172,7 +172,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = .obj_size = sizeof(unsigned int), .nla_policy = generic_timeout_nla_policy, }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = generic_init_net, .get_net_proto = generic_get_net_proto, }; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index d1632252bf5b..650eb4fba2c5 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -285,7 +285,7 @@ static void gre_destroy(struct nf_conn *ct) nf_ct_gre_keymap_destroy(master); } -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> @@ -334,7 +334,7 @@ gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = { [CTA_TIMEOUT_GRE_UNREPLIED] = { .type = NLA_U32 }, [CTA_TIMEOUT_GRE_REPLIED] = { .type = NLA_U32 }, }; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ static int gre_init_net(struct net *net, u_int16_t proto) { @@ -367,7 +367,7 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = gre_timeout_nlattr_to_obj, .obj_to_nlattr = gre_timeout_obj_to_nlattr, @@ -375,7 +375,7 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { .obj_size = sizeof(unsigned int) * GRE_CT_MAX, .nla_policy = gre_timeout_nla_policy, }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .net_id = &proto_gre_net_id, .init_net = gre_init_net, }; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 036670b38282..43c7e1a217b9 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -273,7 +273,7 @@ static unsigned int icmp_nlattr_tuple_size(void) } #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> @@ -313,7 +313,7 @@ static const struct nla_policy icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 }, }; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_SYSCTL static struct ctl_table icmp_sysctl_table[] = { @@ -374,7 +374,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = .nlattr_to_tuple = icmp_nlattr_to_tuple, .nla_policy = icmp_nla_policy, #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = icmp_timeout_nlattr_to_obj, .obj_to_nlattr = icmp_timeout_obj_to_nlattr, @@ -382,7 +382,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = .obj_size = sizeof(unsigned int), .nla_policy = icmp_timeout_nla_policy, }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = icmp_init_net, .get_net_proto = icmp_get_net_proto, }; diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index bed07b998a10..97e40f77d678 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -274,7 +274,7 @@ static unsigned int icmpv6_nlattr_tuple_size(void) } #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> @@ -314,7 +314,7 @@ static const struct nla_policy icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = { [CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 }, }; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_SYSCTL static struct ctl_table icmpv6_sysctl_table[] = { @@ -373,7 +373,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = .nlattr_to_tuple = icmpv6_nlattr_to_tuple, .nla_policy = icmpv6_nla_policy, #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = icmpv6_timeout_nlattr_to_obj, .obj_to_nlattr = icmpv6_timeout_obj_to_nlattr, @@ -381,7 +381,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = .obj_size = sizeof(unsigned int), .nla_policy = icmpv6_timeout_nla_policy, }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = icmpv6_init_net, .get_net_proto = icmpv6_get_net_proto, }; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 8d1e085fc14a..e4d738d34cd0 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -591,7 +591,7 @@ static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) } #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> @@ -613,6 +613,8 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; } } + + timeouts[CTA_TIMEOUT_SCTP_UNSPEC] = timeouts[CTA_TIMEOUT_SCTP_CLOSED]; return 0; } @@ -644,7 +646,7 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, }; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_SYSCTL @@ -743,6 +745,11 @@ static int sctp_init_net(struct net *net, u_int16_t proto) for (i = 0; i < SCTP_CONNTRACK_MAX; i++) sn->timeouts[i] = sctp_timeouts[i]; + + /* timeouts[0] is unused, init it so ->timeouts[0] contains + * 'new' timeout, like udp or icmp. + */ + sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED]; } return sctp_kmemdup_sysctl_table(pn, sn); @@ -773,7 +780,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = sctp_timeout_nlattr_to_obj, .obj_to_nlattr = sctp_timeout_obj_to_nlattr, @@ -781,7 +788,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, .nla_policy = sctp_timeout_nla_policy, }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = sctp_init_net, .get_net_proto = sctp_get_net_proto, }; @@ -806,7 +813,8 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#endif +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = sctp_timeout_nlattr_to_obj, .obj_to_nlattr = sctp_timeout_obj_to_nlattr, @@ -814,8 +822,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, .nla_policy = sctp_timeout_nla_policy, }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ -#endif +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = sctp_init_net, .get_net_proto = sctp_get_net_proto, }; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index d80d322b9d8b..b4bdf9eda7b7 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1279,7 +1279,7 @@ static unsigned int tcp_nlattr_tuple_size(void) } #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> @@ -1301,6 +1301,7 @@ static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], timeouts[TCP_CONNTRACK_SYN_SENT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ; } + if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) { timeouts[TCP_CONNTRACK_SYN_RECV] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ; @@ -1341,6 +1342,8 @@ static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], timeouts[TCP_CONNTRACK_UNACK] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ; } + + timeouts[CTA_TIMEOUT_TCP_UNSPEC] = timeouts[CTA_TIMEOUT_TCP_SYN_SENT]; return 0; } @@ -1391,7 +1394,7 @@ static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = { [CTA_TIMEOUT_TCP_RETRANS] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_UNACK] = { .type = NLA_U32 }, }; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_SYSCTL static struct ctl_table tcp_sysctl_table[] = { @@ -1518,6 +1521,10 @@ static int tcp_init_net(struct net *net, u_int16_t proto) for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++) tn->timeouts[i] = tcp_timeouts[i]; + /* timeouts[0] is unused, make it same as SYN_SENT so + * ->timeouts[0] contains 'new' timeout, like udp or icmp. + */ + tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT]; tn->tcp_loose = nf_ct_tcp_loose; tn->tcp_be_liberal = nf_ct_tcp_be_liberal; tn->tcp_max_retrans = nf_ct_tcp_max_retrans; @@ -1551,7 +1558,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = .nlattr_size = TCP_NLATTR_SIZE, .nla_policy = nf_ct_port_nla_policy, #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = tcp_timeout_nlattr_to_obj, .obj_to_nlattr = tcp_timeout_obj_to_nlattr, @@ -1560,7 +1567,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = TCP_CONNTRACK_TIMEOUT_MAX, .nla_policy = tcp_timeout_nla_policy, }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = tcp_init_net, .get_net_proto = tcp_get_net_proto, }; @@ -1586,7 +1593,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 = .nlattr_tuple_size = tcp_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = tcp_timeout_nlattr_to_obj, .obj_to_nlattr = tcp_timeout_obj_to_nlattr, @@ -1595,7 +1602,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 = TCP_CONNTRACK_TIMEOUT_MAX, .nla_policy = tcp_timeout_nla_policy, }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = tcp_init_net, .get_net_proto = tcp_get_net_proto, }; diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 7a1b8988a931..3065fb8ef91b 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -171,7 +171,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, return NF_ACCEPT; } -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> @@ -221,7 +221,7 @@ udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = { [CTA_TIMEOUT_UDP_UNREPLIED] = { .type = NLA_U32 }, [CTA_TIMEOUT_UDP_REPLIED] = { .type = NLA_U32 }, }; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_SYSCTL static struct ctl_table udp_sysctl_table[] = { @@ -292,7 +292,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = udp_timeout_nlattr_to_obj, .obj_to_nlattr = udp_timeout_obj_to_nlattr, @@ -300,7 +300,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, .nla_policy = udp_timeout_nla_policy, }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = udp_init_net, .get_net_proto = udp_get_net_proto, }; @@ -321,7 +321,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 = .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = udp_timeout_nlattr_to_obj, .obj_to_nlattr = udp_timeout_obj_to_nlattr, @@ -329,7 +329,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 = .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, .nla_policy = udp_timeout_nla_policy, }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = udp_init_net, .get_net_proto = udp_get_net_proto, }; @@ -350,7 +350,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 = .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = udp_timeout_nlattr_to_obj, .obj_to_nlattr = udp_timeout_obj_to_nlattr, @@ -358,7 +358,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 = .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, .nla_policy = udp_timeout_nla_policy, }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = udp_init_net, .get_net_proto = udp_get_net_proto, }; @@ -379,7 +379,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = udp_timeout_nlattr_to_obj, .obj_to_nlattr = udp_timeout_obj_to_nlattr, @@ -387,10 +387,9 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, .nla_policy = udp_timeout_nla_policy, }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = udp_init_net, .get_net_proto = udp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6); #endif -#include <net/netfilter/nf_conntrack_timeout.h> diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1dca5683f59f..2cfb173cd0b2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4637,6 +4637,7 @@ static int nft_flush_set(const struct nft_ctx *ctx, } set->ndeact++; + nft_set_elem_deactivate(ctx->net, set, elem); nft_trans_elem_set(trans) = set; nft_trans_elem(trans) = *elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index d46a236cdf31..a30f8ba4b89a 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -489,8 +489,8 @@ err: return err; } -static struct ctnl_timeout * -ctnl_timeout_find_get(struct net *net, const char *name) +static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net, + const char *name) { struct ctnl_timeout *timeout, *matching = NULL; @@ -509,7 +509,7 @@ ctnl_timeout_find_get(struct net *net, const char *name) break; } err: - return matching; + return matching ? &matching->timeout : NULL; } static void ctnl_timeout_put(struct nf_ct_timeout *t) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index ea4ba551abb2..43041f087eb3 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -233,6 +233,7 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) int err; if (verdict == NF_ACCEPT || + verdict == NF_REPEAT || verdict == NF_STOP) { rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); @@ -764,7 +765,7 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, return ret; } - skb->next = NULL; + skb_mark_not_on_list(skb); entry_seg = nf_queue_entry_dup(entry); if (entry_seg) { diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 26a8baebd072..5dd87748afa8 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -799,7 +799,7 @@ err: } struct nft_ct_timeout_obj { - struct nf_conn *tmpl; + struct nf_ct_timeout *timeout; u8 l4proto; }; @@ -809,26 +809,42 @@ static void nft_ct_timeout_obj_eval(struct nft_object *obj, { const struct nft_ct_timeout_obj *priv = nft_obj_data(obj); struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb); - struct sk_buff *skb = pkt->skb; + struct nf_conn_timeout *timeout; + const unsigned int *values; + + if (priv->l4proto != pkt->tprot) + return; - if (ct || - priv->l4proto != pkt->tprot) + if (!ct || nf_ct_is_template(ct) || nf_ct_is_confirmed(ct)) return; - nf_ct_set(skb, priv->tmpl, IP_CT_NEW); + timeout = nf_ct_timeout_find(ct); + if (!timeout) { + timeout = nf_ct_timeout_ext_add(ct, priv->timeout, GFP_ATOMIC); + if (!timeout) { + regs->verdict.code = NF_DROP; + return; + } + } + + rcu_assign_pointer(timeout->timeout, priv->timeout); + + /* adjust the timeout as per 'new' state. ct is unconfirmed, + * so the current timestamp must not be added. + */ + values = nf_ct_timeout_data(timeout); + if (values) + nf_ct_refresh(ct, pkt->skb, values[0]); } static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { - const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; struct nft_ct_timeout_obj *priv = nft_obj_data(obj); const struct nf_conntrack_l4proto *l4proto; - struct nf_conn_timeout *timeout_ext; struct nf_ct_timeout *timeout; int l3num = ctx->family; - struct nf_conn *tmpl; __u8 l4num; int ret; @@ -863,28 +879,14 @@ static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, timeout->l3num = l3num; timeout->l4proto = l4proto; - tmpl = nf_ct_tmpl_alloc(ctx->net, zone, GFP_ATOMIC); - if (!tmpl) { - ret = -ENOMEM; - goto err_free_timeout; - } - - timeout_ext = nf_ct_timeout_ext_add(tmpl, timeout, GFP_ATOMIC); - if (!timeout_ext) { - ret = -ENOMEM; - goto err_free_tmpl; - } ret = nf_ct_netns_get(ctx->net, ctx->family); if (ret < 0) - goto err_free_tmpl; - - priv->tmpl = tmpl; + goto err_free_timeout; + priv->timeout = timeout; return 0; -err_free_tmpl: - nf_ct_tmpl_free(tmpl); err_free_timeout: kfree(timeout); err_proto_put: @@ -896,22 +898,19 @@ static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_ct_timeout_obj *priv = nft_obj_data(obj); - struct nf_conn_timeout *t = nf_ct_timeout_find(priv->tmpl); - struct nf_ct_timeout *timeout; + struct nf_ct_timeout *timeout = priv->timeout; - timeout = rcu_dereference_raw(t->timeout); nf_ct_untimeout(ctx->net, timeout); nf_ct_l4proto_put(timeout->l4proto); nf_ct_netns_put(ctx->net, ctx->family); - nf_ct_tmpl_free(priv->tmpl); + kfree(priv->timeout); } static int nft_ct_timeout_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_timeout_obj *priv = nft_obj_data(obj); - const struct nf_conn_timeout *t = nf_ct_timeout_find(priv->tmpl); - const struct nf_ct_timeout *timeout = rcu_dereference_raw(t->timeout); + const struct nf_ct_timeout *timeout = priv->timeout; struct nlattr *nest_params; int ret; diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c index 9f4151ec3e06..6c7aa6a0a0d2 100644 --- a/net/netfilter/xt_CHECKSUM.c +++ b/net/netfilter/xt_CHECKSUM.c @@ -16,6 +16,9 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_CHECKSUM.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michael S. Tsirkin <mst@redhat.com>"); MODULE_DESCRIPTION("Xtables: checksum modification"); @@ -25,7 +28,7 @@ MODULE_ALIAS("ip6t_CHECKSUM"); static unsigned int checksum_tg(struct sk_buff *skb, const struct xt_action_param *par) { - if (skb->ip_summed == CHECKSUM_PARTIAL) + if (skb->ip_summed == CHECKSUM_PARTIAL && !skb_is_gso(skb)) skb_checksum_help(skb); return XT_CONTINUE; @@ -34,6 +37,8 @@ checksum_tg(struct sk_buff *skb, const struct xt_action_param *par) static int checksum_tg_check(const struct xt_tgchk_param *par) { const struct xt_CHECKSUM_info *einfo = par->targinfo; + const struct ip6t_ip6 *i6 = par->entryinfo; + const struct ipt_ip *i4 = par->entryinfo; if (einfo->operation & ~XT_CHECKSUM_OP_FILL) { pr_info_ratelimited("unsupported CHECKSUM operation %x\n", @@ -43,6 +48,21 @@ static int checksum_tg_check(const struct xt_tgchk_param *par) if (!einfo->operation) return -EINVAL; + switch (par->family) { + case NFPROTO_IPV4: + if (i4->proto == IPPROTO_UDP && + (i4->invflags & XT_INV_PROTO) == 0) + return 0; + break; + case NFPROTO_IPV6: + if ((i6->flags & IP6T_F_PROTO) && + i6->proto == IPPROTO_UDP && + (i6->invflags & XT_INV_PROTO) == 0) + return 0; + break; + } + + pr_warn_once("CHECKSUM should be avoided. If really needed, restrict with \"-p udp\" and only use in OUTPUT\n"); return 0; } diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c index dfbdbb2fc0ed..51d0c257e7a5 100644 --- a/net/netfilter/xt_cluster.c +++ b/net/netfilter/xt_cluster.c @@ -125,6 +125,7 @@ xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par) static int xt_cluster_mt_checkentry(const struct xt_mtchk_param *par) { struct xt_cluster_match_info *info = par->matchinfo; + int ret; if (info->total_nodes > XT_CLUSTER_NODES_MAX) { pr_info_ratelimited("you have exceeded the maximum number of cluster nodes (%u > %u)\n", @@ -135,7 +136,17 @@ static int xt_cluster_mt_checkentry(const struct xt_mtchk_param *par) pr_info_ratelimited("node mask cannot exceed total number of nodes\n"); return -EDOM; } - return 0; + + ret = nf_ct_netns_get(par->net, par->family); + if (ret < 0) + pr_info_ratelimited("cannot load conntrack support for proto=%u\n", + par->family); + return ret; +} + +static void xt_cluster_mt_destroy(const struct xt_mtdtor_param *par) +{ + nf_ct_netns_put(par->net, par->family); } static struct xt_match xt_cluster_match __read_mostly = { @@ -144,6 +155,7 @@ static struct xt_match xt_cluster_match __read_mostly = { .match = xt_cluster_mt, .checkentry = xt_cluster_mt_checkentry, .matchsize = sizeof(struct xt_cluster_match_info), + .destroy = xt_cluster_mt_destroy, .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 9b16402f29af..3e7d259e5d8d 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -1057,7 +1057,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { static void *dl_seq_start(struct seq_file *s, loff_t *pos) __acquires(htable->lock) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private)); + struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); unsigned int *bucket; spin_lock_bh(&htable->lock); @@ -1074,7 +1074,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos) static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private)); + struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); unsigned int *bucket = v; *pos = ++(*bucket); @@ -1088,7 +1088,7 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) static void dl_seq_stop(struct seq_file *s, void *v) __releases(htable->lock) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private)); + struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); unsigned int *bucket = v; if (!IS_ERR(bucket)) @@ -1130,7 +1130,7 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private)); + struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1145,7 +1145,7 @@ static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private)); + struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1160,7 +1160,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private)); + struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1174,7 +1174,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_show_v2(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private)); + struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); unsigned int *bucket = (unsigned int *)v; struct dsthash_ent *ent; @@ -1188,7 +1188,7 @@ static int dl_seq_show_v2(struct seq_file *s, void *v) static int dl_seq_show_v1(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private)); + struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); unsigned int *bucket = v; struct dsthash_ent *ent; @@ -1202,7 +1202,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v) static int dl_seq_show(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private)); + struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); unsigned int *bucket = v; struct dsthash_ent *ent; diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index c070dfc0190a..c92894c3e40a 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -781,7 +781,8 @@ static int netlbl_unlabel_addrinfo_get(struct genl_info *info, { u32 addr_len; - if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) { + if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR] && + info->attrs[NLBL_UNLABEL_A_IPV4MASK]) { addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); if (addr_len != sizeof(struct in_addr) && addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b4a29bcc33b9..e3a0538ec0be 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -574,11 +574,6 @@ static int netlink_insert(struct sock *sk, u32 portid) if (nlk_sk(sk)->bound) goto err; - err = -ENOMEM; - if (BITS_PER_LONG > 32 && - unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX)) - goto err; - nlk_sk(sk)->portid = portid; sock_hold(sk); diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index ac8030c4bcf8..19cb2e473ea6 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -209,6 +209,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, } create_info = (struct hci_create_pipe_resp *)skb->data; + if (create_info->pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + /* Save the new created pipe and bind with local gate, * the description for skb->data[3] is destination gate id * but since we received this cmd from host controller, we @@ -232,6 +237,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, } delete_info = (struct hci_delete_pipe_noti *)skb->data; + if (delete_info->pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + hdev->pipes[delete_info->pipe].gate = NFC_HCI_INVALID_GATE; hdev->pipes[delete_info->pipe].dest_host = NFC_HCI_INVALID_HOST; break; diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index a66f102c6c01..4503937915ad 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -192,10 +192,8 @@ static void nci_uart_tty_close(struct tty_struct *tty) if (!nu) return; - if (nu->tx_skb) - kfree_skb(nu->tx_skb); - if (nu->rx_skb) - kfree_skb(nu->rx_skb); + kfree_skb(nu->tx_skb); + kfree_skb(nu->rx_skb); skb_queue_purge(&nu->tx_q); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 56b8e7167790..35966da84769 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -254,21 +254,18 @@ static bool icmphdr_ok(struct sk_buff *skb) static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) { + unsigned short frag_off; + unsigned int payload_ofs = 0; unsigned int nh_ofs = skb_network_offset(skb); unsigned int nh_len; - int payload_ofs; struct ipv6hdr *nh; - uint8_t nexthdr; - __be16 frag_off; - int err; + int err, nexthdr, flags = 0; err = check_header(skb, nh_ofs + sizeof(*nh)); if (unlikely(err)) return err; nh = ipv6_hdr(skb); - nexthdr = nh->nexthdr; - payload_ofs = (u8 *)(nh + 1) - skb->data; key->ip.proto = NEXTHDR_NONE; key->ip.tos = ipv6_get_dsfield(nh); @@ -277,10 +274,9 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) key->ipv6.addr.src = nh->saddr; key->ipv6.addr.dst = nh->daddr; - payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); - - if (frag_off) { - if (frag_off & htons(~0x7)) + nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags); + if (flags & IP6_FH_F_FRAG) { + if (frag_off) key->ip.frag = OVS_FRAG_TYPE_LATER; else key->ip.frag = OVS_FRAG_TYPE_FIRST; @@ -288,11 +284,11 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) key->ip.frag = OVS_FRAG_TYPE_NONE; } - /* Delayed handling of error in ipv6_skip_exthdr() as it - * always sets frag_off to a valid value which may be + /* Delayed handling of error in ipv6_find_hdr() as it + * always sets flags and frag_off to a valid value which may be * used to set key->ip.frag above. */ - if (unlikely(payload_ofs < 0)) + if (unlikely(nexthdr < 0)) return -EPROTO; nh_len = payload_ofs - nh_ofs; diff --git a/net/rds/bind.c b/net/rds/bind.c index 3ab55784b637..762d2c6788a3 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -76,11 +76,13 @@ struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, struct rds_sock *rs; __rds_create_bind_key(key, addr, port, scope_id); - rs = rhashtable_lookup_fast(&bind_hash_table, key, ht_parms); + rcu_read_lock(); + rs = rhashtable_lookup(&bind_hash_table, key, ht_parms); if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) rds_sock_addref(rs); else rs = NULL; + rcu_read_unlock(); rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr, ntohs(port)); @@ -235,6 +237,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } + sock_set_flag(sk, SOCK_RCU_FREE); ret = rds_add_bound(rs, binding_addr, &port, scope_id); if (ret) goto out; diff --git a/net/rds/ib.h b/net/rds/ib.h index 73427ff439f9..71ff356ee702 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -443,7 +443,7 @@ int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); /* ib_stats.c */ -DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); +DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) #define rds_ib_stats_add(member, count) \ rds_stats_add_which(rds_ib_stats, member, count) diff --git a/net/rds/recv.c b/net/rds/recv.c index 12719653188a..727639dac8a7 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -43,8 +43,6 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, struct in6_addr *saddr) { - int i; - refcount_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = conn; @@ -52,8 +50,7 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, inc->i_rdma_cookie = 0; inc->i_rx_tstamp = ktime_set(0, 0); - for (i = 0; i < RDS_RX_MAX_TRACES; i++) - inc->i_rx_lat_trace[i] = 0; + memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace)); } EXPORT_SYMBOL_GPL(rds_inc_init); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index cfdc199c6351..ee8e7e1d5c0f 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -259,7 +259,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, while (list) { skb = list; list = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); rxrpc_free_skb(skb, rxrpc_skb_tx_freed); } } diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 6f118d62c731..3c7c23421885 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -81,6 +81,7 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, static void free_tcf(struct tc_action *p) { free_percpu(p->cpu_bstats); + free_percpu(p->cpu_bstats_hw); free_percpu(p->cpu_qstats); tcf_set_action_cookie(&p->act_cookie, NULL); @@ -246,6 +247,20 @@ nla_put_failure: goto done; } +static int tcf_idr_release_unsafe(struct tc_action *p) +{ + if (atomic_read(&p->tcfa_bindcnt) > 0) + return -EPERM; + + if (refcount_dec_and_test(&p->tcfa_refcnt)) { + idr_remove(&p->idrinfo->action_idr, p->tcfa_index); + tcf_action_cleanup(p); + return ACT_P_DELETED; + } + + return 0; +} + static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, const struct tc_action_ops *ops) { @@ -262,15 +277,19 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, if (nla_put_string(skb, TCA_KIND, ops->kind)) goto nla_put_failure; + spin_lock(&idrinfo->lock); idr_for_each_entry_ul(idr, p, id) { - ret = __tcf_idr_release(p, false, true); + ret = tcf_idr_release_unsafe(p); if (ret == ACT_P_DELETED) { module_put(ops->owner); n_i++; } else if (ret < 0) { + spin_unlock(&idrinfo->lock); goto nla_put_failure; } } + spin_unlock(&idrinfo->lock); + if (nla_put_u32(skb, TCA_FCNT, n_i)) goto nla_put_failure; nla_nest_end(skb, nest); @@ -372,9 +391,12 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); if (!p->cpu_bstats) goto err1; + p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!p->cpu_bstats_hw) + goto err2; p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!p->cpu_qstats) - goto err2; + goto err3; } spin_lock_init(&p->tcfa_lock); p->tcfa_index = index; @@ -386,15 +408,17 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, &p->tcfa_rate_est, &p->tcfa_lock, NULL, est); if (err) - goto err3; + goto err4; } p->idrinfo = idrinfo; p->ops = ops; *a = p; return 0; -err3: +err4: free_percpu(p->cpu_qstats); +err3: + free_percpu(p->cpu_bstats_hw); err2: free_percpu(p->cpu_bstats); err1: @@ -979,6 +1003,8 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, goto errout; if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || + gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw, + &p->tcfa_bstats_hw) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, &p->tcfa_qstats, diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index aa44d14b43c7..c89a7fa43d1b 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -157,7 +157,7 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, } static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse) + u64 lastuse, bool hw) { struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); @@ -168,6 +168,10 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, if (action == TC_ACT_SHOT) this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; + if (hw) + _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw), + bytes, packets); + tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index a9d64bfe5a2a..1dae5f2b358f 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -283,12 +283,15 @@ out: } static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse) + u64 lastuse, bool hw) { struct tcf_mirred *m = to_mirred(a); struct tcf_t *tm = &m->tcf_tm; _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + if (hw) + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), + bytes, packets); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index d98f33fdffe2..c5c1e23add77 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -256,28 +256,31 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, unsigned char *b = skb_tail_pointer(skb); struct tcf_nat *p = to_tcf_nat(a); struct tc_nat opt = { - .old_addr = p->old_addr, - .new_addr = p->new_addr, - .mask = p->mask, - .flags = p->flags, - .index = p->tcf_index, - .action = p->tcf_action, .refcnt = refcount_read(&p->tcf_refcnt) - ref, .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; struct tcf_t t; + spin_lock_bh(&p->tcf_lock); + opt.old_addr = p->old_addr; + opt.new_addr = p->new_addr; + opt.mask = p->mask; + opt.flags = p->flags; + opt.action = p->tcf_action; + if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) goto nla_put_failure; + spin_unlock_bh(&p->tcf_lock); return skb->len; nla_put_failure: + spin_unlock_bh(&p->tcf_lock); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 393c7a670300..92649d2667ed 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -22,8 +22,7 @@ #include <net/act_api.h> #include <net/netlink.h> -struct tcf_police { - struct tc_action common; +struct tcf_police_params { int tcfp_result; u32 tcfp_ewma_rate; s64 tcfp_burst; @@ -36,6 +35,12 @@ struct tcf_police { bool rate_present; struct psched_ratecfg peak; bool peak_present; + struct rcu_head rcu; +}; + +struct tcf_police { + struct tc_action common; + struct tcf_police_params __rcu *params; }; #define to_police(pc) ((struct tcf_police *)pc) @@ -84,6 +89,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; struct tc_action_net *tn = net_generic(net, police_net_id); + struct tcf_police_params *new; bool exists = false; int size; @@ -110,7 +116,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, NULL, a, - &act_police_ops, bind, false); + &act_police_ops, bind, true); if (ret) { tcf_idr_cleanup(tn, parm->index); return ret; @@ -137,7 +143,8 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } if (est) { - err = gen_replace_estimator(&police->tcf_bstats, NULL, + err = gen_replace_estimator(&police->tcf_bstats, + police->common.cpu_bstats, &police->tcf_rate_est, &police->tcf_lock, NULL, est); @@ -150,50 +157,60 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, goto failure; } - spin_lock_bh(&police->tcf_lock); + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (unlikely(!new)) { + err = -ENOMEM; + goto failure; + } + /* No failure allowed after this point */ - police->tcfp_mtu = parm->mtu; - if (police->tcfp_mtu == 0) { - police->tcfp_mtu = ~0; + new->tcfp_mtu = parm->mtu; + if (!new->tcfp_mtu) { + new->tcfp_mtu = ~0; if (R_tab) - police->tcfp_mtu = 255 << R_tab->rate.cell_log; + new->tcfp_mtu = 255 << R_tab->rate.cell_log; } if (R_tab) { - police->rate_present = true; - psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0); + new->rate_present = true; + psched_ratecfg_precompute(&new->rate, &R_tab->rate, 0); qdisc_put_rtab(R_tab); } else { - police->rate_present = false; + new->rate_present = false; } if (P_tab) { - police->peak_present = true; - psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0); + new->peak_present = true; + psched_ratecfg_precompute(&new->peak, &P_tab->rate, 0); qdisc_put_rtab(P_tab); } else { - police->peak_present = false; + new->peak_present = false; } if (tb[TCA_POLICE_RESULT]) - police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); - police->tcfp_burst = PSCHED_TICKS2NS(parm->burst); - police->tcfp_toks = police->tcfp_burst; - if (police->peak_present) { - police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak, - police->tcfp_mtu); - police->tcfp_ptoks = police->tcfp_mtu_ptoks; + new->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); + new->tcfp_burst = PSCHED_TICKS2NS(parm->burst); + new->tcfp_toks = new->tcfp_burst; + if (new->peak_present) { + new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak, + new->tcfp_mtu); + new->tcfp_ptoks = new->tcfp_mtu_ptoks; } - police->tcf_action = parm->action; if (tb[TCA_POLICE_AVRATE]) - police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); + new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); + spin_lock_bh(&police->tcf_lock); + new->tcfp_t_c = ktime_get_ns(); + police->tcf_action = parm->action; + rcu_swap_protected(police->params, + new, + lockdep_is_held(&police->tcf_lock)); spin_unlock_bh(&police->tcf_lock); - if (ret != ACT_P_CREATED) - return ret; - police->tcfp_t_c = ktime_get_ns(); - tcf_idr_insert(tn, *a); + if (new) + kfree_rcu(new, rcu); + if (ret == ACT_P_CREATED) + tcf_idr_insert(tn, *a); return ret; failure: @@ -207,64 +224,69 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_police *police = to_police(a); - s64 now; - s64 toks; - s64 ptoks = 0; - - spin_lock(&police->tcf_lock); + struct tcf_police_params *p; + s64 now, toks, ptoks = 0; + int ret; - bstats_update(&police->tcf_bstats, skb); tcf_lastuse_update(&police->tcf_tm); + bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb); - if (police->tcfp_ewma_rate) { + ret = READ_ONCE(police->tcf_action); + p = rcu_dereference_bh(police->params); + + if (p->tcfp_ewma_rate) { struct gnet_stats_rate_est64 sample; if (!gen_estimator_read(&police->tcf_rate_est, &sample) || - sample.bps >= police->tcfp_ewma_rate) { - police->tcf_qstats.overlimits++; - if (police->tcf_action == TC_ACT_SHOT) - police->tcf_qstats.drops++; - spin_unlock(&police->tcf_lock); - return police->tcf_action; - } + sample.bps >= p->tcfp_ewma_rate) + goto inc_overlimits; } - if (qdisc_pkt_len(skb) <= police->tcfp_mtu) { - if (!police->rate_present) { - spin_unlock(&police->tcf_lock); - return police->tcfp_result; + if (qdisc_pkt_len(skb) <= p->tcfp_mtu) { + if (!p->rate_present) { + ret = p->tcfp_result; + goto end; } now = ktime_get_ns(); - toks = min_t(s64, now - police->tcfp_t_c, - police->tcfp_burst); - if (police->peak_present) { - ptoks = toks + police->tcfp_ptoks; - if (ptoks > police->tcfp_mtu_ptoks) - ptoks = police->tcfp_mtu_ptoks; - ptoks -= (s64) psched_l2t_ns(&police->peak, - qdisc_pkt_len(skb)); + toks = min_t(s64, now - p->tcfp_t_c, p->tcfp_burst); + if (p->peak_present) { + ptoks = toks + p->tcfp_ptoks; + if (ptoks > p->tcfp_mtu_ptoks) + ptoks = p->tcfp_mtu_ptoks; + ptoks -= (s64)psched_l2t_ns(&p->peak, + qdisc_pkt_len(skb)); } - toks += police->tcfp_toks; - if (toks > police->tcfp_burst) - toks = police->tcfp_burst; - toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb)); + toks += p->tcfp_toks; + if (toks > p->tcfp_burst) + toks = p->tcfp_burst; + toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb)); if ((toks|ptoks) >= 0) { - police->tcfp_t_c = now; - police->tcfp_toks = toks; - police->tcfp_ptoks = ptoks; - if (police->tcfp_result == TC_ACT_SHOT) - police->tcf_qstats.drops++; - spin_unlock(&police->tcf_lock); - return police->tcfp_result; + p->tcfp_t_c = now; + p->tcfp_toks = toks; + p->tcfp_ptoks = ptoks; + ret = p->tcfp_result; + goto inc_drops; } } - police->tcf_qstats.overlimits++; - if (police->tcf_action == TC_ACT_SHOT) - police->tcf_qstats.drops++; - spin_unlock(&police->tcf_lock); - return police->tcf_action; +inc_overlimits: + qstats_overlimit_inc(this_cpu_ptr(police->common.cpu_qstats)); +inc_drops: + if (ret == TC_ACT_SHOT) + qstats_drop_inc(this_cpu_ptr(police->common.cpu_qstats)); +end: + return ret; +} + +static void tcf_police_cleanup(struct tc_action *a) +{ + struct tcf_police *police = to_police(a); + struct tcf_police_params *p; + + p = rcu_dereference_protected(police->params, 1); + if (p) + kfree_rcu(p, rcu); } static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, @@ -272,6 +294,7 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_police *police = to_police(a); + struct tcf_police_params *p; struct tc_police opt = { .index = police->tcf_index, .refcnt = refcount_read(&police->tcf_refcnt) - ref, @@ -281,19 +304,21 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, spin_lock_bh(&police->tcf_lock); opt.action = police->tcf_action; - opt.mtu = police->tcfp_mtu; - opt.burst = PSCHED_NS2TICKS(police->tcfp_burst); - if (police->rate_present) - psched_ratecfg_getrate(&opt.rate, &police->rate); - if (police->peak_present) - psched_ratecfg_getrate(&opt.peakrate, &police->peak); + p = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + opt.mtu = p->tcfp_mtu; + opt.burst = PSCHED_NS2TICKS(p->tcfp_burst); + if (p->rate_present) + psched_ratecfg_getrate(&opt.rate, &p->rate); + if (p->peak_present) + psched_ratecfg_getrate(&opt.peakrate, &p->peak); if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) goto nla_put_failure; - if (police->tcfp_result && - nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result)) + if (p->tcfp_result && + nla_put_u32(skb, TCA_POLICE_RESULT, p->tcfp_result)) goto nla_put_failure; - if (police->tcfp_ewma_rate && - nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate)) + if (p->tcfp_ewma_rate && + nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate)) goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install); @@ -332,6 +357,7 @@ static struct tc_action_ops act_police_ops = { .init = tcf_police_init, .walk = tcf_police_walker, .lookup = tcf_police_search, + .cleanup = tcf_police_cleanup, .size = sizeof(struct tcf_police), }; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 83a133375d6d..1a0c682fd734 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -69,7 +69,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, - &act_sample_ops, bind, false); + &act_sample_ops, bind, true); if (ret) { tcf_idr_cleanup(tn, parm->index); return ret; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index b6263704ea57..64dba3708fce 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -99,7 +99,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); - struct tcf_skbedit_params *params_old, *params_new; + struct tcf_skbedit_params *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; struct tcf_skbedit *d; @@ -187,8 +187,6 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, } } - ASSERT_RTNL(); - params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { if (ret == ACT_P_CREATED) @@ -210,11 +208,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (flags & SKBEDIT_F_MASK) params_new->mask = *mask; + spin_lock_bh(&d->tcf_lock); d->tcf_action = parm->action; - params_old = rtnl_dereference(d->params); - rcu_assign_pointer(d->params, params_new); - if (params_old) - kfree_rcu(params_old, rcu); + rcu_swap_protected(d->params, params_new, + lockdep_is_held(&d->tcf_lock)); + spin_unlock_bh(&d->tcf_lock); + if (params_new) + kfree_rcu(params_new, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -231,12 +231,14 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - .action = d->tcf_action, }; u64 pure_flags = 0; struct tcf_t t; - params = rtnl_dereference(d->params); + spin_lock_bh(&d->tcf_lock); + params = rcu_dereference_protected(d->params, + lockdep_is_held(&d->tcf_lock)); + opt.action = d->tcf_action; if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -264,9 +266,12 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; + spin_unlock_bh(&d->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 6d95b6919d9d..4cca8f274662 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -317,7 +317,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, &metadata->u.tun_info, opts_len, extack); if (ret < 0) - goto err_out; + goto release_tun_meta; } metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; @@ -333,23 +333,24 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, &act_tunnel_key_ops, bind, true); if (ret) { NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); - goto err_out; + goto release_tun_meta; } ret = ACT_P_CREATED; } else if (!ovr) { - tcf_idr_release(*a, bind); NL_SET_ERR_MSG(extack, "TC IDR already exists"); - return -EEXIST; + ret = -EEXIST; + goto release_tun_meta; } t = to_tunnel_key(*a); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { - tcf_idr_release(*a, bind); NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); - return -ENOMEM; + ret = -ENOMEM; + exists = true; + goto release_tun_meta; } params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; @@ -367,6 +368,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, return ret; +release_tun_meta: + dst_release(&metadata->dst); + err_out: if (exists) tcf_idr_release(*a, bind); @@ -408,8 +412,10 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE, opt->type) || nla_put(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA, - opt->length * 4, opt + 1)) + opt->length * 4, opt + 1)) { + nla_nest_cancel(skb, start); return -EMSGSIZE; + } len -= sizeof(struct geneve_opt) + opt->length * 4; src += sizeof(struct geneve_opt) + opt->length * 4; @@ -423,7 +429,7 @@ static int tunnel_key_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { struct nlattr *start; - int err; + int err = -EINVAL; if (!info->options_len) return 0; @@ -435,9 +441,11 @@ static int tunnel_key_opts_dump(struct sk_buff *skb, if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { err = tunnel_key_geneve_opts_dump(skb, info); if (err) - return err; + goto err_out; } else { - return -EINVAL; +err_out: + nla_nest_cancel(skb, start); + return err; } nla_nest_end(skb, start); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 1a67af8a6e8c..3de47e99b788 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -240,8 +240,8 @@ static void tcf_chain_destroy(struct tcf_chain *chain) if (!chain->index) block->chain0.chain = NULL; kfree(chain); - if (list_empty(&block->chain_list) && block->refcnt == 0) - kfree(block); + if (list_empty(&block->chain_list) && !refcount_read(&block->refcnt)) + kfree_rcu(block, rcu); } static void tcf_chain_hold(struct tcf_chain *chain) @@ -473,6 +473,7 @@ tcf_chain0_head_change_cb_del(struct tcf_block *block, } struct tcf_net { + spinlock_t idr_lock; /* Protects idr */ struct idr idr; }; @@ -482,16 +483,25 @@ static int tcf_block_insert(struct tcf_block *block, struct net *net, struct netlink_ext_ack *extack) { struct tcf_net *tn = net_generic(net, tcf_net_id); + int err; + + idr_preload(GFP_KERNEL); + spin_lock(&tn->idr_lock); + err = idr_alloc_u32(&tn->idr, block, &block->index, block->index, + GFP_NOWAIT); + spin_unlock(&tn->idr_lock); + idr_preload_end(); - return idr_alloc_u32(&tn->idr, block, &block->index, block->index, - GFP_KERNEL); + return err; } static void tcf_block_remove(struct tcf_block *block, struct net *net) { struct tcf_net *tn = net_generic(net, tcf_net_id); + spin_lock(&tn->idr_lock); idr_remove(&tn->idr, block->index); + spin_unlock(&tn->idr_lock); } static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, @@ -510,7 +520,7 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, INIT_LIST_HEAD(&block->owner_list); INIT_LIST_HEAD(&block->chain0.filter_chain_list); - block->refcnt = 1; + refcount_set(&block->refcnt, 1); block->net = net; block->index = block_index; @@ -527,6 +537,78 @@ static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index) return idr_find(&tn->idr, block_index); } +static struct tcf_block *tcf_block_refcnt_get(struct net *net, u32 block_index) +{ + struct tcf_block *block; + + rcu_read_lock(); + block = tcf_block_lookup(net, block_index); + if (block && !refcount_inc_not_zero(&block->refcnt)) + block = NULL; + rcu_read_unlock(); + + return block; +} + +static void tcf_block_flush_all_chains(struct tcf_block *block) +{ + struct tcf_chain *chain; + + /* Hold a refcnt for all chains, so that they don't disappear + * while we are iterating. + */ + list_for_each_entry(chain, &block->chain_list, list) + tcf_chain_hold(chain); + + list_for_each_entry(chain, &block->chain_list, list) + tcf_chain_flush(chain); +} + +static void tcf_block_put_all_chains(struct tcf_block *block) +{ + struct tcf_chain *chain, *tmp; + + /* At this point, all the chains should have refcnt >= 1. */ + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) { + tcf_chain_put_explicitly_created(chain); + tcf_chain_put(chain); + } +} + +static void __tcf_block_put(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ + if (refcount_dec_and_test(&block->refcnt)) { + /* Flushing/putting all chains will cause the block to be + * deallocated when last chain is freed. However, if chain_list + * is empty, block has to be manually deallocated. After block + * reference counter reached 0, it is no longer possible to + * increment it or add new chains to block. + */ + bool free_block = list_empty(&block->chain_list); + + if (tcf_block_shared(block)) + tcf_block_remove(block, block->net); + if (!free_block) + tcf_block_flush_all_chains(block); + + if (q) + tcf_block_offload_unbind(block, q, ei); + + if (free_block) + kfree_rcu(block, rcu); + else + tcf_block_put_all_chains(block); + } else if (q) { + tcf_block_offload_unbind(block, q, ei); + } +} + +static void tcf_block_refcnt_put(struct tcf_block *block) +{ + __tcf_block_put(block, NULL, NULL); +} + /* Find tcf block. * Set q, parent, cl when appropriate. */ @@ -537,9 +619,10 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, struct netlink_ext_ack *extack) { struct tcf_block *block; + int err = 0; if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) { - block = tcf_block_lookup(net, block_index); + block = tcf_block_refcnt_get(net, block_index); if (!block) { NL_SET_ERR_MSG(extack, "Block of given index was not found"); return ERR_PTR(-EINVAL); @@ -548,55 +631,104 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, const struct Qdisc_class_ops *cops; struct net_device *dev; + rcu_read_lock(); + /* Find link */ - dev = __dev_get_by_index(net, ifindex); - if (!dev) + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + rcu_read_unlock(); return ERR_PTR(-ENODEV); + } /* Find qdisc */ if (!*parent) { *q = dev->qdisc; *parent = (*q)->handle; } else { - *q = qdisc_lookup(dev, TC_H_MAJ(*parent)); + *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent)); if (!*q) { NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); - return ERR_PTR(-EINVAL); + err = -EINVAL; + goto errout_rcu; } } + *q = qdisc_refcount_inc_nz(*q); + if (!*q) { + NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); + err = -EINVAL; + goto errout_rcu; + } + /* Is it classful? */ cops = (*q)->ops->cl_ops; if (!cops) { NL_SET_ERR_MSG(extack, "Qdisc not classful"); - return ERR_PTR(-EINVAL); + err = -EINVAL; + goto errout_rcu; } if (!cops->tcf_block) { NL_SET_ERR_MSG(extack, "Class doesn't support blocks"); - return ERR_PTR(-EOPNOTSUPP); + err = -EOPNOTSUPP; + goto errout_rcu; } + /* At this point we know that qdisc is not noop_qdisc, + * which means that qdisc holds a reference to net_device + * and we hold a reference to qdisc, so it is safe to release + * rcu read lock. + */ + rcu_read_unlock(); + /* Do we search for filter, attached to class? */ if (TC_H_MIN(*parent)) { *cl = cops->find(*q, *parent); if (*cl == 0) { NL_SET_ERR_MSG(extack, "Specified class doesn't exist"); - return ERR_PTR(-ENOENT); + err = -ENOENT; + goto errout_qdisc; } } /* And the last stroke */ block = cops->tcf_block(*q, *cl, extack); - if (!block) - return ERR_PTR(-EINVAL); + if (!block) { + err = -EINVAL; + goto errout_qdisc; + } if (tcf_block_shared(block)) { NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters"); - return ERR_PTR(-EOPNOTSUPP); + err = -EOPNOTSUPP; + goto errout_qdisc; } + + /* Always take reference to block in order to support execution + * of rules update path of cls API without rtnl lock. Caller + * must release block when it is finished using it. 'if' block + * of this conditional obtain reference to block by calling + * tcf_block_refcnt_get(). + */ + refcount_inc(&block->refcnt); } return block; + +errout_rcu: + rcu_read_unlock(); +errout_qdisc: + if (*q) + qdisc_put(*q); + return ERR_PTR(err); +} + +static void tcf_block_release(struct Qdisc *q, struct tcf_block *block) +{ + if (!IS_ERR_OR_NULL(block)) + tcf_block_refcnt_put(block); + + if (q) + qdisc_put(q); } struct tcf_block_owner_item { @@ -664,21 +796,16 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, { struct net *net = qdisc_net(q); struct tcf_block *block = NULL; - bool created = false; int err; - if (ei->block_index) { + if (ei->block_index) /* block_index not 0 means the shared block is requested */ - block = tcf_block_lookup(net, ei->block_index); - if (block) - block->refcnt++; - } + block = tcf_block_refcnt_get(net, ei->block_index); if (!block) { block = tcf_block_create(net, q, ei->block_index, extack); if (IS_ERR(block)) return PTR_ERR(block); - created = true; if (tcf_block_shared(block)) { err = tcf_block_insert(block, net, extack); if (err) @@ -708,14 +835,8 @@ err_block_offload_bind: err_chain0_head_change_cb_add: tcf_block_owner_del(block, q, ei->binder_type); err_block_owner_add: - if (created) { - if (tcf_block_shared(block)) - tcf_block_remove(block, net); err_block_insert: - kfree(block); - } else { - block->refcnt--; - } + tcf_block_refcnt_put(block); return err; } EXPORT_SYMBOL(tcf_block_get_ext); @@ -747,42 +868,12 @@ EXPORT_SYMBOL(tcf_block_get); void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { - struct tcf_chain *chain, *tmp; - if (!block) return; tcf_chain0_head_change_cb_del(block, ei); tcf_block_owner_del(block, q, ei->binder_type); - if (block->refcnt == 1) { - if (tcf_block_shared(block)) - tcf_block_remove(block, block->net); - - /* Hold a refcnt for all chains, so that they don't disappear - * while we are iterating. - */ - list_for_each_entry(chain, &block->chain_list, list) - tcf_chain_hold(chain); - - list_for_each_entry(chain, &block->chain_list, list) - tcf_chain_flush(chain); - } - - tcf_block_offload_unbind(block, q, ei); - - if (block->refcnt == 1) { - /* At this point, all the chains should have refcnt >= 1. */ - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) { - tcf_chain_put_explicitly_created(chain); - tcf_chain_put(chain); - } - - block->refcnt--; - if (list_empty(&block->chain_list)) - kfree(block); - } else { - block->refcnt--; - } + __tcf_block_put(block, q, ei); } EXPORT_SYMBOL(tcf_block_put_ext); @@ -1332,6 +1423,7 @@ replay: errout: if (chain) tcf_chain_put(chain); + tcf_block_release(q, block); if (err == -EAGAIN) /* Replay the request. */ goto replay; @@ -1453,6 +1545,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, errout: if (chain) tcf_chain_put(chain); + tcf_block_release(q, block); return err; } @@ -1538,6 +1631,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, errout: if (chain) tcf_chain_put(chain); + tcf_block_release(q, block); return err; } @@ -1636,7 +1730,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) return err; if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { - block = tcf_block_lookup(net, tcm->tcm_block_index); + block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) goto out; /* If we work with block index, q is NULL and parent value @@ -1695,6 +1789,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) } } + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) + tcf_block_refcnt_put(block); cb->args[0] = index; out: @@ -1854,7 +1950,8 @@ replay: chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); - return -EINVAL; + err = -EINVAL; + goto errout_block; } chain = tcf_chain_lookup(block, chain_index); if (n->nlmsg_type == RTM_NEWCHAIN) { @@ -1866,23 +1963,27 @@ replay: tcf_chain_hold(chain); } else { NL_SET_ERR_MSG(extack, "Filter chain already exists"); - return -EEXIST; + err = -EEXIST; + goto errout_block; } } else { if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); - return -ENOENT; + err = -ENOENT; + goto errout_block; } chain = tcf_chain_create(block, chain_index); if (!chain) { NL_SET_ERR_MSG(extack, "Failed to create filter chain"); - return -ENOMEM; + err = -ENOMEM; + goto errout_block; } } } else { if (!chain || tcf_chain_held_by_acts_only(chain)) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); - return -EINVAL; + err = -EINVAL; + goto errout_block; } tcf_chain_hold(chain); } @@ -1902,6 +2003,8 @@ replay: RTM_NEWCHAIN, false); break; case RTM_DELCHAIN: + tfilter_notify_chain(net, skb, block, q, parent, n, + chain, RTM_DELTFILTER); /* Flush the chain first as the user requested chain removal. */ tcf_chain_flush(chain); /* In case the chain was successfully deleted, put a reference @@ -1924,6 +2027,8 @@ replay: errout: tcf_chain_put(chain); +errout_block: + tcf_block_release(q, block); if (err == -EAGAIN) /* Replay the request. */ goto replay; @@ -1952,7 +2057,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) return err; if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { - block = tcf_block_lookup(net, tcm->tcm_block_index); + block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) goto out; /* If we work with block index, q is NULL and parent value @@ -2019,6 +2124,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) index++; } + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) + tcf_block_refcnt_put(block); cb->args[0] = index; out: @@ -2211,6 +2318,7 @@ static __net_init int tcf_net_init(struct net *net) { struct tcf_net *tn = net_generic(net, tcf_net_id); + spin_lock_init(&tn->idr_lock); idr_init(&tn->idr); return 0; } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 6fd9bdd93796..92dd5071a708 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -79,6 +79,7 @@ struct fl_flow_tmplt { struct fl_flow_key mask; struct flow_dissector dissector; struct tcf_chain *chain; + struct rcu_work rwork; }; struct cls_fl_head { @@ -98,7 +99,7 @@ struct cls_fl_filter { struct list_head list; u32 handle; u32 flags; - unsigned int in_hw_count; + u32 in_hw_count; struct rcu_work rwork; struct net_device *hw_dev; }; @@ -993,7 +994,7 @@ static int fl_init_mask_hashtable(struct fl_flow_mask *mask) } #define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) -#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member)) +#define FL_KEY_MEMBER_SIZE(member) FIELD_SIZEOF(struct fl_flow_key, member) #define FL_KEY_IS_MASKED(mask, member) \ memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \ @@ -1437,14 +1438,22 @@ errout_tb: return ERR_PTR(err); } -static void fl_tmplt_destroy(void *tmplt_priv) +static void fl_tmplt_destroy_work(struct work_struct *work) { - struct fl_flow_tmplt *tmplt = tmplt_priv; + struct fl_flow_tmplt *tmplt = container_of(to_rcu_work(work), + struct fl_flow_tmplt, rwork); fl_hw_destroy_tmplt(tmplt->chain, tmplt); kfree(tmplt); } +static void fl_tmplt_destroy(void *tmplt_priv) +{ + struct fl_flow_tmplt *tmplt = tmplt_priv; + + tcf_queue_work(&tmplt->rwork, fl_tmplt_destroy_work); +} + static int fl_dump_key_val(struct sk_buff *skb, void *val, int val_type, void *mask, int mask_type, int len) @@ -1880,6 +1889,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count)) + goto nla_put_failure; + if (tcf_exts_dump(skb, &f->exts)) goto nla_put_failure; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 98541c6399db..22e9799e5b69 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -27,7 +27,6 @@ #include <linux/kmod.h> #include <linux/list.h> #include <linux/hrtimer.h> -#include <linux/lockdep.h> #include <linux/slab.h> #include <linux/hashtable.h> @@ -315,6 +314,24 @@ out: return q; } +struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) +{ + struct netdev_queue *nq; + struct Qdisc *q; + + if (!handle) + return NULL; + q = qdisc_match_from_root(dev->qdisc, handle); + if (q) + goto out; + + nq = dev_ingress_queue_rcu(dev); + if (nq) + q = qdisc_match_from_root(nq->qdisc_sleeping, handle); +out: + return q; +} + static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) { unsigned long cl; @@ -921,7 +938,7 @@ static void notify_and_destroy(struct net *net, struct sk_buff *skb, qdisc_notify(net, skb, n, clid, old, new); if (old) - qdisc_destroy(old); + qdisc_put(old); } /* Graft qdisc "new" to class "classid" of qdisc "parent" or @@ -974,7 +991,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, qdisc_refcount_inc(new); if (!ingress) - qdisc_destroy(old); + qdisc_put(old); } skip: @@ -1053,10 +1070,6 @@ static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca, return 0; } -/* lockdep annotation is needed for ingress; egress gets it only for name */ -static struct lock_class_key qdisc_tx_lock; -static struct lock_class_key qdisc_rx_lock; - /* Allocate and initialize new qdisc. @@ -1121,7 +1134,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev, if (handle == TC_H_INGRESS) { sch->flags |= TCQ_F_INGRESS; handle = TC_H_MAKE(TC_H_INGRESS, 0); - lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock); } else { if (handle == 0) { handle = qdisc_alloc_handle(dev); @@ -1129,7 +1141,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev, if (handle == 0) goto err_out3; } - lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock); if (!netif_is_multiqueue(dev)) sch->flags |= TCQ_F_ONETXQUEUE; } @@ -1568,7 +1579,7 @@ graft: err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack); if (err) { if (q) - qdisc_destroy(q); + qdisc_put(q); return err; } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index cd49afca9617..d714d3747bcb 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -150,7 +150,7 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl) pr_debug("atm_tc_put: destroying\n"); list_del_init(&flow->list); pr_debug("atm_tc_put: qdisc %p\n", flow->q); - qdisc_destroy(flow->q); + qdisc_put(flow->q); tcf_block_put(flow->block); if (flow->sock) { pr_debug("atm_tc_put: f_count %ld\n", diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index c07c30b916d5..dc539295ae65 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -812,7 +812,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow) if (skb) { flow->head = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); } return skb; @@ -1252,7 +1252,7 @@ found: else flow->head = elig_ack->next; - elig_ack->next = NULL; + skb_mark_not_on_list(elig_ack); return elig_ack; } @@ -1675,7 +1675,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, while (segs) { nskb = segs->next; - segs->next = NULL; + skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; cobalt_set_enqueue_time(segs, now); get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index f42025d53cfe..4dc05409e3fb 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1418,7 +1418,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) WARN_ON(cl->filters); tcf_block_put(cl->block); - qdisc_destroy(cl->q); + qdisc_put(cl->q); qdisc_put_rtab(cl->R_tab); gen_kill_estimator(&cl->rate_est); if (cl != &q->link) diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index e26a24017faa..e689e11b6d0f 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -379,7 +379,7 @@ static void cbs_destroy(struct Qdisc *sch) cbs_disable_offload(dev, q); if (q->qdisc) - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); } static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index e0b0cf8a9939..cdebaed0f8cf 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -134,7 +134,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); - qdisc_destroy(cl->qdisc); + qdisc_put(cl->qdisc); kfree(cl); return err; } @@ -153,7 +153,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl) { gen_kill_estimator(&cl->rate_est); - qdisc_destroy(cl->qdisc); + qdisc_put(cl->qdisc); kfree(cl); } diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 049714c57075..f6f480784bc6 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -412,7 +412,7 @@ static void dsmark_destroy(struct Qdisc *sch) pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); tcf_block_put(p->block); - qdisc_destroy(p->q); + qdisc_put(p->q); if (p->mv != p->embedded) kfree(p->mv); } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 24893d3b5d22..3809c9bf8896 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -177,7 +177,7 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, if (q) { err = fifo_set_limit(q, limit); if (err < 0) { - qdisc_destroy(q); + qdisc_put(q); q = NULL; } } diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 4808713c73b9..628a2cdcfc6f 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -106,7 +106,6 @@ struct fq_sched_data { u64 stat_gc_flows; u64 stat_internal_packets; - u64 stat_tcp_retrans; u64 stat_throttled; u64 stat_flows_plimit; u64 stat_pkts_too_long; @@ -319,7 +318,7 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) if (skb) { flow->head = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); flow->qlen--; qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; @@ -327,62 +326,17 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) return skb; } -/* We might add in the future detection of retransmits - * For the time being, just return false - */ -static bool skb_is_retransmit(struct sk_buff *skb) -{ - return false; -} - -/* add skb to flow queue - * flow queue is a linked list, kind of FIFO, except for TCP retransmits - * We special case tcp retransmits to be transmitted before other packets. - * We rely on fact that TCP retransmits are unlikely, so we do not waste - * a separate queue or a pointer. - * head-> [retrans pkt 1] - * [retrans pkt 2] - * [ normal pkt 1] - * [ normal pkt 2] - * [ normal pkt 3] - * tail-> [ normal pkt 4] - */ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) { - struct sk_buff *prev, *head = flow->head; + struct sk_buff *head = flow->head; skb->next = NULL; - if (!head) { + if (!head) flow->head = skb; - flow->tail = skb; - return; - } - if (likely(!skb_is_retransmit(skb))) { + else flow->tail->next = skb; - flow->tail = skb; - return; - } - /* This skb is a tcp retransmit, - * find the last retrans packet in the queue - */ - prev = NULL; - while (skb_is_retransmit(head)) { - prev = head; - head = head->next; - if (!head) - break; - } - if (!prev) { /* no rtx packet in queue, become the new head */ - skb->next = flow->head; - flow->head = skb; - } else { - if (prev == flow->tail) - flow->tail = skb; - else - skb->next = prev->next; - prev->next = skb; - } + flow->tail = skb; } static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -401,8 +355,6 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } f->qlen++; - if (skb_is_retransmit(skb)) - q->stat_tcp_retrans++; qdisc_qstats_backlog_inc(sch, skb); if (fq_flow_is_detached(f)) { struct sock *sk = skb->sk; @@ -460,7 +412,7 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now) static struct sk_buff *fq_dequeue(struct Qdisc *sch) { struct fq_sched_data *q = qdisc_priv(sch); - u64 now = ktime_get_ns(); + u64 now = ktime_get_tai_ns(); struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; @@ -491,11 +443,16 @@ begin: } skb = f->head; - if (unlikely(skb && now < f->time_next_packet && - !skb_is_tcp_pure_ack(skb))) { - head->first = f->next; - fq_flow_set_throttled(q, f); - goto begin; + if (skb && !skb_is_tcp_pure_ack(skb)) { + u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp), + f->time_next_packet); + + if (now < time_next_packet) { + head->first = f->next; + f->time_next_packet = time_next_packet; + fq_flow_set_throttled(q, f); + goto begin; + } } skb = fq_dequeue_head(sch, f); @@ -513,11 +470,7 @@ begin: prefetch(&skb->end); f->credit -= qdisc_pkt_len(skb); - if (!q->rate_enable) - goto out; - - /* Do not pace locally generated ack packets */ - if (skb_is_tcp_pure_ack(skb)) + if (ktime_to_ns(skb->tstamp) || !q->rate_enable) goto out; rate = q->flow_max_rate; @@ -823,7 +776,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->fq_trees_log = ilog2(1024); q->orphan_mask = 1024 - 1; q->low_rate_threshold = 550000 / 8; - qdisc_watchdog_init(&q->watchdog, sch); + qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_TAI); if (opt) err = fq_change(sch, opt, extack); @@ -873,12 +826,12 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.gc_flows = q->stat_gc_flows; st.highprio_packets = q->stat_internal_packets; - st.tcp_retrans = q->stat_tcp_retrans; + st.tcp_retrans = 0; st.throttled = q->stat_throttled; st.flows_plimit = q->stat_flows_plimit; st.pkts_too_long = q->stat_pkts_too_long; st.allocation_errors = q->stat_allocation_errors; - st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns(); + st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_tai_ns(); st.flows = q->flows; st.inactive_flows = q->inactive_flows; st.throttled_flows = q->throttled_flows; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 6c0a9d5dbf94..cd04d40c30b6 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -124,7 +124,7 @@ static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow) struct sk_buff *skb = flow->head; flow->head = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); return skb; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 69078c82963e..531fac1d2875 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -184,7 +184,7 @@ static void try_bulk_dequeue_skb(struct Qdisc *q, skb = nskb; (*packets)++; /* GSO counts as one pkt */ } - skb->next = NULL; + skb_mark_not_on_list(skb); } /* This variant of try_bulk_dequeue_skb() makes sure @@ -210,7 +210,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, skb = nskb; } while (++cnt < 8); (*packets) += cnt; - skb->next = NULL; + skb_mark_not_on_list(skb); } /* Note that dequeue_skb can possibly return a SKB list (via skb->next). @@ -901,7 +901,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, if (!ops->init || ops->init(sch, NULL, extack) == 0) return sch; - qdisc_destroy(sch); + qdisc_put(sch); return NULL; } EXPORT_SYMBOL(qdisc_create_dflt); @@ -941,15 +941,18 @@ void qdisc_free(struct Qdisc *qdisc) kfree((char *) qdisc - qdisc->padded); } -void qdisc_destroy(struct Qdisc *qdisc) +void qdisc_free_cb(struct rcu_head *head) +{ + struct Qdisc *q = container_of(head, struct Qdisc, rcu); + + qdisc_free(q); +} + +static void qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; struct sk_buff *skb, *tmp; - if (qdisc->flags & TCQ_F_BUILTIN || - !refcount_dec_and_test(&qdisc->refcnt)) - return; - #ifdef CONFIG_NET_SCHED qdisc_hash_del(qdisc); @@ -974,9 +977,34 @@ void qdisc_destroy(struct Qdisc *qdisc) kfree_skb_list(skb); } - qdisc_free(qdisc); + call_rcu(&qdisc->rcu, qdisc_free_cb); +} + +void qdisc_put(struct Qdisc *qdisc) +{ + if (qdisc->flags & TCQ_F_BUILTIN || + !refcount_dec_and_test(&qdisc->refcnt)) + return; + + qdisc_destroy(qdisc); +} +EXPORT_SYMBOL(qdisc_put); + +/* Version of qdisc_put() that is called with rtnl mutex unlocked. + * Intended to be used as optimization, this function only takes rtnl lock if + * qdisc reference counter reached zero. + */ + +void qdisc_put_unlocked(struct Qdisc *qdisc) +{ + if (qdisc->flags & TCQ_F_BUILTIN || + !refcount_dec_and_rtnl_lock(&qdisc->refcnt)) + return; + + qdisc_destroy(qdisc); + rtnl_unlock(); } -EXPORT_SYMBOL(qdisc_destroy); +EXPORT_SYMBOL(qdisc_put_unlocked); /* Attach toplevel qdisc to device queue. */ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, @@ -1270,7 +1298,7 @@ static void shutdown_scheduler_queue(struct net_device *dev, rcu_assign_pointer(dev_queue->qdisc, qdisc_default); dev_queue->qdisc_sleeping = qdisc_default; - qdisc_destroy(qdisc); + qdisc_put(qdisc); } } @@ -1279,7 +1307,7 @@ void dev_shutdown(struct net_device *dev) netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); - qdisc_destroy(dev->qdisc); + qdisc_put(dev->qdisc); dev->qdisc = &noop_qdisc; WARN_ON(timer_pending(&dev->watchdog_timer)); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 3278a76f6861..b18ec1f6de60 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1092,7 +1092,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) struct hfsc_sched *q = qdisc_priv(sch); tcf_block_put(cl->block); - qdisc_destroy(cl->qdisc); + qdisc_put(cl->qdisc); gen_kill_estimator(&cl->rate_est); if (cl != &q->root) kfree(cl); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index c3a8388dcdf6..9d6a47697406 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -330,7 +330,7 @@ static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) struct sk_buff *skb = bucket->head; bucket->head = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); return skb; } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 43c4bfe625a9..58b449490757 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -132,7 +132,7 @@ struct htb_class { struct htb_class_inner { struct htb_prio clprio[TC_HTB_NUMPRIO]; } inner; - } un; + }; s64 pq_key; int prio_activity; /* for which prios are we active */ @@ -411,13 +411,13 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) int prio = ffz(~m); m &= ~(1 << prio); - if (p->un.inner.clprio[prio].feed.rb_node) + if (p->inner.clprio[prio].feed.rb_node) /* parent already has its feed in use so that * reset bit in mask as parent is already ok */ mask &= ~(1 << prio); - htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio); + htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio); } p->prio_activity |= mask; cl = p; @@ -447,19 +447,19 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) int prio = ffz(~m); m &= ~(1 << prio); - if (p->un.inner.clprio[prio].ptr == cl->node + prio) { + if (p->inner.clprio[prio].ptr == cl->node + prio) { /* we are removing child which is pointed to from * parent feed - forget the pointer but remember * classid */ - p->un.inner.clprio[prio].last_ptr_id = cl->common.classid; - p->un.inner.clprio[prio].ptr = NULL; + p->inner.clprio[prio].last_ptr_id = cl->common.classid; + p->inner.clprio[prio].ptr = NULL; } htb_safe_rb_erase(cl->node + prio, - &p->un.inner.clprio[prio].feed); + &p->inner.clprio[prio].feed); - if (!p->un.inner.clprio[prio].feed.rb_node) + if (!p->inner.clprio[prio].feed.rb_node) mask |= 1 << prio; } @@ -555,7 +555,7 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) */ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) { - WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen); + WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen); if (!cl->prio_activity) { cl->prio_activity = 1 << cl->prio; @@ -577,22 +577,6 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) cl->prio_activity = 0; } -static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, - struct qdisc_skb_head *qh) -{ - struct sk_buff *last = qh->tail; - - if (last) { - skb->next = NULL; - last->next = skb; - qh->tail = skb; - } else { - qh->tail = skb; - qh->head = skb; - } - qh->qlen++; -} - static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -603,7 +587,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (cl == HTB_DIRECT) { /* enqueue to helper queue */ if (q->direct_queue.qlen < q->direct_qlen) { - htb_enqueue_tail(skb, sch, &q->direct_queue); + __qdisc_enqueue_tail(skb, &q->direct_queue); q->direct_pkts++; } else { return qdisc_drop(skb, sch, to_free); @@ -615,7 +599,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, __qdisc_drop(skb, to_free); return ret; #endif - } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q, + } else if ((ret = qdisc_enqueue(skb, cl->leaf.q, to_free)) != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) { qdisc_qstats_drop(sch); @@ -823,7 +807,7 @@ static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio) cl = rb_entry(*sp->pptr, struct htb_class, node[prio]); if (!cl->level) return cl; - clp = &cl->un.inner.clprio[prio]; + clp = &cl->inner.clprio[prio]; (++sp)->root = clp->feed.rb_node; sp->pptr = &clp->ptr; sp->pid = &clp->last_ptr_id; @@ -857,7 +841,7 @@ next: * graft operation on the leaf since last dequeue; * simply deactivate and skip such class */ - if (unlikely(cl->un.leaf.q->q.qlen == 0)) { + if (unlikely(cl->leaf.q->q.qlen == 0)) { struct htb_class *next; htb_deactivate(q, cl); @@ -873,12 +857,12 @@ next: goto next; } - skb = cl->un.leaf.q->dequeue(cl->un.leaf.q); + skb = cl->leaf.q->dequeue(cl->leaf.q); if (likely(skb != NULL)) break; - qdisc_warn_nonwc("htb", cl->un.leaf.q); - htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr: + qdisc_warn_nonwc("htb", cl->leaf.q); + htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr: &q->hlevel[0].hprio[prio].ptr); cl = htb_lookup_leaf(hprio, prio); @@ -886,16 +870,16 @@ next: if (likely(skb != NULL)) { bstats_update(&cl->bstats, skb); - cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb); - if (cl->un.leaf.deficit[level] < 0) { - cl->un.leaf.deficit[level] += cl->quantum; - htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr : + cl->leaf.deficit[level] -= qdisc_pkt_len(skb); + if (cl->leaf.deficit[level] < 0) { + cl->leaf.deficit[level] += cl->quantum; + htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr : &q->hlevel[0].hprio[prio].ptr); } /* this used to be after charge_class but this constelation * gives us slightly better performance */ - if (!cl->un.leaf.q->q.qlen) + if (!cl->leaf.q->q.qlen) htb_deactivate(q, cl); htb_charge_class(q, cl, level, skb); } @@ -972,10 +956,10 @@ static void htb_reset(struct Qdisc *sch) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->level) - memset(&cl->un.inner, 0, sizeof(cl->un.inner)); + memset(&cl->inner, 0, sizeof(cl->inner)); else { - if (cl->un.leaf.q) - qdisc_reset(cl->un.leaf.q); + if (cl->leaf.q) + qdisc_reset(cl->leaf.q); } cl->prio_activity = 0; cl->cmode = HTB_CAN_SEND; @@ -1098,8 +1082,8 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, */ tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT; tcm->tcm_handle = cl->common.classid; - if (!cl->level && cl->un.leaf.q) - tcm->tcm_info = cl->un.leaf.q->handle; + if (!cl->level && cl->leaf.q) + tcm->tcm_info = cl->leaf.q->handle; nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) @@ -1142,9 +1126,9 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) }; __u32 qlen = 0; - if (!cl->level && cl->un.leaf.q) { - qlen = cl->un.leaf.q->q.qlen; - qs.backlog = cl->un.leaf.q->qstats.backlog; + if (!cl->level && cl->leaf.q) { + qlen = cl->leaf.q->q.qlen; + qs.backlog = cl->leaf.q->qstats.backlog; } cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens), INT_MIN, INT_MAX); @@ -1172,14 +1156,14 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, cl->common.classid, extack)) == NULL) return -ENOBUFS; - *old = qdisc_replace(sch, new, &cl->un.leaf.q); + *old = qdisc_replace(sch, new, &cl->leaf.q); return 0; } static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg) { struct htb_class *cl = (struct htb_class *)arg; - return !cl->level ? cl->un.leaf.q : NULL; + return !cl->level ? cl->leaf.q : NULL; } static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg) @@ -1205,15 +1189,15 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, { struct htb_class *parent = cl->parent; - WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity); + WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity); if (parent->cmode != HTB_CAN_SEND) htb_safe_rb_erase(&parent->pq_node, &q->hlevel[parent->level].wait_pq); parent->level = 0; - memset(&parent->un.inner, 0, sizeof(parent->un.inner)); - parent->un.leaf.q = new_q ? new_q : &noop_qdisc; + memset(&parent->inner, 0, sizeof(parent->inner)); + parent->leaf.q = new_q ? new_q : &noop_qdisc; parent->tokens = parent->buffer; parent->ctokens = parent->cbuffer; parent->t_c = ktime_get_ns(); @@ -1223,8 +1207,8 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) { if (!cl->level) { - WARN_ON(!cl->un.leaf.q); - qdisc_destroy(cl->un.leaf.q); + WARN_ON(!cl->leaf.q); + qdisc_put(cl->leaf.q); } gen_kill_estimator(&cl->rate_est); tcf_block_put(cl->block); @@ -1286,11 +1270,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) sch_tree_lock(sch); if (!cl->level) { - unsigned int qlen = cl->un.leaf.q->q.qlen; - unsigned int backlog = cl->un.leaf.q->qstats.backlog; + unsigned int qlen = cl->leaf.q->q.qlen; + unsigned int backlog = cl->leaf.q->qstats.backlog; - qdisc_reset(cl->un.leaf.q); - qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog); + qdisc_reset(cl->leaf.q); + qdisc_tree_reduce_backlog(cl->leaf.q, qlen, backlog); } /* delete from hash and active; remainder in destroy_class */ @@ -1419,13 +1403,13 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, classid, NULL); sch_tree_lock(sch); if (parent && !parent->level) { - unsigned int qlen = parent->un.leaf.q->q.qlen; - unsigned int backlog = parent->un.leaf.q->qstats.backlog; + unsigned int qlen = parent->leaf.q->q.qlen; + unsigned int backlog = parent->leaf.q->qstats.backlog; /* turn parent into inner node */ - qdisc_reset(parent->un.leaf.q); - qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog); - qdisc_destroy(parent->un.leaf.q); + qdisc_reset(parent->leaf.q); + qdisc_tree_reduce_backlog(parent->leaf.q, qlen, backlog); + qdisc_put(parent->leaf.q); if (parent->prio_activity) htb_deactivate(q, parent); @@ -1436,10 +1420,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, } parent->level = (parent->parent ? parent->parent->level : TC_HTB_MAXDEPTH) - 1; - memset(&parent->un.inner, 0, sizeof(parent->un.inner)); + memset(&parent->inner, 0, sizeof(parent->inner)); } /* leaf (we) needs elementary qdisc */ - cl->un.leaf.q = new_q ? new_q : &noop_qdisc; + cl->leaf.q = new_q ? new_q : &noop_qdisc; cl->common.classid = classid; cl->parent = parent; @@ -1455,8 +1439,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, qdisc_class_hash_insert(&q->clhash, &cl->common); if (parent) parent->children++; - if (cl->un.leaf.q != &noop_qdisc) - qdisc_hash_add(cl->un.leaf.q, true); + if (cl->leaf.q != &noop_qdisc) + qdisc_hash_add(cl->leaf.q, true); } else { if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, @@ -1478,7 +1462,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); /* it used to be a nasty bug here, we have to check that node - * is really leaf before changing cl->un.leaf ! + * is really leaf before changing cl->leaf ! */ if (!cl->level) { u64 quantum = cl->rate.rate_bytes_ps; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index d6b8ae4ed7a3..f20f3a0f8424 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -65,7 +65,7 @@ static void mq_destroy(struct Qdisc *sch) if (!priv->qdiscs) return; for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) - qdisc_destroy(priv->qdiscs[ntx]); + qdisc_put(priv->qdiscs[ntx]); kfree(priv->qdiscs); } @@ -119,7 +119,7 @@ static void mq_attach(struct Qdisc *sch) qdisc = priv->qdiscs[ntx]; old = dev_graft_qdisc(qdisc->dev_queue, qdisc); if (old) - qdisc_destroy(old); + qdisc_put(old); #ifdef CONFIG_NET_SCHED if (ntx < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 0e9d761cdd80..d364e63c396d 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -40,7 +40,7 @@ static void mqprio_destroy(struct Qdisc *sch) for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) - qdisc_destroy(priv->qdiscs[ntx]); + qdisc_put(priv->qdiscs[ntx]); kfree(priv->qdiscs); } @@ -300,7 +300,7 @@ static void mqprio_attach(struct Qdisc *sch) qdisc = priv->qdiscs[ntx]; old = dev_graft_qdisc(qdisc->dev_queue, qdisc); if (old) - qdisc_destroy(old); + qdisc_put(old); if (ntx < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 1da7ea8de0ad..7410ce4d0321 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -175,7 +175,7 @@ multiq_destroy(struct Qdisc *sch) tcf_block_put(q->block); for (band = 0; band < q->bands; band++) - qdisc_destroy(q->queues[band]); + qdisc_put(q->queues[band]); kfree(q->queues); } @@ -204,7 +204,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, q->queues[i] = &noop_qdisc; qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog); - qdisc_destroy(child); + qdisc_put(child); } } @@ -228,7 +228,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog); - qdisc_destroy(old); + qdisc_put(old); } sch_tree_unlock(sch); } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index ad18a2052416..57b3ad9394ad 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -412,16 +412,6 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, return segs; } -static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb) -{ - skb->next = qh->head; - - if (!qh->head) - qh->tail = skb; - qh->head = skb; - qh->qlen++; -} - /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. @@ -570,7 +560,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, cb->time_to_send = ktime_get_ns(); q->counter = 0; - netem_enqueue_skb_head(&sch->q, skb); + __qdisc_enqueue_head(skb, &sch->q); sch->qstats.requeues++; } @@ -578,7 +568,7 @@ finish_segs: if (segs) { while (segs) { skb2 = segs->next; - segs->next = NULL; + skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; last_len = segs->len; rc = qdisc_enqueue(segs, sch, to_free); @@ -1032,7 +1022,7 @@ static void netem_destroy(struct Qdisc *sch) qdisc_watchdog_cancel(&q->watchdog); if (q->qdisc) - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); dist_free(q->delay_dist); dist_free(q->slot_dist); } diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 222e53d3d27a..f8af98621179 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -175,7 +175,7 @@ prio_destroy(struct Qdisc *sch) tcf_block_put(q->block); prio_offload(sch, NULL); for (prio = 0; prio < q->bands; prio++) - qdisc_destroy(q->queues[prio]); + qdisc_put(q->queues[prio]); } static int prio_tune(struct Qdisc *sch, struct nlattr *opt, @@ -205,7 +205,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, extack); if (!queues[i]) { while (i > oldbands) - qdisc_destroy(queues[--i]); + qdisc_put(queues[--i]); return -ENOMEM; } } @@ -220,7 +220,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog); - qdisc_destroy(child); + qdisc_put(child); } for (i = oldbands; i < q->bands; i++) { diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index bb1a9c11fc54..dc37c4ead439 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -526,7 +526,7 @@ set_change_agg: return 0; destroy_class: - qdisc_destroy(cl->qdisc); + qdisc_put(cl->qdisc); kfree(cl); return err; } @@ -537,7 +537,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) qfq_rm_from_agg(q, cl); gen_kill_estimator(&cl->rate_est); - qdisc_destroy(cl->qdisc); + qdisc_put(cl->qdisc); kfree(cl); } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 56c181c3feeb..3ce6c0a2c493 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -181,7 +181,7 @@ static void red_destroy(struct Qdisc *sch) del_timer_sync(&q->adapt_timer); red_offload(sch, false); - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); } static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { @@ -233,7 +233,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, if (child) { qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, q->qdisc->qstats.backlog); - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); q->qdisc = child; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 7cbdad8419b7..bab506b01a32 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -469,7 +469,7 @@ static void sfb_destroy(struct Qdisc *sch) struct sfb_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); } static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = { @@ -523,7 +523,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, q->qdisc->qstats.backlog); - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); q->qdisc = child; q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 6f74a426f159..942dcca09cf2 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -162,7 +162,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, nb = 0; while (segs) { nskb = segs->next; - segs->next = NULL; + skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; len += segs->len; ret = qdisc_enqueue(segs, q->qdisc, to_free); @@ -392,7 +392,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, if (child) { qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, q->qdisc->qstats.backlog); - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); q->qdisc = child; } q->limit = qopt->limit; @@ -438,7 +438,7 @@ static void tbf_destroy(struct Qdisc *sch) struct tbf_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); } static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 12cac85da994..033696e6f74f 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -260,6 +260,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) { struct dst_entry *dst = sctp_transport_dst_check(t); + struct sock *sk = t->asoc->base.sk; bool change = true; if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { @@ -271,12 +272,19 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) pmtu = SCTP_TRUNC4(pmtu); if (dst) { - dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu); + struct sctp_pf *pf = sctp_get_pf_specific(dst->ops->family); + union sctp_addr addr; + + pf->af->from_sk(&addr, sk); + pf->to_sk_daddr(&t->ipaddr, sk); + dst->ops->update_pmtu(dst, sk, NULL, pmtu); + pf->to_sk_daddr(&addr, sk); + dst = sctp_transport_dst_check(t); } if (!dst) { - t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk); + t->af_specific->get_dst(t, &t->saddr, &t->fl, sk); dst = t->dst; } diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 0b427100b0d4..331cc734e3db 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -459,7 +459,7 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ul * element in the queue, then count it towards * possible PD. */ - if (pos == ulpq->reasm.next) { + if (skb_queue_is_first(&ulpq->reasm, pos)) { pd_first = pos; pd_last = pos; pd_len = pos->len; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2d8a1e15e4f9..015231789ed2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -742,7 +742,10 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_err = -rc; out: - smc->sk.sk_state_change(&smc->sk); + if (smc->sk.sk_err) + smc->sk.sk_state_change(&smc->sk); + else + smc->sk.sk_write_space(&smc->sk); kfree(smc->connect_info); smc->connect_info = NULL; release_sock(&smc->sk); @@ -1150,9 +1153,9 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) } /* listen worker: finish RDMA setup */ -static void smc_listen_rdma_finish(struct smc_sock *new_smc, - struct smc_clc_msg_accept_confirm *cclc, - int local_contact) +static int smc_listen_rdma_finish(struct smc_sock *new_smc, + struct smc_clc_msg_accept_confirm *cclc, + int local_contact) { struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; int reason_code = 0; @@ -1175,11 +1178,12 @@ static void smc_listen_rdma_finish(struct smc_sock *new_smc, if (reason_code) goto decline; } - return; + return 0; decline: mutex_unlock(&smc_create_lgr_pending); smc_listen_decline(new_smc, reason_code, local_contact); + return reason_code; } /* setup for RDMA connection of server */ @@ -1276,8 +1280,10 @@ static void smc_listen_work(struct work_struct *work) } /* finish worker */ - if (!ism_supported) - smc_listen_rdma_finish(new_smc, &cclc, local_contact); + if (!ism_supported) { + if (smc_listen_rdma_finish(new_smc, &cclc, local_contact)) + return; + } smc_conn_save_peer_info(new_smc, &cclc); mutex_unlock(&smc_create_lgr_pending); smc_listen_out_connected(new_smc); @@ -1529,7 +1535,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, return EPOLLNVAL; smc = smc_sk(sock->sk); - if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { + if (smc->use_fallback) { /* delegate to CLC child sock */ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; @@ -1560,9 +1566,9 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; if (sk->sk_state == SMC_APPCLOSEWAIT1) mask |= EPOLLIN; + if (smc->conn.urg_state == SMC_URG_VALID) + mask |= EPOLLPRI; } - if (smc->conn.urg_state == SMC_URG_VALID) - mask |= EPOLLPRI; } return mask; diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 83aba9ade060..52241d679cc9 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -446,14 +446,12 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, vec[i++].iov_len = sizeof(trl); /* due to the few bytes needed for clc-handshake this cannot block */ len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen); - if (len < sizeof(pclc)) { - if (len >= 0) { - reason_code = -ENETUNREACH; - smc->sk.sk_err = -reason_code; - } else { - smc->sk.sk_err = smc->clcsock->sk->sk_err; - reason_code = -smc->sk.sk_err; - } + if (len < 0) { + smc->sk.sk_err = smc->clcsock->sk->sk_err; + reason_code = -smc->sk.sk_err; + } else if (len < (int)sizeof(pclc)) { + reason_code = -ENETUNREACH; + smc->sk.sk_err = -reason_code; } return reason_code; diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index ac961dfb1ea1..ea2b87f29469 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -100,15 +100,14 @@ static void smc_close_active_abort(struct smc_sock *smc) struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; - sk->sk_err = ECONNABORTED; - if (smc->clcsock && smc->clcsock->sk) { - smc->clcsock->sk->sk_err = ECONNABORTED; - smc->clcsock->sk->sk_state_change(smc->clcsock->sk); + if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) { + sk->sk_err = ECONNABORTED; + if (smc->clcsock && smc->clcsock->sk) { + smc->clcsock->sk->sk_err = ECONNABORTED; + smc->clcsock->sk->sk_state_change(smc->clcsock->sk); + } } switch (sk->sk_state) { - case SMC_INIT: - sk->sk_state = SMC_PEERABORTWAIT; - break; case SMC_ACTIVE: sk->sk_state = SMC_PEERABORTWAIT; release_sock(sk); @@ -143,6 +142,7 @@ static void smc_close_active_abort(struct smc_sock *smc) case SMC_PEERFINCLOSEWAIT: sock_put(sk); /* passive closing */ break; + case SMC_INIT: case SMC_PEERABORTWAIT: case SMC_CLOSED: break; diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 01c6ce042a1c..7cb3e4f07c10 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -461,7 +461,7 @@ static const struct genl_ops smc_pnet_ops[] = { }; /* SMC_PNETID family definition */ -static struct genl_family smc_pnet_nl_family = { +static struct genl_family smc_pnet_nl_family __ro_after_init = { .hdrsize = 0, .name = SMCR_GENL_FAMILY_NAME, .version = SMCR_GENL_FAMILY_VERSION, diff --git a/net/socket.c b/net/socket.c index e6945e318f02..01f3f8f32d6f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -941,7 +941,8 @@ void dlci_ioctl_set(int (*hook) (unsigned int, void __user *)) EXPORT_SYMBOL(dlci_ioctl_set); static long sock_do_ioctl(struct net *net, struct socket *sock, - unsigned int cmd, unsigned long arg) + unsigned int cmd, unsigned long arg, + unsigned int ifreq_size) { int err; void __user *argp = (void __user *)arg; @@ -967,11 +968,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, } else { struct ifreq ifr; bool need_copyout; - if (copy_from_user(&ifr, argp, sizeof(struct ifreq))) + if (copy_from_user(&ifr, argp, ifreq_size)) return -EFAULT; err = dev_ioctl(net, cmd, &ifr, &need_copyout); if (!err && need_copyout) - if (copy_to_user(argp, &ifr, sizeof(struct ifreq))) + if (copy_to_user(argp, &ifr, ifreq_size)) return -EFAULT; } return err; @@ -1070,7 +1071,8 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = open_related_ns(&net->ns, get_net_ns); break; default: - err = sock_do_ioctl(net, sock, cmd, arg); + err = sock_do_ioctl(net, sock, cmd, arg, + sizeof(struct ifreq)); break; } return err; @@ -2750,7 +2752,8 @@ static int do_siocgstamp(struct net *net, struct socket *sock, int err; set_fs(KERNEL_DS); - err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv); + err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv, + sizeof(struct compat_ifreq)); set_fs(old_fs); if (!err) err = compat_put_timeval(&ktv, up); @@ -2766,7 +2769,8 @@ static int do_siocgstampns(struct net *net, struct socket *sock, int err; set_fs(KERNEL_DS); - err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts); + err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts, + sizeof(struct compat_ifreq)); set_fs(old_fs); if (!err) err = compat_put_timespec(&kts, up); @@ -3072,7 +3076,8 @@ static int routing_ioctl(struct net *net, struct socket *sock, } set_fs(KERNEL_DS); - ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r); + ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r, + sizeof(struct compat_ifreq)); set_fs(old_fs); out: @@ -3185,7 +3190,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: case SIOCGIFNAME: - return sock_do_ioctl(net, sock, cmd, arg); + return sock_do_ioctl(net, sock, cmd, arg, + sizeof(struct compat_ifreq)); } return -ENOIOCTLCMD; diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 418f03d0be90..91891041e5e1 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -577,7 +577,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, rcu_dereference_rtnl(orig_dev->tipc_ptr); if (likely(b && test_bit(0, &b->up) && (skb->pkt_type <= PACKET_MULTICAST))) { - skb->next = NULL; + skb_mark_not_on_list(skb); tipc_rcv(dev_net(b->pt.dev), skb, b); rcu_read_unlock(); return NET_RX_SUCCESS; diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index a2f76743c73a..6376467e78f8 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -185,6 +185,10 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, return -ENOMEM; buf->sk = msg->dst_sk; + if (__tipc_dump_start(&cb, msg->net)) { + kfree_skb(buf); + return -ENOMEM; + } do { int rem; @@ -216,6 +220,7 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, err = 0; err_out: + tipc_dump_done(&cb); kfree_skb(buf); if (err == -EMSGSIZE) { diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ab7a2a7178f7..3f03ddd0e35b 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -576,6 +576,7 @@ static int tipc_release(struct socket *sock) sk_stop_timer(sk, &sk->sk_timer); tipc_sk_remove(tsk); + sock_orphan(sk); /* Reject any messages that accumulated in backlog queue */ release_sock(sk); tipc_dest_list_purge(&tsk->cong_links); @@ -3229,7 +3230,7 @@ int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, struct netlink_callback *cb, struct tipc_sock *tsk)) { - struct rhashtable_iter *iter = (void *)cb->args[0]; + struct rhashtable_iter *iter = (void *)cb->args[4]; struct tipc_sock *tsk; int err; @@ -3265,8 +3266,14 @@ EXPORT_SYMBOL(tipc_nl_sk_walk); int tipc_dump_start(struct netlink_callback *cb) { - struct rhashtable_iter *iter = (void *)cb->args[0]; - struct net *net = sock_net(cb->skb->sk); + return __tipc_dump_start(cb, sock_net(cb->skb->sk)); +} +EXPORT_SYMBOL(tipc_dump_start); + +int __tipc_dump_start(struct netlink_callback *cb, struct net *net) +{ + /* tipc_nl_name_table_dump() uses cb->args[0...3]. */ + struct rhashtable_iter *iter = (void *)cb->args[4]; struct tipc_net *tn = tipc_net(net); if (!iter) { @@ -3274,17 +3281,16 @@ int tipc_dump_start(struct netlink_callback *cb) if (!iter) return -ENOMEM; - cb->args[0] = (long)iter; + cb->args[4] = (long)iter; } rhashtable_walk_enter(&tn->sk_rht, iter); return 0; } -EXPORT_SYMBOL(tipc_dump_start); int tipc_dump_done(struct netlink_callback *cb) { - struct rhashtable_iter *hti = (void *)cb->args[0]; + struct rhashtable_iter *hti = (void *)cb->args[4]; rhashtable_walk_exit(hti); kfree(hti); diff --git a/net/tipc/socket.h b/net/tipc/socket.h index d43032e26532..5e575f205afe 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -69,5 +69,6 @@ int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, struct netlink_callback *cb, struct tipc_sock *tsk)); int tipc_dump_start(struct netlink_callback *cb); +int __tipc_dump_start(struct netlink_callback *cb, struct net *net); int tipc_dump_done(struct netlink_callback *cb); #endif diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 292742e50bfa..961b07d4d41c 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -686,7 +686,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) goto free_marker_record; } - crypto_info = &ctx->crypto_send; + crypto_info = &ctx->crypto_send.info; switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; @@ -780,7 +780,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) ctx->priv_ctx_tx = offload_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX, - &ctx->crypto_send, + &ctx->crypto_send.info, tcp_sk(sk)->write_seq); if (rc) goto release_netdev; @@ -862,7 +862,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) goto release_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, - &ctx->crypto_recv, + &ctx->crypto_recv.info, tcp_sk(sk)->copied_seq); if (rc) { pr_err_ratelimited("%s: The netdev has refused to offload this socket\n", diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 6102169239d1..450a6dbc5a88 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -320,7 +320,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, goto free_req; iv = buf; - memcpy(iv, tls_ctx->crypto_send_aes_gcm_128.salt, + memcpy(iv, tls_ctx->crypto_send.aes_gcm_128.salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); aad = buf + TLS_CIPHER_AES_GCM_128_SALT_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 180b6640e531..b428069a1b05 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -141,7 +141,6 @@ retry: size = sg->length; } - clear_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags); ctx->in_tcp_sendpages = false; ctx->sk_write_space(sk); @@ -193,15 +192,12 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, return rc; } -int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx, - int flags, long *timeo) +int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, + int flags) { struct scatterlist *sg; u16 offset; - if (!tls_is_partially_sent_record(ctx)) - return ctx->push_pending_record(sk, flags); - sg = ctx->partially_sent_record; offset = ctx->partially_sent_offset; @@ -209,9 +205,23 @@ int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx, return tls_push_sg(sk, ctx, sg, offset, flags); } +int tls_push_pending_closed_record(struct sock *sk, + struct tls_context *tls_ctx, + int flags, long *timeo) +{ + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + + if (tls_is_partially_sent_record(tls_ctx) || + !list_empty(&ctx->tx_list)) + return tls_tx_records(sk, flags); + else + return tls_ctx->push_pending_record(sk, flags); +} + static void tls_write_space(struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *tx_ctx = tls_sw_ctx_tx(ctx); /* If in_tcp_sendpages call lower protocol write space handler * to ensure we wake up any waiting operations there. For example @@ -222,25 +232,26 @@ static void tls_write_space(struct sock *sk) return; } - if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) { - gfp_t sk_allocation = sk->sk_allocation; - int rc; - long timeo = 0; - - sk->sk_allocation = GFP_ATOMIC; - rc = tls_push_pending_closed_record(sk, ctx, - MSG_DONTWAIT | - MSG_NOSIGNAL, - &timeo); - sk->sk_allocation = sk_allocation; - - if (rc < 0) - return; + /* Schedule the transmission if tx list is ready */ + if (is_tx_ready(tx_ctx) && !sk->sk_write_pending) { + /* Schedule the transmission */ + if (!test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask)) + schedule_delayed_work(&tx_ctx->tx_work.work, 0); } ctx->sk_write_space(sk); } +static void tls_ctx_free(struct tls_context *ctx) +{ + if (!ctx) + return; + + memzero_explicit(&ctx->crypto_send, sizeof(ctx->crypto_send)); + memzero_explicit(&ctx->crypto_recv, sizeof(ctx->crypto_recv)); + kfree(ctx); +} + static void tls_sk_proto_close(struct sock *sk, long timeout) { struct tls_context *ctx = tls_get_ctx(sk); @@ -260,19 +271,6 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) if (!tls_complete_pending_work(sk, ctx, 0, &timeo)) tls_handle_open_record(sk, 0); - if (ctx->partially_sent_record) { - struct scatterlist *sg = ctx->partially_sent_record; - - while (1) { - put_page(sg_page(sg)); - sk_mem_uncharge(sk, sg->length); - - if (sg_is_last(sg)) - break; - sg++; - } - } - /* We need these for tls_sw_fallback handling of other packets */ if (ctx->tx_conf == TLS_SW) { kfree(ctx->tx.rec_seq); @@ -294,7 +292,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) #else { #endif - kfree(ctx); + tls_ctx_free(ctx); ctx = NULL; } @@ -305,7 +303,7 @@ skip_tx_cleanup: * for sk->sk_prot->unhash [tls_hw_unhash] */ if (free_ctx) - kfree(ctx); + tls_ctx_free(ctx); } static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, @@ -330,7 +328,7 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, } /* get user crypto info */ - crypto_info = &ctx->crypto_send; + crypto_info = &ctx->crypto_send.info; if (!TLS_CRYPTO_INFO_READY(crypto_info)) { rc = -EBUSY; @@ -417,9 +415,9 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, } if (tx) - crypto_info = &ctx->crypto_send; + crypto_info = &ctx->crypto_send.info; else - crypto_info = &ctx->crypto_recv; + crypto_info = &ctx->crypto_recv.info; /* Currently we don't support set crypto info more than one time */ if (TLS_CRYPTO_INFO_READY(crypto_info)) { @@ -499,7 +497,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, goto out; err_crypto_info: - memset(crypto_info, 0, sizeof(*crypto_info)); + memzero_explicit(crypto_info, sizeof(union tls_crypto_context)); out: return rc; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index be4f2e990f9f..4c18b4dba284 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -122,25 +122,32 @@ static int skb_nsg(struct sk_buff *skb, int offset, int len) static void tls_decrypt_done(struct crypto_async_request *req, int err) { struct aead_request *aead_req = (struct aead_request *)req; - struct decrypt_req_ctx *req_ctx = - (struct decrypt_req_ctx *)(aead_req + 1); - struct scatterlist *sgout = aead_req->dst; - - struct tls_context *tls_ctx = tls_get_ctx(req_ctx->sk); - struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - int pending = atomic_dec_return(&ctx->decrypt_pending); + struct tls_sw_context_rx *ctx; + struct tls_context *tls_ctx; struct scatterlist *sg; + struct sk_buff *skb; unsigned int pages; + int pending; + + skb = (struct sk_buff *)req->data; + tls_ctx = tls_get_ctx(skb->sk); + ctx = tls_sw_ctx_rx(tls_ctx); + pending = atomic_dec_return(&ctx->decrypt_pending); /* Propagate if there was an err */ if (err) { ctx->async_wait.err = err; - tls_err_abort(req_ctx->sk, err); + tls_err_abort(skb->sk, err); } + /* After using skb->sk to propagate sk through crypto async callback + * we need to NULL it again. + */ + skb->sk = NULL; + /* Release the skb, pages and memory allocated for crypto req */ - kfree_skb(req->data); + kfree_skb(skb); /* Skip the first S/G entry as it points to AAD */ for_each_sg(sg_next(sgout), sg, UINT_MAX, pages) { @@ -175,11 +182,13 @@ static int tls_do_decryption(struct sock *sk, (u8 *)iv_recv); if (async) { - struct decrypt_req_ctx *req_ctx; - - req_ctx = (struct decrypt_req_ctx *)(aead_req + 1); - req_ctx->sk = sk; - + /* Using skb->sk to push sk through to crypto async callback + * handler. This allows propagating errors up to the socket + * if needed. It _must_ be cleared in the async handler + * before kfree_skb is called. We _know_ skb->sk is NULL + * because it is a clone from strparser. + */ + skb->sk = sk; aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, tls_decrypt_done, skb); @@ -237,18 +246,19 @@ static void trim_both_sgl(struct sock *sk, int target_size) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; - trim_sg(sk, ctx->sg_plaintext_data, - &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size, + trim_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size, target_size); if (target_size > 0) target_size += tls_ctx->tx.overhead_size; - trim_sg(sk, ctx->sg_encrypted_data, - &ctx->sg_encrypted_num_elem, - &ctx->sg_encrypted_size, + trim_sg(sk, rec->sg_encrypted_data, + &rec->sg_encrypted_num_elem, + &rec->sg_encrypted_size, target_size); } @@ -256,12 +266,16 @@ static int alloc_encrypted_sg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; int rc = 0; rc = sk_alloc_sg(sk, len, - ctx->sg_encrypted_data, 0, - &ctx->sg_encrypted_num_elem, - &ctx->sg_encrypted_size, 0); + rec->sg_encrypted_data, 0, + &rec->sg_encrypted_num_elem, + &rec->sg_encrypted_size, 0); + + if (rc == -ENOSPC) + rec->sg_encrypted_num_elem = ARRAY_SIZE(rec->sg_encrypted_data); return rc; } @@ -270,12 +284,16 @@ static int alloc_plaintext_sg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; int rc = 0; - rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0, - &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, + rc = sk_alloc_sg(sk, len, rec->sg_plaintext_data, 0, + &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size, tls_ctx->pending_open_record_frags); + if (rc == -ENOSPC) + rec->sg_plaintext_num_elem = ARRAY_SIZE(rec->sg_plaintext_data); + return rc; } @@ -292,41 +310,190 @@ static void free_sg(struct sock *sk, struct scatterlist *sg, *sg_size = 0; } -static void tls_free_both_sg(struct sock *sk) +static void tls_free_open_rec(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; + + /* Return if there is no open record */ + if (!rec) + return; + + free_sg(sk, rec->sg_encrypted_data, + &rec->sg_encrypted_num_elem, + &rec->sg_encrypted_size); + + free_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size); + + kfree(rec); +} + +int tls_tx_records(struct sock *sk, int flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec, *tmp; + int tx_flags, rc = 0; + + if (tls_is_partially_sent_record(tls_ctx)) { + rec = list_first_entry(&ctx->tx_list, + struct tls_rec, list); + + if (flags == -1) + tx_flags = rec->tx_flags; + else + tx_flags = flags; - free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem, - &ctx->sg_encrypted_size); + rc = tls_push_partial_record(sk, tls_ctx, tx_flags); + if (rc) + goto tx_err; - free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size); + /* Full record has been transmitted. + * Remove the head of tx_list + */ + list_del(&rec->list); + free_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size); + + kfree(rec); + } + + /* Tx all ready records */ + list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) { + if (READ_ONCE(rec->tx_ready)) { + if (flags == -1) + tx_flags = rec->tx_flags; + else + tx_flags = flags; + + rc = tls_push_sg(sk, tls_ctx, + &rec->sg_encrypted_data[0], + 0, tx_flags); + if (rc) + goto tx_err; + + list_del(&rec->list); + free_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size); + + kfree(rec); + } else { + break; + } + } + +tx_err: + if (rc < 0 && rc != -EAGAIN) + tls_err_abort(sk, EBADMSG); + + return rc; } -static int tls_do_encryption(struct tls_context *tls_ctx, +static void tls_encrypt_done(struct crypto_async_request *req, int err) +{ + struct aead_request *aead_req = (struct aead_request *)req; + struct sock *sk = req->data; + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec; + bool ready = false; + int pending; + + rec = container_of(aead_req, struct tls_rec, aead_req); + + rec->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; + rec->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; + + + /* Free the record if error is previously set on socket */ + if (err || sk->sk_err) { + free_sg(sk, rec->sg_encrypted_data, + &rec->sg_encrypted_num_elem, &rec->sg_encrypted_size); + + kfree(rec); + rec = NULL; + + /* If err is already set on socket, return the same code */ + if (sk->sk_err) { + ctx->async_wait.err = sk->sk_err; + } else { + ctx->async_wait.err = err; + tls_err_abort(sk, err); + } + } + + if (rec) { + struct tls_rec *first_rec; + + /* Mark the record as ready for transmission */ + smp_store_mb(rec->tx_ready, true); + + /* If received record is at head of tx_list, schedule tx */ + first_rec = list_first_entry(&ctx->tx_list, + struct tls_rec, list); + if (rec == first_rec) + ready = true; + } + + pending = atomic_dec_return(&ctx->encrypt_pending); + + if (!pending && READ_ONCE(ctx->async_notify)) + complete(&ctx->async_wait.completion); + + if (!ready) + return; + + /* Schedule the transmission */ + if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) + schedule_delayed_work(&ctx->tx_work.work, 1); +} + +static int tls_do_encryption(struct sock *sk, + struct tls_context *tls_ctx, struct tls_sw_context_tx *ctx, struct aead_request *aead_req, size_t data_len) { + struct tls_rec *rec = ctx->open_rec; int rc; - ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size; - ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size; + rec->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size; + rec->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size; aead_request_set_tfm(aead_req, ctx->aead_send); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); - aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out, + aead_request_set_crypt(aead_req, rec->sg_aead_in, + rec->sg_aead_out, data_len, tls_ctx->tx.iv); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &ctx->async_wait); + tls_encrypt_done, sk); + + /* Add the record in tx_list */ + list_add_tail((struct list_head *)&rec->list, &ctx->tx_list); + atomic_inc(&ctx->encrypt_pending); - rc = crypto_wait_req(crypto_aead_encrypt(aead_req), &ctx->async_wait); + rc = crypto_aead_encrypt(aead_req); + if (!rc || rc != -EINPROGRESS) { + atomic_dec(&ctx->encrypt_pending); + rec->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; + rec->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; + } - ctx->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; - ctx->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; + if (!rc) { + WRITE_ONCE(rec->tx_ready, true); + } else if (rc != -EINPROGRESS) { + list_del(&rec->list); + return rc; + } + /* Unhook the record from context if encryption is not failure */ + ctx->open_rec = NULL; + tls_advance_record_sn(sk, &tls_ctx->tx); return rc; } @@ -335,53 +502,40 @@ static int tls_push_record(struct sock *sk, int flags, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; struct aead_request *req; int rc; - req = aead_request_alloc(ctx->aead_send, sk->sk_allocation); - if (!req) - return -ENOMEM; + if (!rec) + return 0; + + rec->tx_flags = flags; + req = &rec->aead_req; - sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); - sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); + sg_mark_end(rec->sg_plaintext_data + rec->sg_plaintext_num_elem - 1); + sg_mark_end(rec->sg_encrypted_data + rec->sg_encrypted_num_elem - 1); - tls_make_aad(ctx->aad_space, ctx->sg_plaintext_size, + tls_make_aad(rec->aad_space, rec->sg_plaintext_size, tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size, record_type); tls_fill_prepend(tls_ctx, - page_address(sg_page(&ctx->sg_encrypted_data[0])) + - ctx->sg_encrypted_data[0].offset, - ctx->sg_plaintext_size, record_type); + page_address(sg_page(&rec->sg_encrypted_data[0])) + + rec->sg_encrypted_data[0].offset, + rec->sg_plaintext_size, record_type); tls_ctx->pending_open_record_frags = 0; - set_bit(TLS_PENDING_CLOSED_RECORD, &tls_ctx->flags); - rc = tls_do_encryption(tls_ctx, ctx, req, ctx->sg_plaintext_size); - if (rc < 0) { - /* If we are called from write_space and - * we fail, we need to set this SOCK_NOSPACE - * to trigger another write_space in the future. - */ - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - goto out_req; - } - - free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size); - - ctx->sg_encrypted_num_elem = 0; - ctx->sg_encrypted_size = 0; + rc = tls_do_encryption(sk, tls_ctx, ctx, req, rec->sg_plaintext_size); + if (rc == -EINPROGRESS) + return -EINPROGRESS; - /* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */ - rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags); - if (rc < 0 && rc != -EAGAIN) + if (rc < 0) { tls_err_abort(sk, EBADMSG); + return rc; + } - tls_advance_record_sn(sk, &tls_ctx->tx); -out_req: - aead_request_free(req); - return rc; + return tls_tx_records(sk, flags); } static int tls_sw_push_pending_record(struct sock *sk, int flags) @@ -458,11 +612,12 @@ static int memcopy_from_iter(struct sock *sk, struct iov_iter *from, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct scatterlist *sg = ctx->sg_plaintext_data; + struct tls_rec *rec = ctx->open_rec; + struct scatterlist *sg = rec->sg_plaintext_data; int copy, i, rc = 0; for (i = tls_ctx->pending_open_record_frags; - i < ctx->sg_plaintext_num_elem; ++i) { + i < rec->sg_plaintext_num_elem; ++i) { copy = sg[i].length; if (copy_from_iter( page_address(sg_page(&sg[i])) + sg[i].offset, @@ -482,33 +637,85 @@ out: return rc; } -int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +struct tls_rec *get_rec(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - int ret = 0; - int required_size; + struct tls_rec *rec; + int mem_size; + + /* Return if we already have an open record */ + if (ctx->open_rec) + return ctx->open_rec; + + mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send); + + rec = kzalloc(mem_size, sk->sk_allocation); + if (!rec) + return NULL; + + sg_init_table(&rec->sg_plaintext_data[0], + ARRAY_SIZE(rec->sg_plaintext_data)); + sg_init_table(&rec->sg_encrypted_data[0], + ARRAY_SIZE(rec->sg_encrypted_data)); + + sg_init_table(rec->sg_aead_in, 2); + sg_set_buf(&rec->sg_aead_in[0], rec->aad_space, + sizeof(rec->aad_space)); + sg_unmark_end(&rec->sg_aead_in[1]); + sg_chain(rec->sg_aead_in, 2, rec->sg_plaintext_data); + + sg_init_table(rec->sg_aead_out, 2); + sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, + sizeof(rec->aad_space)); + sg_unmark_end(&rec->sg_aead_out[1]); + sg_chain(rec->sg_aead_out, 2, rec->sg_encrypted_data); + + ctx->open_rec = rec; + + return rec; +} + +int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct crypto_tfm *tfm = crypto_aead_tfm(ctx->aead_send); + bool async_capable = tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC; + unsigned char record_type = TLS_RECORD_TYPE_DATA; + bool is_kvec = msg->msg_iter.type & ITER_KVEC; bool eor = !(msg->msg_flags & MSG_MORE); size_t try_to_copy, copied = 0; - unsigned char record_type = TLS_RECORD_TYPE_DATA; - int record_room; + struct tls_rec *rec; + int required_size; + int num_async = 0; bool full_record; + int record_room; + int num_zc = 0; int orig_size; - bool is_kvec = msg->msg_iter.type & ITER_KVEC; + int ret = 0; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -ENOTSUPP; lock_sock(sk); - if (tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo)) - goto send_end; + /* Wait till there is any pending write on socket */ + if (unlikely(sk->sk_write_pending)) { + ret = wait_on_pending_writer(sk, &timeo); + if (unlikely(ret)) + goto send_end; + } if (unlikely(msg->msg_controllen)) { ret = tls_proccess_cmsg(sk, msg, &record_type); - if (ret) - goto send_end; + if (ret) { + if (ret == -EINPROGRESS) + num_async++; + else if (ret != -EAGAIN) + goto send_end; + } } while (msg_data_left(msg)) { @@ -517,20 +724,27 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto send_end; } - orig_size = ctx->sg_plaintext_size; + rec = get_rec(sk); + if (!rec) { + ret = -ENOMEM; + goto send_end; + } + + orig_size = rec->sg_plaintext_size; full_record = false; try_to_copy = msg_data_left(msg); - record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size; + record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size; if (try_to_copy >= record_room) { try_to_copy = record_room; full_record = true; } - required_size = ctx->sg_plaintext_size + try_to_copy + + required_size = rec->sg_plaintext_size + try_to_copy + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; + alloc_encrypted: ret = alloc_encrypted_sg(sk, required_size); if (ret) { @@ -541,33 +755,39 @@ alloc_encrypted: * actually allocated. The difference is due * to max sg elements limit */ - try_to_copy -= required_size - ctx->sg_encrypted_size; + try_to_copy -= required_size - rec->sg_encrypted_size; full_record = true; } - if (!is_kvec && (full_record || eor)) { + + if (!is_kvec && (full_record || eor) && !async_capable) { ret = zerocopy_from_iter(sk, &msg->msg_iter, - try_to_copy, &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size, - ctx->sg_plaintext_data, - ARRAY_SIZE(ctx->sg_plaintext_data), + try_to_copy, &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size, + rec->sg_plaintext_data, + ARRAY_SIZE(rec->sg_plaintext_data), true); if (ret) goto fallback_to_reg_send; + num_zc++; copied += try_to_copy; ret = tls_push_record(sk, msg->msg_flags, record_type); - if (ret) - goto send_end; + if (ret) { + if (ret == -EINPROGRESS) + num_async++; + else if (ret != -EAGAIN) + goto send_end; + } continue; fallback_to_reg_send: - trim_sg(sk, ctx->sg_plaintext_data, - &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size, + trim_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size, orig_size); } - required_size = ctx->sg_plaintext_size + try_to_copy; + required_size = rec->sg_plaintext_size + try_to_copy; alloc_plaintext: ret = alloc_plaintext_sg(sk, required_size); if (ret) { @@ -578,13 +798,13 @@ alloc_plaintext: * actually allocated. The difference is due * to max sg elements limit */ - try_to_copy -= required_size - ctx->sg_plaintext_size; + try_to_copy -= required_size - rec->sg_plaintext_size; full_record = true; - trim_sg(sk, ctx->sg_encrypted_data, - &ctx->sg_encrypted_num_elem, - &ctx->sg_encrypted_size, - ctx->sg_plaintext_size + + trim_sg(sk, rec->sg_encrypted_data, + &rec->sg_encrypted_num_elem, + &rec->sg_encrypted_size, + rec->sg_plaintext_size + tls_ctx->tx.overhead_size); } @@ -594,13 +814,12 @@ alloc_plaintext: copied += try_to_copy; if (full_record || eor) { -push_record: ret = tls_push_record(sk, msg->msg_flags, record_type); if (ret) { - if (ret == -ENOMEM) - goto wait_for_memory; - - goto send_end; + if (ret == -EINPROGRESS) + num_async++; + else if (ret != -EAGAIN) + goto send_end; } } @@ -616,15 +835,37 @@ trim_sgl: goto send_end; } - if (tls_is_pending_closed_record(tls_ctx)) - goto push_record; - - if (ctx->sg_encrypted_size < required_size) + if (rec->sg_encrypted_size < required_size) goto alloc_encrypted; goto alloc_plaintext; } + if (!num_async) { + goto send_end; + } else if (num_zc) { + /* Wait for pending encryptions to get completed */ + smp_store_mb(ctx->async_notify, true); + + if (atomic_read(&ctx->encrypt_pending)) + crypto_wait_req(-EINPROGRESS, &ctx->async_wait); + else + reinit_completion(&ctx->async_wait.completion); + + WRITE_ONCE(ctx->async_notify, false); + + if (ctx->async_wait.err) { + ret = ctx->async_wait.err; + copied = 0; + } + } + + /* Transmit if any encryptions have completed */ + if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { + cancel_delayed_work(&ctx->tx_work.work); + tls_tx_records(sk, msg->msg_flags); + } + send_end: ret = sk_stream_error(sk, msg->msg_flags, ret); @@ -635,16 +876,18 @@ send_end: int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { + long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - int ret = 0; - long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - bool eor; - size_t orig_size = size; unsigned char record_type = TLS_RECORD_TYPE_DATA; + size_t orig_size = size; struct scatterlist *sg; + struct tls_rec *rec; + int num_async = 0; bool full_record; int record_room; + int ret = 0; + bool eor; if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST)) @@ -657,8 +900,12 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - if (tls_complete_pending_work(sk, tls_ctx, flags, &timeo)) - goto sendpage_end; + /* Wait till there is any pending write on socket */ + if (unlikely(sk->sk_write_pending)) { + ret = wait_on_pending_writer(sk, &timeo); + if (unlikely(ret)) + goto sendpage_end; + } /* Call the sk_stream functions to manage the sndbuf mem. */ while (size > 0) { @@ -669,14 +916,20 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, goto sendpage_end; } + rec = get_rec(sk); + if (!rec) { + ret = -ENOMEM; + goto sendpage_end; + } + full_record = false; - record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size; + record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size; copy = size; if (copy >= record_room) { copy = record_room; full_record = true; } - required_size = ctx->sg_plaintext_size + copy + + required_size = rec->sg_plaintext_size + copy + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) @@ -691,33 +944,32 @@ alloc_payload: * actually allocated. The difference is due * to max sg elements limit */ - copy -= required_size - ctx->sg_plaintext_size; + copy -= required_size - rec->sg_plaintext_size; full_record = true; } get_page(page); - sg = ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem; + sg = rec->sg_plaintext_data + rec->sg_plaintext_num_elem; sg_set_page(sg, page, copy, offset); sg_unmark_end(sg); - ctx->sg_plaintext_num_elem++; + rec->sg_plaintext_num_elem++; sk_mem_charge(sk, copy); offset += copy; size -= copy; - ctx->sg_plaintext_size += copy; - tls_ctx->pending_open_record_frags = ctx->sg_plaintext_num_elem; + rec->sg_plaintext_size += copy; + tls_ctx->pending_open_record_frags = rec->sg_plaintext_num_elem; if (full_record || eor || - ctx->sg_plaintext_num_elem == - ARRAY_SIZE(ctx->sg_plaintext_data)) { -push_record: + rec->sg_plaintext_num_elem == + ARRAY_SIZE(rec->sg_plaintext_data)) { ret = tls_push_record(sk, flags, record_type); if (ret) { - if (ret == -ENOMEM) - goto wait_for_memory; - - goto sendpage_end; + if (ret == -EINPROGRESS) + num_async++; + else if (ret != -EAGAIN) + goto sendpage_end; } } continue; @@ -726,16 +978,20 @@ wait_for_sndbuf: wait_for_memory: ret = sk_stream_wait_memory(sk, &timeo); if (ret) { - trim_both_sgl(sk, ctx->sg_plaintext_size); + trim_both_sgl(sk, rec->sg_plaintext_size); goto sendpage_end; } - if (tls_is_pending_closed_record(tls_ctx)) - goto push_record; - goto alloc_payload; } + if (num_async) { + /* Transmit if any encryptions have completed */ + if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { + cancel_delayed_work(&ctx->tx_work.work); + tls_tx_records(sk, flags); + } + } sendpage_end: if (orig_size > size) ret = orig_size - size; @@ -1092,6 +1348,13 @@ pick_next_record: } else { break; } + } else { + /* MSG_PEEK right now cannot look beyond current skb + * from strparser, meaning we cannot advance skb here + * and thus unpause strparser since we'd loose original + * one. + */ + break; } /* If we have a new message from strparser, continue now. */ @@ -1233,8 +1496,8 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) goto read_failure; } - if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.version) || - header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.version)) { + if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.info.version) || + header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.info.version)) { ret = -EINVAL; goto read_failure; } @@ -1276,9 +1539,61 @@ void tls_sw_free_resources_tx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec, *tmp; + + /* Wait for any pending async encryptions to complete */ + smp_store_mb(ctx->async_notify, true); + if (atomic_read(&ctx->encrypt_pending)) + crypto_wait_req(-EINPROGRESS, &ctx->async_wait); + + cancel_delayed_work_sync(&ctx->tx_work.work); + + /* Tx whatever records we can transmit and abandon the rest */ + tls_tx_records(sk, -1); + + /* Free up un-sent records in tx_list. First, free + * the partially sent record if any at head of tx_list. + */ + if (tls_ctx->partially_sent_record) { + struct scatterlist *sg = tls_ctx->partially_sent_record; + + while (1) { + put_page(sg_page(sg)); + sk_mem_uncharge(sk, sg->length); + + if (sg_is_last(sg)) + break; + sg++; + } + + tls_ctx->partially_sent_record = NULL; + + rec = list_first_entry(&ctx->tx_list, + struct tls_rec, list); + + free_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size); + + list_del(&rec->list); + kfree(rec); + } + + list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) { + free_sg(sk, rec->sg_encrypted_data, + &rec->sg_encrypted_num_elem, + &rec->sg_encrypted_size); + + free_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size); + + list_del(&rec->list); + kfree(rec); + } crypto_free_aead(ctx->aead_send); - tls_free_both_sg(sk); + tls_free_open_rec(sk); kfree(ctx); } @@ -1312,9 +1627,26 @@ void tls_sw_free_resources_rx(struct sock *sk) kfree(ctx); } +/* The work handler to transmitt the encrypted records in tx_list */ +static void tx_work_handler(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct tx_work *tx_work = container_of(delayed_work, + struct tx_work, work); + struct sock *sk = tx_work->sk; + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + + if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) + return; + + lock_sock(sk); + tls_tx_records(sk, -1); + release_sock(sk); +} + int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) { - char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; struct tls_crypto_info *crypto_info; struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; struct tls_sw_context_tx *sw_ctx_tx = NULL; @@ -1359,12 +1691,15 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) if (tx) { crypto_init_wait(&sw_ctx_tx->async_wait); - crypto_info = &ctx->crypto_send; + crypto_info = &ctx->crypto_send.info; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; + INIT_LIST_HEAD(&sw_ctx_tx->tx_list); + INIT_DELAYED_WORK(&sw_ctx_tx->tx_work.work, tx_work_handler); + sw_ctx_tx->tx_work.sk = sk; } else { crypto_init_wait(&sw_ctx_rx->async_wait); - crypto_info = &ctx->crypto_recv; + crypto_info = &ctx->crypto_recv.info; cctx = &ctx->rx; aead = &sw_ctx_rx->aead_recv; } @@ -1412,26 +1747,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) goto free_iv; } - if (sw_ctx_tx) { - sg_init_table(sw_ctx_tx->sg_encrypted_data, - ARRAY_SIZE(sw_ctx_tx->sg_encrypted_data)); - sg_init_table(sw_ctx_tx->sg_plaintext_data, - ARRAY_SIZE(sw_ctx_tx->sg_plaintext_data)); - - sg_init_table(sw_ctx_tx->sg_aead_in, 2); - sg_set_buf(&sw_ctx_tx->sg_aead_in[0], sw_ctx_tx->aad_space, - sizeof(sw_ctx_tx->aad_space)); - sg_unmark_end(&sw_ctx_tx->sg_aead_in[1]); - sg_chain(sw_ctx_tx->sg_aead_in, 2, - sw_ctx_tx->sg_plaintext_data); - sg_init_table(sw_ctx_tx->sg_aead_out, 2); - sg_set_buf(&sw_ctx_tx->sg_aead_out[0], sw_ctx_tx->aad_space, - sizeof(sw_ctx_tx->aad_space)); - sg_unmark_end(&sw_ctx_tx->sg_aead_out[1]); - sg_chain(sw_ctx_tx->sg_aead_out, 2, - sw_ctx_tx->sg_encrypted_data); - } - if (!*aead) { *aead = crypto_alloc_aead("gcm(aes)", 0, 0); if (IS_ERR(*aead)) { @@ -1443,9 +1758,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) ctx->push_pending_record = tls_sw_push_pending_record; - memcpy(keyval, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE); - - rc = crypto_aead_setkey(*aead, keyval, + rc = crypto_aead_setkey(*aead, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE); if (rc) goto free_aead; @@ -1455,8 +1768,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) goto free_aead; if (sw_ctx_rx) { - (*aead)->reqsize = sizeof(struct decrypt_req_ctx); - /* Set up strparser */ memset(&cb, 0, sizeof(cb)); cb.rcv_msg = tls_queue; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index b3b632c5aeae..555427b3e0fe 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -165,6 +165,8 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = NULL; } + xsk_reuseq_destroy(umem); + xdp_umem_unpin_pages(umem); task = get_pid_task(umem->pid, PIDTYPE_PID); diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 2dc1384d9f27..b66504592d9b 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -3,7 +3,9 @@ * Copyright(c) 2018 Intel Corporation. */ +#include <linux/log2.h> #include <linux/slab.h> +#include <linux/overflow.h> #include "xsk_queue.h" @@ -62,3 +64,56 @@ void xskq_destroy(struct xsk_queue *q) page_frag_free(q->ring); kfree(q); } + +struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) +{ + struct xdp_umem_fq_reuse *newq; + + /* Check for overflow */ + if (nentries > (u32)roundup_pow_of_two(nentries)) + return NULL; + nentries = roundup_pow_of_two(nentries); + + newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL); + if (!newq) + return NULL; + memset(newq, 0, offsetof(typeof(*newq), handles)); + + newq->nentries = nentries; + return newq; +} +EXPORT_SYMBOL_GPL(xsk_reuseq_prepare); + +struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, + struct xdp_umem_fq_reuse *newq) +{ + struct xdp_umem_fq_reuse *oldq = umem->fq_reuse; + + if (!oldq) { + umem->fq_reuse = newq; + return NULL; + } + + if (newq->nentries < oldq->length) + return newq; + + memcpy(newq->handles, oldq->handles, + array_size(oldq->length, sizeof(u64))); + newq->length = oldq->length; + + umem->fq_reuse = newq; + return oldq; +} +EXPORT_SYMBOL_GPL(xsk_reuseq_swap); + +void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) +{ + kvfree(rq); +} +EXPORT_SYMBOL_GPL(xsk_reuseq_free); + +void xsk_reuseq_destroy(struct xdp_umem *umem) +{ + xsk_reuseq_free(umem->fq_reuse); + umem->fq_reuse = NULL; +} diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 82252cccb4e0..bcb5cbb40419 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -258,4 +258,7 @@ void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); +/* Executed by the core when the entire UMEM gets freed */ +void xsk_reuseq_destroy(struct xdp_umem *umem); + #endif /* _LINUX_XSK_QUEUE_H */ diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 5611b7521020..260fbba4f03e 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -99,7 +99,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur do { struct sk_buff *nskb = skb2->next; - skb2->next = NULL; + skb_mark_not_on_list(skb2); xo = xfrm_offload(skb2); xo->flags |= XFRM_DEV_RESUME; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 45ba07ab3e4f..2d42cb0c94b8 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -189,7 +189,7 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb struct sk_buff *nskb = segs->next; int err; - segs->next = NULL; + skb_mark_not_on_list(segs); err = xfrm_output2(net, sk, segs); if (unlikely(err)) { |