diff options
Diffstat (limited to 'net')
211 files changed, 4027 insertions, 2451 deletions
diff --git a/net/6lowpan/nhc.c b/net/6lowpan/nhc.c index 4fa2fdda174d..9e56fb98f33c 100644 --- a/net/6lowpan/nhc.c +++ b/net/6lowpan/nhc.c @@ -18,7 +18,7 @@ #include "nhc.h" static struct rb_root rb_root = RB_ROOT; -static struct lowpan_nhc *lowpan_nexthdr_nhcs[NEXTHDR_MAX]; +static struct lowpan_nhc *lowpan_nexthdr_nhcs[NEXTHDR_MAX + 1]; static DEFINE_SPINLOCK(lowpan_nhc_lock); static int lowpan_nhc_insert(struct lowpan_nhc *nhc) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index dc4411165e43..1f99678751df 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -75,6 +75,14 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg, return 0; } +static void vlan_stacked_transfer_operstate(const struct net_device *rootdev, + struct net_device *dev, + struct vlan_dev_priv *vlan) +{ + if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) + netif_stacked_transfer_operstate(rootdev, dev); +} + void unregister_vlan_dev(struct net_device *dev, struct list_head *head) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); @@ -180,7 +188,7 @@ int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) /* Account for reference in struct vlan_dev_priv */ dev_hold(real_dev); - netif_stacked_transfer_operstate(real_dev, dev); + vlan_stacked_transfer_operstate(real_dev, dev, vlan); linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */ /* So, got the sucker initialized, now lets place @@ -399,7 +407,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, case NETDEV_CHANGE: /* Propagate real device state to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) - netif_stacked_transfer_operstate(dev, vlandev); + vlan_stacked_transfer_operstate(dev, vlandev, + vlan_dev_priv(vlandev)); break; case NETDEV_CHANGEADDR: @@ -446,7 +455,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, dev_close_many(&close_list, false); list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { - 
netif_stacked_transfer_operstate(dev, vlandev); + vlan_stacked_transfer_operstate(dev, vlandev, + vlan_dev_priv(vlandev)); list_del_init(&vlandev->close_list); } list_del(&close_list); @@ -463,7 +473,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) dev_change_flags(vlandev, flgs | IFF_UP, extack); - netif_stacked_transfer_operstate(dev, vlandev); + vlan_stacked_transfer_operstate(dev, vlandev, vlan); } break; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 8d77b6ee4477..f044ae56a313 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -223,7 +223,8 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask) u32 old_flags = vlan->flags; if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | - VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) + VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP | + VLAN_FLAG_BRIDGE_BINDING)) return -EINVAL; vlan->flags = (old_flags & ~mask) | (flags & mask); @@ -296,7 +297,8 @@ static int vlan_dev_open(struct net_device *dev) if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_join(dev); - if (netif_carrier_ok(real_dev)) + if (netif_carrier_ok(real_dev) && + !(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_carrier_on(dev); return 0; @@ -326,7 +328,8 @@ static int vlan_dev_stop(struct net_device *dev) if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) dev_uc_del(real_dev, dev->dev_addr); - netif_carrier_off(dev); + if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) + netif_carrier_off(dev); return 0; } @@ -550,7 +553,8 @@ static const struct net_device_ops vlan_netdev_ops; static int vlan_dev_init(struct net_device *dev) { - struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + struct net_device *real_dev = vlan->real_dev; netif_carrier_off(dev); @@ -561,6 +565,9 @@ static int vlan_dev_init(struct net_device *dev) (1<<__LINK_STATE_DORMANT))) | 
(1<<__LINK_STATE_PRESENT); + if (vlan->flags & VLAN_FLAG_BRIDGE_BINDING) + dev->state |= (1 << __LINK_STATE_NOCARRIER); + dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | @@ -591,8 +598,7 @@ static int vlan_dev_init(struct net_device *dev) #endif dev->needed_headroom = real_dev->needed_headroom; - if (vlan_hw_offload_capable(real_dev->features, - vlan_dev_priv(dev)->vlan_proto)) { + if (vlan_hw_offload_capable(real_dev->features, vlan->vlan_proto)) { dev->header_ops = &vlan_passthru_header_ops; dev->hard_header_len = real_dev->hard_header_len; } else { @@ -606,8 +612,8 @@ static int vlan_dev_init(struct net_device *dev) vlan_dev_set_lockdep_class(dev, vlan_dev_get_lock_subclass(dev)); - vlan_dev_priv(dev)->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); - if (!vlan_dev_priv(dev)->vlan_pcpu_stats) + vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); + if (!vlan->vlan_pcpu_stats) return -ENOMEM; return 0; diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 9b60c1e399e2..a624dccf68fd 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -84,7 +84,8 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[], flags = nla_data(data[IFLA_VLAN_FLAGS]); if ((flags->flags & flags->mask) & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | - VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) { + VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP | + VLAN_FLAG_BRIDGE_BINDING)) { NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags"); return -EINVAL; } diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 709d2542f729..e2511027d19b 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1806,12 +1806,6 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = put_user(amount, (int __user *)argp); break; } - case SIOCGSTAMP: - rc = sock_get_timestamp(sk, argp); - break; - case SIOCGSTAMPNS: - rc = 
sock_get_timestampns(sk, argp); - break; /* Routing */ case SIOCADDRT: case SIOCDELRT: @@ -1871,6 +1865,7 @@ static const struct proto_ops atalk_dgram_ops = { .getname = atalk_getname, .poll = datagram_poll, .ioctl = atalk_ioctl, + .gettstamp = sock_gettstamp, #ifdef CONFIG_COMPAT .compat_ioctl = atalk_compat_ioctl, #endif diff --git a/net/atm/clip.c b/net/atm/clip.c index d795b9c5aea4..b9e67e589a7b 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -345,8 +345,8 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } rt = (struct rtable *) dst; - if (rt->rt_gateway) - daddr = &rt->rt_gateway; + if (rt->rt_gw_family == AF_INET) + daddr = &rt->rt_gw4; else daddr = &ip_hdr(skb)->daddr; n = dst_neigh_lookup(dst, daddr); diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index 2ff0e5e470e3..d955b683aa7c 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -81,22 +81,6 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, (int __user *)argp) ? -EFAULT : 0; goto done; } - case SIOCGSTAMP: /* borrowed from IP */ -#ifdef CONFIG_COMPAT - if (compat) - error = compat_sock_get_timestamp(sk, argp); - else -#endif - error = sock_get_timestamp(sk, argp); - goto done; - case SIOCGSTAMPNS: /* borrowed from IP */ -#ifdef CONFIG_COMPAT - if (compat) - error = compat_sock_get_timestampns(sk, argp); - else -#endif - error = sock_get_timestampns(sk, argp); - goto done; case ATM_SETSC: net_warn_ratelimited("ATM_SETSC is obsolete; used by %s:%d\n", current->comm, task_pid_nr(current)); diff --git a/net/atm/lec.c b/net/atm/lec.c index d7f5cf5b7594..ad4f829193f0 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -710,7 +710,10 @@ static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg) static int lec_mcast_attach(struct atm_vcc *vcc, int arg) { - if (arg < 0 || arg >= MAX_LEC_ITF || !dev_lec[arg]) + if (arg < 0 || arg >= MAX_LEC_ITF) + return -EINVAL; + arg = array_index_nospec(arg, MAX_LEC_ITF); + if (!dev_lec[arg]) return -EINVAL; 
vcc->proto_data = dev_lec[arg]; return lec_mcast_make(netdev_priv(dev_lec[arg]), vcc); @@ -728,6 +731,7 @@ static int lecd_attach(struct atm_vcc *vcc, int arg) i = arg; if (arg >= MAX_LEC_ITF) return -EINVAL; + i = array_index_nospec(arg, MAX_LEC_ITF); if (!dev_lec[i]) { int size; diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 2cb10af16afc..02bd2a436bdf 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -118,6 +118,7 @@ static const struct proto_ops pvc_proto_ops = { #ifdef CONFIG_COMPAT .compat_ioctl = vcc_compat_ioctl, #endif + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = pvc_shutdown, .setsockopt = pvc_setsockopt, diff --git a/net/atm/svc.c b/net/atm/svc.c index 2f91b766ac42..908cbb8654f5 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -641,6 +641,7 @@ static const struct proto_ops svc_proto_ops = { #ifdef CONFIG_COMPAT .compat_ioctl = svc_compat_ioctl, #endif + .gettstamp = sock_gettstamp, .listen = svc_listen, .shutdown = svc_shutdown, .setsockopt = svc_setsockopt, diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 5d01edf8d819..012c0b6fc4f6 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1714,14 +1714,6 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; } - case SIOCGSTAMP: - res = sock_get_timestamp(sk, argp); - break; - - case SIOCGSTAMPNS: - res = sock_get_timestampns(sk, argp); - break; - case SIOCAX25ADDUID: /* Add a uid to the uid/call map table */ case SIOCAX25DELUID: /* Delete a uid from the uid/call map table */ case SIOCAX25GETUID: { @@ -1888,8 +1880,8 @@ static int ax25_info_show(struct seq_file *seq, void *v) * magic dev src_addr dest_addr,digi1,digi2,.. st vs vr va t1 t1 t2 t2 t3 t3 idle idle n2 n2 rtt window paclen Snd-Q Rcv-Q inode */ - seq_printf(seq, "%8.8lx %s %s%s ", - (long) ax25, + seq_printf(seq, "%p %s %s%s ", + ax25, ax25->ax25_dev == NULL? "???" : ax25->ax25_dev->dev->name, ax2asc(buf, &ax25->source_addr), ax25->iamdigi? 
"*":""); @@ -1950,6 +1942,7 @@ static const struct proto_ops ax25_proto_ops = { .getname = ax25_getname, .poll = datagram_poll, .ioctl = ax25_ioctl, + .gettstamp = sock_gettstamp, .listen = ax25_listen, .shutdown = ax25_shutdown, .setsockopt = ax25_setsockopt, diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 8d12198eaa94..94ddf19998c7 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -521,14 +521,6 @@ int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) err = put_user(amount, (int __user *) arg); break; - case SIOCGSTAMP: - err = sock_get_timestamp(sk, (struct timeval __user *) arg); - break; - - case SIOCGSTAMPNS: - err = sock_get_timestampns(sk, (struct timespec __user *) arg); - break; - default: err = -ENOIOCTLCMD; break; diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index bd4978ce8c45..3cf0764d5793 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1276,6 +1276,14 @@ int hci_conn_check_link_mode(struct hci_conn *conn) !test_bit(HCI_CONN_ENCRYPT, &conn->flags)) return 0; + /* The minimum encryption key size needs to be enforced by the + * host stack before establishing any L2CAP connections. The + * specification in theory allows a minimum of 1, but to align + * BR/EDR and LE transports, a minimum of 7 is chosen. 
+ */ + if (conn->enc_key_size < HCI_MIN_ENC_KEY_SIZE) + return 0; + return 1; } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index d6b2540ba7f8..3d9175f130b3 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1460,8 +1460,6 @@ static int hci_dev_do_open(struct hci_dev *hdev) hdev->set_bdaddr) ret = hdev->set_bdaddr(hdev, &hdev->public_addr); - else - ret = -EADDRNOTAVAIL; } setup_failed: diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 609fd6871c5a..66b631ab0d35 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5433,7 +5433,7 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) ev->data, ev->length); } - ptr += sizeof(*ev) + ev->length + 1; + ptr += sizeof(*ev) + ev->length; } hci_dev_unlock(hdev); diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 9f85a1943be9..2151913892ce 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -75,6 +75,7 @@ static int do_hidp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user sockfd_put(csock); return err; } + ca.name[sizeof(ca.name)-1] = 0; err = hidp_connection_add(&ca, csock, isock); if (!err && copy_to_user(argp, &ca, sizeof(ca))) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index f17e393b43b4..b53acd6c9a3d 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -510,12 +510,12 @@ void l2cap_chan_set_defaults(struct l2cap_chan *chan) } EXPORT_SYMBOL_GPL(l2cap_chan_set_defaults); -static void l2cap_le_flowctl_init(struct l2cap_chan *chan) +static void l2cap_le_flowctl_init(struct l2cap_chan *chan, u16 tx_credits) { chan->sdu = NULL; chan->sdu_last_frag = NULL; chan->sdu_len = 0; - chan->tx_credits = 0; + chan->tx_credits = tx_credits; /* Derive MPS from connection MTU to stop HCI fragmentation */ chan->mps = min_t(u16, chan->imtu, chan->conn->mtu - L2CAP_HDR_SIZE); /* Give enough credits for a full packet */ @@ 
-1281,7 +1281,7 @@ static void l2cap_le_connect(struct l2cap_chan *chan) if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags)) return; - l2cap_le_flowctl_init(chan); + l2cap_le_flowctl_init(chan, 0); req.psm = chan->psm; req.scid = cpu_to_le16(chan->scid); @@ -5532,11 +5532,10 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, chan->dcid = scid; chan->omtu = mtu; chan->remote_mps = mps; - chan->tx_credits = __le16_to_cpu(req->credits); __l2cap_chan_add(conn, chan); - l2cap_le_flowctl_init(chan); + l2cap_le_flowctl_init(chan, __le16_to_cpu(req->credits)); dcid = chan->scid; credits = chan->rx_credits; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index a3a2cd55e23a..a7be8b59b3c2 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -791,10 +791,13 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, conn = chan->conn; - /*change security for LE channels */ + /* change security for LE channels */ if (chan->scid == L2CAP_CID_ATT) { - if (smp_conn_security(conn->hcon, sec.level)) + if (smp_conn_security(conn->hcon, sec.level)) { + err = -EINVAL; break; + } + set_bit(FLAG_PENDING_SECURITY, &chan->flags); sk->sk_state = BT_CONFIG; chan->state = BT_CONFIG; @@ -1655,6 +1658,7 @@ static const struct proto_ops l2cap_sock_ops = { .recvmsg = l2cap_sock_recvmsg, .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, + .gettstamp = sock_gettstamp, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, .shutdown = l2cap_sock_shutdown, diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 2457f408d17d..150114e33b20 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2301,8 +2301,7 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, MGMT_STATUS_INVALID_PARAMS); } - expected_len = sizeof(*cp) + key_count * - sizeof(struct mgmt_link_key_info); + expected_len = struct_size(cp, keys, key_count); if (expected_len != len) { bt_dev_err(hdev, "load_link_keys: 
expected %u bytes, got %u bytes", expected_len, len); @@ -5030,7 +5029,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, MGMT_STATUS_INVALID_PARAMS); } - expected_len = sizeof(*cp) + irk_count * sizeof(struct mgmt_irk_info); + expected_len = struct_size(cp, irks, irk_count); if (expected_len != len) { bt_dev_err(hdev, "load_irks: expected %u bytes, got %u bytes", expected_len, len); @@ -5112,8 +5111,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, MGMT_STATUS_INVALID_PARAMS); } - expected_len = sizeof(*cp) + key_count * - sizeof(struct mgmt_ltk_info); + expected_len = struct_size(cp, keys, key_count); if (expected_len != len) { bt_dev_err(hdev, "load_keys: expected %u bytes, got %u bytes", expected_len, len); @@ -5847,8 +5845,7 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, MGMT_STATUS_INVALID_PARAMS); } - expected_len = sizeof(*cp) + param_count * - sizeof(struct mgmt_conn_param); + expected_len = struct_size(cp, params, param_count); if (expected_len != len) { bt_dev_err(hdev, "load_conn_param: expected %u bytes, got %u bytes", expected_len, len); diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index b1f49fcc0478..90bb53aa4bee 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -1039,6 +1039,7 @@ static const struct proto_ops rfcomm_sock_ops = { .setsockopt = rfcomm_sock_setsockopt, .getsockopt = rfcomm_sock_getsockopt, .ioctl = rfcomm_sock_ioctl, + .gettstamp = sock_gettstamp, .poll = bt_sock_poll, .socketpair = sock_no_socketpair, .mmap = sock_no_mmap diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 9a580999ca57..b91d6b440fdf 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -523,12 +523,12 @@ static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, struct sock *sk = sock->sk; int err = 0; - BT_DBG("sk %p %pMR", sk, &sa->sco_bdaddr); - if (!addr || addr_len < sizeof(struct sockaddr_sco) || 
addr->sa_family != AF_BLUETOOTH) return -EINVAL; + BT_DBG("sk %p %pMR", sk, &sa->sco_bdaddr); + lock_sock(sk); if (sk->sk_state != BT_OPEN) { @@ -1190,6 +1190,7 @@ static const struct proto_ops sco_sock_ops = { .recvmsg = sco_sock_recvmsg, .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, + .gettstamp = sock_gettstamp, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, .shutdown = sco_sock_shutdown, diff --git a/net/bpf/Makefile b/net/bpf/Makefile index 27b2992a0692..b0ca361742e4 100644 --- a/net/bpf/Makefile +++ b/net/bpf/Makefile @@ -1 +1 @@ -obj-y := test_run.o +obj-$(CONFIG_BPF_SYSCALL) := test_run.o diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index fab142b796ef..2221573dacdb 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -123,12 +123,126 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, return data; } +static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) +{ + void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); + void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); + u32 size = kattr->test.ctx_size_in; + void *data; + int err; + + if (!data_in && !data_out) + return NULL; + + data = kzalloc(max_size, GFP_USER); + if (!data) + return ERR_PTR(-ENOMEM); + + if (data_in) { + err = bpf_check_uarg_tail_zero(data_in, max_size, size); + if (err) { + kfree(data); + return ERR_PTR(err); + } + + size = min_t(u32, max_size, size); + if (copy_from_user(data, data_in, size)) { + kfree(data); + return ERR_PTR(-EFAULT); + } + } + return data; +} + +static int bpf_ctx_finish(const union bpf_attr *kattr, + union bpf_attr __user *uattr, const void *data, + u32 size) +{ + void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); + int err = -EFAULT; + u32 copy_size = size; + + if (!data || !data_out) + return 0; + + if (copy_size > kattr->test.ctx_size_out) { + copy_size = kattr->test.ctx_size_out; + err = -ENOSPC; + } + + if (copy_to_user(data_out, data, copy_size)) + goto out; + if 
(copy_to_user(&uattr->test.ctx_size_out, &size, sizeof(size))) + goto out; + if (err != -ENOSPC) + err = 0; +out: + return err; +} + +/** + * range_is_zero - test whether buffer is initialized + * @buf: buffer to check + * @from: check from this position + * @to: check up until (excluding) this position + * + * This function returns true if there is no non-zero byte + * in the buf in the range [from,to). + */ +static inline bool range_is_zero(void *buf, size_t from, size_t to) +{ + return !memchr_inv((u8 *)buf + from, 0, to - from); +} + +static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) +{ + struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + + if (!__skb) + return 0; + + /* make sure the fields we don't use are zeroed */ + if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, priority))) + return -EINVAL; + + /* priority is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) + + FIELD_SIZEOF(struct __sk_buff, priority), + offsetof(struct __sk_buff, cb))) + return -EINVAL; + + /* cb is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) + + FIELD_SIZEOF(struct __sk_buff, cb), + sizeof(struct __sk_buff))) + return -EINVAL; + + skb->priority = __skb->priority; + memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); + + return 0; +} + +static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) +{ + struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + + if (!__skb) + return; + + __skb->priority = skb->priority; + memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); +} + int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool is_l2 = false, is_direct_pkt_access = false; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; + struct __sk_buff *ctx = NULL; u32 retval, duration; int hh_len = ETH_HLEN; struct sk_buff *skb; @@ -141,6 +255,12 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, 
const union bpf_attr *kattr, if (IS_ERR(data)) return PTR_ERR(data); + ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); + if (IS_ERR(ctx)) { + kfree(data); + return PTR_ERR(ctx); + } + switch (prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: @@ -158,6 +278,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, sk = kzalloc(sizeof(struct sock), GFP_USER); if (!sk) { kfree(data); + kfree(ctx); return -ENOMEM; } sock_net_set(sk, current->nsproxy->net_ns); @@ -166,6 +287,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, skb = build_skb(data, 0); if (!skb) { kfree(data); + kfree(ctx); kfree(sk); return -ENOMEM; } @@ -180,32 +302,37 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); + ret = convert___skb_to_skb(skb, ctx); + if (ret) + goto out; ret = bpf_test_run(prog, skb, repeat, &retval, &duration); - if (ret) { - kfree_skb(skb); - kfree(sk); - return ret; - } + if (ret) + goto out; if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { - kfree_skb(skb); - kfree(sk); - return -ENOMEM; + ret = -ENOMEM; + goto out; } } memset(__skb_push(skb, hh_len), 0, hh_len); } + convert_skb_to___skb(skb, ctx); size = skb->len; /* bpf program can never convert linear skb to non-linear */ if (WARN_ON_ONCE(skb_is_nonlinear(skb))) size = skb_headlen(skb); ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, ctx, + sizeof(struct __sk_buff)); +out: kfree_skb(skb); kfree(sk); + kfree(ctx); return ret; } @@ -220,6 +347,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, void *data; int ret; + if (kattr->test.ctx_in || kattr->test.ctx_out) + return -EINVAL; + data = bpf_test_init(kattr, size, 
XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0); if (IS_ERR(data)) return PTR_ERR(data); @@ -263,6 +393,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) return -EINVAL; + if (kattr->test.ctx_in || kattr->test.ctx_out) + return -EINVAL; + data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) diff --git a/net/bridge/br.c b/net/bridge/br.c index a5174e5001d8..3c8e4b38f054 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -40,10 +40,13 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v bool changed_addr; int err; - /* register of bridge completed, add sysfs entries */ - if ((dev->priv_flags & IFF_EBRIDGE) && event == NETDEV_REGISTER) { - br_sysfs_addbr(dev); - return NOTIFY_DONE; + if (dev->priv_flags & IFF_EBRIDGE) { + if (event == NETDEV_REGISTER) { + /* register of bridge completed, add sysfs entries */ + br_sysfs_addbr(dev); + return NOTIFY_DONE; + } + br_vlan_bridge_event(dev, event, ptr); } /* not a port of a bridge */ @@ -126,6 +129,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v break; } + if (event != NETDEV_UNREGISTER) + br_vlan_port_event(p, event); + /* Events that may cause spanning tree to refresh */ if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP || event == NETDEV_CHANGE || event == NETDEV_DOWN)) diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index 724b474ade54..15116752365a 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -131,7 +131,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, u8 *arpptr, *sha; __be32 sip, tip; - BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 0; if ((dev->flags & IFF_NOARP) || !pskb_may_pull(skb, arp_hdr_len(dev))) @@ -161,7 +161,7 @@ void br_do_proxy_suppress_arp(struct sk_buff 
*skb, struct net_bridge *br, return; if (ipv4_is_zeronet(sip) || sip == tip) { /* prevent flooding to neigh suppress ports */ - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } } @@ -181,7 +181,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, /* its our local ip, so don't proxy reply * and don't forward to neigh suppress ports */ - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } @@ -217,7 +217,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, */ if (replied || br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; } neigh_release(n); @@ -393,7 +393,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, struct ipv6hdr *iphdr; struct neighbour *n; - BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 0; if (p && (p->flags & BR_NEIGH_SUPPRESS)) return; @@ -401,7 +401,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT && !msg->icmph.icmp6_solicited) { /* prevent flooding to neigh suppress ports */ - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } @@ -414,7 +414,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, if (ipv6_addr_any(saddr) || !ipv6_addr_cmp(saddr, daddr)) { /* prevent flooding to neigh suppress ports */ - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } @@ -432,7 +432,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, /* its our own ip, so don't proxy reply * and don't forward to arp suppress ports */ - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } @@ -465,7 +465,7 @@ 
void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, */ if (replied || br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; } neigh_release(n); } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 5ea7e56119c1..014af7efef25 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -16,6 +16,9 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/netfilter_bridge.h> +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE +#include <net/netfilter/nf_queue.h> +#endif #include <linux/neighbour.h> #include <net/arp.h> #include <linux/export.h> @@ -23,10 +26,6 @@ #include "br_private.h" #include "br_private_tunnel.h" -/* Hook for brouter */ -br_should_route_hook_t __rcu *br_should_route_hook __read_mostly; -EXPORT_SYMBOL(br_should_route_hook); - static int br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -197,13 +196,63 @@ static void __br_handle_local_finish(struct sk_buff *skb) /* note: already called with rcu_read_lock */ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_bridge_port *p = br_port_get_rcu(skb->dev); - __br_handle_local_finish(skb); - BR_INPUT_SKB_CB(skb)->brdev = p->br->dev; - br_pass_frame_up(skb); - return 0; + /* return 1 to signal the okfn() was called so it's ok to use the skb */ + return 1; +} + +static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) +{ +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + struct nf_hook_entries *e = NULL; + struct nf_hook_state state; + unsigned int verdict, i; + struct net *net; + int ret; + + net = dev_net(skb->dev); +#ifdef HAVE_JUMP_LABEL + if (!static_key_false(&nf_hooks_needed[NFPROTO_BRIDGE][NF_BR_PRE_ROUTING])) + goto frame_finish; +#endif + + e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]); + if (!e) + goto frame_finish; + + nf_hook_state_init(&state, NF_BR_PRE_ROUTING, + 
NFPROTO_BRIDGE, skb->dev, NULL, NULL, + net, br_handle_frame_finish); + + for (i = 0; i < e->num_hook_entries; i++) { + verdict = nf_hook_entry_hookfn(&e->hooks[i], skb, &state); + switch (verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + if (BR_INPUT_SKB_CB(skb)->br_netfilter_broute) { + *pskb = skb; + return RX_HANDLER_PASS; + } + break; + case NF_DROP: + kfree_skb(skb); + return RX_HANDLER_CONSUMED; + case NF_QUEUE: + ret = nf_queue(skb, &state, e, i, verdict); + if (ret == 1) + continue; + return RX_HANDLER_CONSUMED; + default: /* STOLEN */ + return RX_HANDLER_CONSUMED; + } + } +frame_finish: + net = dev_net(skb->dev); + br_handle_frame_finish(net, NULL, skb); +#else + br_handle_frame_finish(dev_net(skb->dev), NULL, skb); +#endif + return RX_HANDLER_CONSUMED; } /* @@ -215,7 +264,6 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) struct net_bridge_port *p; struct sk_buff *skb = *pskb; const unsigned char *dest = eth_hdr(skb)->h_dest; - br_should_route_hook_t *rhook; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; @@ -227,6 +275,8 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) if (!skb) return RX_HANDLER_CONSUMED; + memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); + p = br_port_get_rcu(skb->dev); if (p->flags & BR_VLAN_TUNNEL) { if (br_handle_ingress_vlan_tunnel(skb, p, @@ -280,32 +330,28 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) goto forward; } - /* Deliver packet to local host only */ - NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev), - NULL, skb, skb->dev, NULL, br_handle_local_finish); - return RX_HANDLER_CONSUMED; + /* The else clause should be hit when nf_hook(): + * - returns < 0 (drop/error) + * - returns = 0 (stolen/nf_queue) + * Thus return 1 from the okfn() to signal the skb is ok to pass + */ + if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, + br_handle_local_finish) == 1) { + return RX_HANDLER_PASS; + } else { + return 
RX_HANDLER_CONSUMED; + } } forward: switch (p->state) { case BR_STATE_FORWARDING: - rhook = rcu_dereference(br_should_route_hook); - if (rhook) { - if ((*rhook)(skb)) { - *pskb = skb; - return RX_HANDLER_PASS; - } - dest = eth_hdr(skb)->h_dest; - } - /* fall through */ case BR_STATE_LEARNING: if (ether_addr_equal(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; - NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, - dev_net(skb->dev), NULL, skb, skb->dev, NULL, - br_handle_frame_finish); - break; + return nf_hook_bridge_pre(skb, pskb); default: drop: kfree_skb(skb); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 812560d7f7a2..c2a30f79a9d0 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2013,7 +2013,8 @@ static void br_multicast_start_querier(struct net_bridge *br, __br_multicast_open(br, query); - list_for_each_entry(port, &br->port_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(port, &br->port_list, list) { if (port->state == BR_STATE_DISABLED || port->state == BR_STATE_BLOCKING) continue; @@ -2025,6 +2026,7 @@ static void br_multicast_start_querier(struct net_bridge *br, br_multicast_enable(&port->ip6_own_query); #endif } + rcu_read_unlock(); } int br_multicast_toggle(struct net_bridge *br, unsigned long val) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 4f9f59eba8b4..8dfcc2d285d8 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1441,7 +1441,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) || nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT, - br_opt_get(br, IFLA_BR_VLAN_STATS_PER_PORT))) + br_opt_get(br, BROPT_VLAN_STATS_PER_PORT))) return -EMSGSIZE; #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 7946aa3b6e09..334a8c496b50 100644 --- a/net/bridge/br_private.h +++ 
b/net/bridge/br_private.h @@ -321,6 +321,7 @@ enum net_bridge_opts { BROPT_MTU_SET_BY_USER, BROPT_VLAN_STATS_PER_PORT, BROPT_NO_LL_LEARN, + BROPT_VLAN_BRIDGE_BINDING, }; struct net_bridge { @@ -425,15 +426,16 @@ struct br_input_skb_cb { struct net_device *brdev; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - int igmp; - int mrouters_only; + u8 igmp; + u8 mrouters_only:1; #endif - - bool proxyarp_replied; - bool src_port_isolated; - + u8 proxyarp_replied:1; + u8 src_port_isolated:1; #ifdef CONFIG_BRIDGE_VLAN_FILTERING - bool vlan_filtered; + u8 vlan_filtered:1; +#endif +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + u8 br_netfilter_broute:1; #endif #ifdef CONFIG_NET_SWITCHDEV @@ -894,6 +896,9 @@ int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack); int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask); void br_vlan_get_stats(const struct net_bridge_vlan *v, struct br_vlan_stats *stats); +void br_vlan_port_event(struct net_bridge_port *p, unsigned long event); +void br_vlan_bridge_event(struct net_device *dev, unsigned long event, + void *ptr); static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) @@ -1077,6 +1082,16 @@ static inline void br_vlan_get_stats(const struct net_bridge_vlan *v, struct br_vlan_stats *stats) { } + +static inline void br_vlan_port_event(struct net_bridge_port *p, + unsigned long event) +{ +} + +static inline void br_vlan_bridge_event(struct net_device *dev, + unsigned long event, void *ptr) +{ +} #endif struct nf_br_ops { diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 0a02822b5667..2db63997f313 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -7,6 +7,8 @@ #include "br_private.h" #include "br_private_tunnel.h" +static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid); + static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { @@ -293,6 +295,9 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, 
__vlan_add_list(v); __vlan_add_flags(v, flags); + + if (p) + nbp_vlan_set_vlan_dev_state(p, v->vid); out: return err; @@ -357,6 +362,7 @@ static int __vlan_del(struct net_bridge_vlan *v) rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); __vlan_del_list(v); + nbp_vlan_set_vlan_dev_state(p, v->vid); call_rcu(&v->rcu, nbp_vlan_rcu_free); } @@ -1264,3 +1270,211 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid, return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_info); + +static int br_vlan_is_bind_vlan_dev(const struct net_device *dev) +{ + return is_vlan_dev(dev) && + !!(vlan_dev_priv(dev)->flags & VLAN_FLAG_BRIDGE_BINDING); +} + +static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev, + __always_unused void *data) +{ + return br_vlan_is_bind_vlan_dev(dev); +} + +static bool br_vlan_has_upper_bind_vlan_dev(struct net_device *dev) +{ + int found; + + rcu_read_lock(); + found = netdev_walk_all_upper_dev_rcu(dev, br_vlan_is_bind_vlan_dev_fn, + NULL); + rcu_read_unlock(); + + return !!found; +} + +struct br_vlan_bind_walk_data { + u16 vid; + struct net_device *result; +}; + +static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev, + void *data_in) +{ + struct br_vlan_bind_walk_data *data = data_in; + int found = 0; + + if (br_vlan_is_bind_vlan_dev(dev) && + vlan_dev_priv(dev)->vlan_id == data->vid) { + data->result = dev; + found = 1; + } + + return found; +} + +static struct net_device * +br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid) +{ + struct br_vlan_bind_walk_data data = { + .vid = vid, + }; + + rcu_read_lock(); + netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn, + &data); + rcu_read_unlock(); + + return data.result; +} + +static bool br_vlan_is_dev_up(const struct net_device *dev) +{ + return !!(dev->flags & IFF_UP) && netif_oper_up(dev); +} + +static void br_vlan_set_vlan_dev_state(const struct net_bridge *br, + struct net_device *vlan_dev) +{ + u16 vid = vlan_dev_priv(vlan_dev)->vlan_id; + 
struct net_bridge_vlan_group *vg; + struct net_bridge_port *p; + bool has_carrier = false; + + if (!netif_carrier_ok(br->dev)) { + netif_carrier_off(vlan_dev); + return; + } + + list_for_each_entry(p, &br->port_list, list) { + vg = nbp_vlan_group(p); + if (br_vlan_find(vg, vid) && br_vlan_is_dev_up(p->dev)) { + has_carrier = true; + break; + } + } + + if (has_carrier) + netif_carrier_on(vlan_dev); + else + netif_carrier_off(vlan_dev); +} + +static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p) +{ + struct net_bridge_vlan_group *vg = nbp_vlan_group(p); + struct net_bridge_vlan *vlan; + struct net_device *vlan_dev; + + list_for_each_entry(vlan, &vg->vlan_list, vlist) { + vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, + vlan->vid); + if (vlan_dev) { + if (br_vlan_is_dev_up(p->dev)) { + if (netif_carrier_ok(p->br->dev)) + netif_carrier_on(vlan_dev); + } else { + br_vlan_set_vlan_dev_state(p->br, vlan_dev); + } + } + } +} + +static void br_vlan_upper_change(struct net_device *dev, + struct net_device *upper_dev, + bool linking) +{ + struct net_bridge *br = netdev_priv(dev); + + if (!br_vlan_is_bind_vlan_dev(upper_dev)) + return; + + if (linking) { + br_vlan_set_vlan_dev_state(br, upper_dev); + br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true); + } else { + br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, + br_vlan_has_upper_bind_vlan_dev(dev)); + } +} + +struct br_vlan_link_state_walk_data { + struct net_bridge *br; +}; + +static int br_vlan_link_state_change_fn(struct net_device *vlan_dev, + void *data_in) +{ + struct br_vlan_link_state_walk_data *data = data_in; + + if (br_vlan_is_bind_vlan_dev(vlan_dev)) + br_vlan_set_vlan_dev_state(data->br, vlan_dev); + + return 0; +} + +static void br_vlan_link_state_change(struct net_device *dev, + struct net_bridge *br) +{ + struct br_vlan_link_state_walk_data data = { + .br = br + }; + + rcu_read_lock(); + netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn, + &data); + rcu_read_unlock(); +} + 
+/* Must be protected by RTNL. */ +static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid) +{ + struct net_device *vlan_dev; + + if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) + return; + + vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vid); + if (vlan_dev) + br_vlan_set_vlan_dev_state(p->br, vlan_dev); +} + +/* Must be protected by RTNL. */ +void br_vlan_bridge_event(struct net_device *dev, unsigned long event, + void *ptr) +{ + struct netdev_notifier_changeupper_info *info; + struct net_bridge *br; + + switch (event) { + case NETDEV_CHANGEUPPER: + info = ptr; + br_vlan_upper_change(dev, info->upper_dev, info->linking); + break; + + case NETDEV_CHANGE: + case NETDEV_UP: + br = netdev_priv(dev); + if (!br_opt_get(br, BROPT_VLAN_BRIDGE_BINDING)) + return; + br_vlan_link_state_change(dev, br); + break; + } +} + +/* Must be protected by RTNL. */ +void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) +{ + if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) + return; + + switch (event) { + case NETDEV_CHANGE: + case NETDEV_DOWN: + case NETDEV_UP: + br_vlan_set_all_vlan_dev_state(p); + break; + } +} diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 276b60262981..ec2652a459da 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -15,6 +15,8 @@ #include <linux/module.h> #include <linux/if_bridge.h> +#include "../br_private.h" + /* EBT_ACCEPT means the frame will be bridged * EBT_DROP means the frame will be routed */ @@ -48,30 +50,63 @@ static const struct ebt_table broute_table = { .me = THIS_MODULE, }; -static int ebt_broute(struct sk_buff *skb) +static unsigned int ebt_broute(void *priv, struct sk_buff *skb, + const struct nf_hook_state *s) { + struct net_bridge_port *p = br_port_get_rcu(skb->dev); struct nf_hook_state state; + unsigned char *dest; int ret; + if (!p || p->state != BR_STATE_FORWARDING) + return NF_ACCEPT; + 
nf_hook_state_init(&state, NF_BR_BROUTING, - NFPROTO_BRIDGE, skb->dev, NULL, NULL, - dev_net(skb->dev), NULL); + NFPROTO_BRIDGE, s->in, NULL, NULL, + s->net, NULL); ret = ebt_do_table(skb, &state, state.net->xt.broute_table); - if (ret == NF_DROP) - return 1; /* route it */ - return 0; /* bridge it */ + + if (ret != NF_DROP) + return ret; + + /* DROP in ebtables -t broute means that the + * skb should be routed, not bridged. + * This is awkward, but can't be changed for compatibility + * reasons. + * + * We map DROP to ACCEPT and set the ->br_netfilter_broute flag. + */ + BR_INPUT_SKB_CB(skb)->br_netfilter_broute = 1; + + /* undo PACKET_HOST mangling done in br_input in case the dst + * address matches the logical bridge but not the port. + */ + dest = eth_hdr(skb)->h_dest; + if (skb->pkt_type == PACKET_HOST && + !ether_addr_equal(skb->dev->dev_addr, dest) && + ether_addr_equal(p->br->dev->dev_addr, dest)) + skb->pkt_type = PACKET_OTHERHOST; + + return NF_ACCEPT; } +static const struct nf_hook_ops ebt_ops_broute = { + .hook = ebt_broute, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_PRE_ROUTING, + .priority = NF_BR_PRI_FIRST, +}; + static int __net_init broute_net_init(struct net *net) { - return ebt_register_table(net, &broute_table, NULL, + return ebt_register_table(net, &broute_table, &ebt_ops_broute, &net->xt.broute_table); } static void __net_exit broute_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.broute_table, NULL); + ebt_unregister_table(net, net->xt.broute_table, &ebt_ops_broute); } static struct pernet_operations broute_net_ops = { @@ -81,21 +116,11 @@ static struct pernet_operations broute_net_ops = { static int __init ebtable_broute_init(void) { - int ret; - - ret = register_pernet_subsys(&broute_net_ops); - if (ret < 0) - return ret; - /* see br_input.c */ - RCU_INIT_POINTER(br_should_route_hook, - (br_should_route_hook_t *)ebt_broute); - return 0; + return register_pernet_subsys(&broute_net_ops); } static void __exit 
ebtable_broute_fini(void) { - RCU_INIT_POINTER(br_should_route_hook, NULL); - synchronize_net(); unregister_pernet_subsys(&broute_net_ops); } diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index eb15891f8b9f..4e0091311d40 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1221,10 +1221,6 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, mutex_unlock(&ebt_mutex); WRITE_ONCE(*res, table); - - if (!ops) - return 0; - ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); if (ret) { __ebt_unregister_table(net, table); @@ -1248,8 +1244,7 @@ out: void ebt_unregister_table(struct net *net, struct ebt_table *table, const struct nf_hook_ops *ops) { - if (ops) - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); __ebt_unregister_table(net, table); } @@ -2032,7 +2027,8 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, if (match_kern) match_kern->match_size = ret; - if (WARN_ON(type == EBT_COMPAT_TARGET && size_left)) + /* rule should have no remaining data after target */ + if (type == EBT_COMPAT_TARGET && size_left) return -EINVAL; match32 = (struct compat_ebt_entry_mwt *) buf; diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 711d7156efd8..6c6e01963aac 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -186,15 +186,19 @@ static int transmit(struct cflayer *layer, struct cfpkt *pkt) goto noxoff; if (likely(!netif_queue_stopped(caifd->netdev))) { + struct Qdisc *sch; + /* If we run with a TX queue, check if the queue is too long*/ txq = netdev_get_tx_queue(skb->dev, 0); - qlen = qdisc_qlen(rcu_dereference_bh(txq->qdisc)); - - if (likely(qlen == 0)) + sch = rcu_dereference_bh(txq->qdisc); + if (likely(qdisc_is_empty(sch))) goto noxoff; + /* can check for explicit qdisc len value only !NOLOCK, + * always set flow off otherwise + */ high = 
(caifd->netdev->tx_queue_len * q_high) / 100; - if (likely(qlen < high)) + if (!(sch->flags & TCQ_F_NOLOCK) && likely(sch->q.qlen < high)) goto noxoff; } diff --git a/net/can/af_can.c b/net/can/af_can.c index 1684ba5b51eb..e8fd5dc1780a 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -89,13 +89,7 @@ static atomic_t skbcounter = ATOMIC_INIT(0); int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - struct sock *sk = sock->sk; - switch (cmd) { - - case SIOCGSTAMP: - return sock_get_timestamp(sk, (struct timeval __user *)arg); - default: return -ENOIOCTLCMD; } diff --git a/net/can/bcm.c b/net/can/bcm.c index 79bb8afa9c0c..a34ee52f19ea 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1689,6 +1689,7 @@ static const struct proto_ops bcm_ops = { .getname = sock_no_getname, .poll = datagram_poll, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, diff --git a/net/can/raw.c b/net/can/raw.c index c70207537488..afcbff063a67 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -846,6 +846,7 @@ static const struct proto_ops raw_ops = { .getname = raw_getname, .poll = datagram_poll, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = raw_setsockopt, diff --git a/net/compat.c b/net/compat.c index eeea5eb71639..a031bd333092 100644 --- a/net/compat.c +++ b/net/compat.c @@ -395,63 +395,6 @@ COMPAT_SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, return __compat_sys_setsockopt(fd, level, optname, optval, optlen); } -int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) -{ - struct compat_timeval __user *ctv; - int err; - struct timeval tv; - - if (COMPAT_USE_64BIT_TIME) - return sock_get_timestamp(sk, userstamp); - - ctv = (struct compat_timeval __user *) userstamp; - err = -ENOENT; - 
sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sock_read_timestamp(sk)); - - if (tv.tv_sec == -1) - return err; - if (tv.tv_sec == 0) { - ktime_t kt = ktime_get_real(); - sock_write_timestamp(sk, kt); - tv = ktime_to_timeval(kt); - } - err = 0; - if (put_user(tv.tv_sec, &ctv->tv_sec) || - put_user(tv.tv_usec, &ctv->tv_usec)) - err = -EFAULT; - return err; -} -EXPORT_SYMBOL(compat_sock_get_timestamp); - -int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) -{ - struct compat_timespec __user *ctv; - int err; - struct timespec ts; - - if (COMPAT_USE_64BIT_TIME) - return sock_get_timestampns (sk, userstamp); - - ctv = (struct compat_timespec __user *) userstamp; - err = -ENOENT; - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sock_read_timestamp(sk)); - if (ts.tv_sec == -1) - return err; - if (ts.tv_sec == 0) { - ktime_t kt = ktime_get_real(); - sock_write_timestamp(sk, kt); - ts = ktime_to_timespec(kt); - } - err = 0; - if (put_user(ts.tv_sec, &ctv->tv_sec) || - put_user(ts.tv_nsec, &ctv->tv_nsec)) - err = -EFAULT; - return err; -} -EXPORT_SYMBOL(compat_sock_get_timestampns); - static int __compat_sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen) diff --git a/net/core/datagram.c b/net/core/datagram.c index 91bb5a083fee..45a162ef5e02 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -167,7 +167,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), - int *peeked, int *off, int *err, + int *off, int *err, struct sk_buff **last) { bool peek_at_off = false; @@ -194,7 +194,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, return NULL; } } - *peeked = 1; refcount_inc(&skb->users); } else { __skb_unlink(skb, queue); @@ -212,7 +211,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, * @sk: socket * @flags: MSG\_ flags * @destructor: invoked 
under the receive lock on successful dequeue - * @peeked: returns non-zero if this packet has been seen before * @off: an offset in bytes to peek skb from. Returns an offset * within an skb where data actually starts * @err: error code returned @@ -246,7 +244,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), - int *peeked, int *off, int *err, + int *off, int *err, struct sk_buff **last) { struct sk_buff_head *queue = &sk->sk_receive_queue; @@ -260,7 +258,6 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, if (error) goto no_packet; - *peeked = 0; do { /* Again only user level code calls this function, so nothing * interrupt level will suddenly eat the receive_queue. @@ -270,7 +267,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, */ spin_lock_irqsave(&queue->lock, cpu_flags); skb = __skb_try_recv_from_queue(sk, queue, flags, destructor, - peeked, off, &error, last); + off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) goto no_packet; @@ -294,7 +291,7 @@ EXPORT_SYMBOL(__skb_try_recv_datagram); struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), - int *peeked, int *off, int *err) + int *off, int *err) { struct sk_buff *skb, *last; long timeo; @@ -302,8 +299,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { - skb = __skb_try_recv_datagram(sk, flags, destructor, peeked, - off, err, &last); + skb = __skb_try_recv_datagram(sk, flags, destructor, off, err, + &last); if (skb) return skb; @@ -319,10 +316,10 @@ EXPORT_SYMBOL(__skb_recv_datagram); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int noblock, int *err) { - int peeked, off = 0; + int off = 0; 
return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), - NULL, &peeked, &off, err); + NULL, &off, err); } EXPORT_SYMBOL(skb_recv_datagram); diff --git a/net/core/dev.c b/net/core/dev.c index b430f851f377..22f2640f559a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1184,7 +1184,21 @@ int dev_change_name(struct net_device *dev, const char *newname) BUG_ON(!dev_net(dev)); net = dev_net(dev); - if (dev->flags & IFF_UP) + + /* Some auto-enslaved devices e.g. failover slaves are + * special, as userspace might rename the device after + * the interface had been brought up and running since + * the point kernel initiated auto-enslavement. Allow + * live name change even when these slave devices are + * up and running. + * + * Typically, users of these auto-enslaving devices + * don't actually care about slave name change, as + * they are supposed to operate on master interface + * directly. + */ + if (dev->flags & IFF_UP && + likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK))) return -EBUSY; write_seqcount_begin(&devnet_rename_seq); diff --git a/net/core/devlink.c b/net/core/devlink.c index b2715a187a11..7b91605e75d6 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1047,14 +1047,15 @@ out: static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index, u16 pool_index, u32 size, - enum devlink_sb_threshold_type threshold_type) + enum devlink_sb_threshold_type threshold_type, + struct netlink_ext_ack *extack) { const struct devlink_ops *ops = devlink->ops; if (ops->sb_pool_set) return ops->sb_pool_set(devlink, sb_index, pool_index, - size, threshold_type); + size, threshold_type, extack); return -EOPNOTSUPP; } @@ -1082,7 +1083,8 @@ static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb, size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]); return devlink_sb_pool_set(devlink, devlink_sb->index, - pool_index, size, threshold_type); + pool_index, size, threshold_type, + info->extack); } static int 
devlink_nl_sb_port_pool_fill(struct sk_buff *msg, @@ -1243,14 +1245,15 @@ out: static int devlink_sb_port_pool_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 pool_index, - u32 threshold) + u32 threshold, + struct netlink_ext_ack *extack) { const struct devlink_ops *ops = devlink_port->devlink->ops; if (ops->sb_port_pool_set) return ops->sb_port_pool_set(devlink_port, sb_index, - pool_index, threshold); + pool_index, threshold, extack); return -EOPNOTSUPP; } @@ -1273,7 +1276,7 @@ static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb, threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); return devlink_sb_port_pool_set(devlink_port, devlink_sb->index, - pool_index, threshold); + pool_index, threshold, info->extack); } static int @@ -1472,7 +1475,8 @@ out: static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, - u16 pool_index, u32 threshold) + u16 pool_index, u32 threshold, + struct netlink_ext_ack *extack) { const struct devlink_ops *ops = devlink_port->devlink->ops; @@ -1480,7 +1484,7 @@ static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, if (ops->sb_tc_pool_bind_set) return ops->sb_tc_pool_bind_set(devlink_port, sb_index, tc_index, pool_type, - pool_index, threshold); + pool_index, threshold, extack); return -EOPNOTSUPP; } @@ -1515,7 +1519,7 @@ static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb, threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index, tc_index, pool_type, - pool_index, threshold); + pool_index, threshold, info->extack); } static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb, diff --git a/net/core/failover.c b/net/core/failover.c index 4a92a98ccce9..b5cd3c727285 100644 --- a/net/core/failover.c +++ b/net/core/failover.c @@ -80,14 +80,14 @@ static int failover_slave_register(struct 
net_device *slave_dev) goto err_upper_link; } - slave_dev->priv_flags |= IFF_FAILOVER_SLAVE; + slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); - slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; + slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); err_upper_link: netdev_rx_handler_unregister(slave_dev); done: @@ -121,7 +121,7 @@ int failover_slave_unregister(struct net_device *slave_dev) netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); - slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; + slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) diff --git a/net/core/filter.c b/net/core/filter.c index 8904e3407163..9d28e7e8a4cb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2970,11 +2970,14 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ - BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2( \ + BPF_ADJ_ROOM_ENCAP_L2_MASK)) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { + u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; u16 mac_len = 0, inner_net = 0, inner_trans = 0; unsigned int gso_type = SKB_GSO_DODGY; @@ -3009,6 +3012,8 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, mac_len = skb->network_header - skb->mac_header; inner_net = skb->network_header; + if (inner_mac_len > len_diff) + return -EINVAL; inner_trans = skb->transport_header; } @@ -3017,8 +3022,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, return ret; 
if (encap) { - /* inner mac == inner_net on l3 encap */ - skb->inner_mac_header = inner_net; + skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; skb_set_inner_protocol(skb, skb->protocol); @@ -3032,7 +3036,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, gso_type |= SKB_GSO_GRE; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) gso_type |= SKB_GSO_IPXIP6; - else + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) gso_type |= SKB_GSO_IPXIP4; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || @@ -3065,6 +3069,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, { int ret; + if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO) + return -EINVAL; + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { /* udp gso_size delineates datagrams, only allow if fixed */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || @@ -4430,8 +4437,7 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; - if (val) - tcp_sk(sk)->bpf_sock_ops_cb_flags = val; + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); } @@ -4458,6 +4464,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, * Only binding to IP is supported. 
*/ err = -EINVAL; + if (addr_len < offsetofend(struct sockaddr, sa_family)) + return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; @@ -4639,15 +4647,26 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, return BPF_FIB_LKUP_RET_UNSUPP_LWT; dev = nhc->nhc_dev; - if (nhc->nhc_has_gw) - params->ipv4_dst = nhc->nhc_gw.ipv4; params->rt_metric = res.fi->fib_priority; /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here */ - neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); + if (likely(nhc->nhc_gw_family != AF_INET6)) { + if (nhc->nhc_gw_family) + params->ipv4_dst = nhc->nhc_gw.ipv4; + + neigh = __ipv4_neigh_lookup_noref(dev, + (__force u32)params->ipv4_dst); + } else { + struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst; + + params->family = AF_INET6; + *dst = nhc->nhc_gw.ipv6; + neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); + } + if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; @@ -4661,13 +4680,13 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, { struct in6_addr *src = (struct in6_addr *) params->ipv6_src; struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; + struct fib6_result res = {}; struct neighbour *neigh; struct net_device *dev; struct inet6_dev *idev; - struct fib6_info *f6i; struct flowi6 fl6; int strict = 0; - int oif; + int oif, err; u32 mtu; /* link local addresses are never forwarded */ @@ -4709,61 +4728,57 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; - f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); + err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, + strict); } else { fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); - f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict); + err = 
ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict); } - if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) + if (unlikely(err || IS_ERR_OR_NULL(res.f6i) || + res.f6i == net->ipv6.fib6_null_entry)) return BPF_FIB_LKUP_RET_NOT_FWDED; - if (unlikely(f6i->fib6_flags & RTF_REJECT)) { - switch (f6i->fib6_type) { - case RTN_BLACKHOLE: - return BPF_FIB_LKUP_RET_BLACKHOLE; - case RTN_UNREACHABLE: - return BPF_FIB_LKUP_RET_UNREACHABLE; - case RTN_PROHIBIT: - return BPF_FIB_LKUP_RET_PROHIBIT; - default: - return BPF_FIB_LKUP_RET_NOT_FWDED; - } - } - - if (f6i->fib6_type != RTN_UNICAST) + switch (res.fib6_type) { + /* only unicast is forwarded */ + case RTN_UNICAST: + break; + case RTN_BLACKHOLE: + return BPF_FIB_LKUP_RET_BLACKHOLE; + case RTN_UNREACHABLE: + return BPF_FIB_LKUP_RET_UNREACHABLE; + case RTN_PROHIBIT: + return BPF_FIB_LKUP_RET_PROHIBIT; + default: return BPF_FIB_LKUP_RET_NOT_FWDED; + } - if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) - f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, - fl6.flowi6_oif, NULL, - strict); + ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif, + fl6.flowi6_oif != 0, NULL, strict); if (check_mtu) { - mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); + mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src); if (params->tot_len > mtu) return BPF_FIB_LKUP_RET_FRAG_NEEDED; } - if (f6i->fib6_nh.fib_nh_lws) + if (res.nh->fib_nh_lws) return BPF_FIB_LKUP_RET_UNSUPP_LWT; - if (f6i->fib6_nh.fib_nh_has_gw) - *dst = f6i->fib6_nh.fib_nh_gw6; + if (res.nh->fib_nh_gw_family) + *dst = res.nh->fib_nh_gw6; - dev = f6i->fib6_nh.fib_nh_dev; - params->rt_metric = f6i->fib6_metric; + dev = res.nh->fib_nh_dev; + params->rt_metric = res.f6i->fib6_metric; /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is - * not needed here. Can not use __ipv6_neigh_lookup_noref here - * because we need to get nd_tbl via the stub + * not needed here. 
*/ - neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, - ndisc_hashfn, dst, dev); + neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index ac679f74ba47..9bf1b9ad1780 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -291,6 +291,7 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, for_each_possible_cpu(i) { const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); + qstats->qlen = 0; qstats->backlog += qcpu->backlog; qstats->drops += qcpu->drops; qstats->requeues += qcpu->requeues; @@ -306,6 +307,7 @@ void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, if (cpu) { __gnet_stats_copy_queue_cpu(qstats, cpu); } else { + qstats->qlen = q->qlen; qstats->backlog = q->backlog; qstats->drops = q->drops; qstats->requeues = q->requeues; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 19b557bd294b..94749e0e2cfd 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -26,7 +26,7 @@ #include <net/lwtunnel.h> #include <net/rtnetlink.h> #include <net/ip6_fib.h> -#include <net/nexthop.h> +#include <net/rtnh.h> #ifdef CONFIG_MODULES @@ -223,7 +223,8 @@ void lwtstate_free(struct lwtunnel_state *lws) } EXPORT_SYMBOL_GPL(lwtstate_free); -int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, + int encap_attr, int encap_type_attr) { const struct lwtunnel_encap_ops *ops; struct nlattr *nest; @@ -236,7 +237,7 @@ int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; - nest = nla_nest_start(skb, RTA_ENCAP); + nest = nla_nest_start(skb, encap_attr); if (!nest) return -EMSGSIZE; @@ -250,7 +251,7 @@ int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) if (ret) goto nla_put_failure; nla_nest_end(skb, nest); - ret = 
nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type); + ret = nla_put_u16(skb, encap_type_attr, lwtstate->type); if (ret) goto nla_put_failure; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 30f6fd8f68e0..997cfa8f99ba 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1920,6 +1920,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } + if (tbl->allow_add && !tbl->allow_add(dev, extack)) { + err = -EINVAL; + goto out; + } + neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { bool exempt_from_gc; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c14f0dc0157c..e4fd68389d6f 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1747,20 +1747,16 @@ int netdev_register_kobject(struct net_device *ndev) error = device_add(dev); if (error) - goto error_put_device; + return error; error = register_queue_kobjects(ndev); - if (error) - goto error_device_del; + if (error) { + device_del(dev); + return error; + } pm_runtime_set_memalloc_noio(dev, true); - return 0; - -error_device_del: - device_del(dev); -error_put_device: - put_device(dev); return error; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 7e6dcc625701..ebb5b6d21a13 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -839,7 +839,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; } else if (tb[NETNSA_NSID]) { - peer = get_net_ns_by_id(net, nla_get_u32(tb[NETNSA_NSID])); + peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID])); if (!peer) peer = ERR_PTR(-ENOENT); nla = tb[NETNSA_NSID]; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index b9057478d69c..7e3d0d99dfae 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -301,6 +301,4 @@ static int __init init_cgroup_netprio(void) register_netdevice_notifier(&netprio_device_notifier); return 0; } - 
subsys_initcall(init_cgroup_netprio); -MODULE_LICENSE("GPL v2"); diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index 703cf76aa7c2..7109c168b5e0 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -185,9 +185,10 @@ void __init ptp_classifier_init(void) { 0x16, 0, 0, 0x00000000 }, { 0x06, 0, 0, 0x00000000 }, }; - struct sock_fprog_kern ptp_prog = { - .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter, - }; + struct sock_fprog_kern ptp_prog; + + ptp_prog.len = ARRAY_SIZE(ptp_filter); + ptp_prog.filter = ptp_filter; BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog)); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f9b964fd4e4d..5fa5bf3e9945 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4951,7 +4951,7 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, { struct if_stats_msg *ifsm; - if (nlh->nlmsg_len < sizeof(*ifsm)) { + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) { NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9901f5322852..e89be6282693 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -258,6 +258,33 @@ nodata: } EXPORT_SYMBOL(__alloc_skb); +/* Caller must provide SKB that is memset cleared */ +static struct sk_buff *__build_skb_around(struct sk_buff *skb, + void *data, unsigned int frag_size) +{ + struct skb_shared_info *shinfo; + unsigned int size = frag_size ? 
: ksize(data); + + size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + /* Assumes caller memset cleared SKB */ + skb->truesize = SKB_TRUESIZE(size); + refcount_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; + skb->mac_header = (typeof(skb->mac_header))~0U; + skb->transport_header = (typeof(skb->transport_header))~0U; + + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); + atomic_set(&shinfo->dataref, 1); + + return skb; +} + /** * __build_skb - build a network buffer * @data: data buffer provided by caller @@ -279,32 +306,15 @@ EXPORT_SYMBOL(__alloc_skb); */ struct sk_buff *__build_skb(void *data, unsigned int frag_size) { - struct skb_shared_info *shinfo; struct sk_buff *skb; - unsigned int size = frag_size ? : ksize(data); skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); - if (!skb) + if (unlikely(!skb)) return NULL; - size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->truesize = SKB_TRUESIZE(size); - refcount_set(&skb->users, 1); - skb->head = data; - skb->data = data; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; - skb->mac_header = (typeof(skb->mac_header))~0U; - skb->transport_header = (typeof(skb->transport_header))~0U; - - /* make sure we initialize shinfo sequentially */ - shinfo = skb_shinfo(skb); - memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); - atomic_set(&shinfo->dataref, 1); - return skb; + return __build_skb_around(skb, data, frag_size); } /* build_skb() is wrapper over __build_skb(), that specifically @@ -325,6 +335,29 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(build_skb); +/** + * build_skb_around - build a network buffer around provided skb + * @skb: sk_buff provide by caller, must be memset cleared + * @data: data buffer provided by 
caller + * @frag_size: size of data, or 0 if head was kmalloced + */ +struct sk_buff *build_skb_around(struct sk_buff *skb, + void *data, unsigned int frag_size) +{ + if (unlikely(!skb)) + return NULL; + + skb = __build_skb_around(skb, data, frag_size); + + if (skb && frag_size) { + skb->head_frag = 1; + if (page_is_pfmemalloc(virt_to_head_page(data))) + skb->pfmemalloc = 1; + } + return skb; +} +EXPORT_SYMBOL(build_skb_around); + #define NAPI_SKB_CACHE_SIZE 64 struct napi_alloc_cache { @@ -5082,7 +5115,8 @@ EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { - int mac_len; + int mac_len, meta_len; + void *meta; if (skb_cow(skb, skb_headroom(skb)) < 0) { kfree_skb(skb); @@ -5094,6 +5128,13 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), mac_len - VLAN_HLEN - ETH_TLEN); } + + meta_len = skb_metadata_len(skb); + if (meta_len) { + meta = skb_metadata_end(skb) - meta_len; + memmove(meta + VLAN_HLEN, meta, meta_len); + } + skb->mac_header += VLAN_HLEN; return skb; } diff --git a/net/core/sock.c b/net/core/sock.c index 782343bb925b..925b84a872dd 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -348,7 +348,7 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval) tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; } - if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { + if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; *(struct old_timeval32 *)optval = tv32; return sizeof(tv32); @@ -372,7 +372,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool { struct __kernel_sock_timeval tv; - if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { + if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32; if (optlen < sizeof(tv32)) @@ -2977,39 +2977,44 @@ bool 
lock_sock_fast(struct sock *sk) } EXPORT_SYMBOL(lock_sock_fast); -int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) +int sock_gettstamp(struct socket *sock, void __user *userstamp, + bool timeval, bool time32) { - struct timeval tv; + struct sock *sk = sock->sk; + struct timespec64 ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sock_read_timestamp(sk)); - if (tv.tv_sec == -1) + ts = ktime_to_timespec64(sock_read_timestamp(sk)); + if (ts.tv_sec == -1) return -ENOENT; - if (tv.tv_sec == 0) { + if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); - sock_write_timestamp(sk, kt); - tv = ktime_to_timeval(kt); + sock_write_timestamp(sk, kt);; + ts = ktime_to_timespec64(kt); } - return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; -} -EXPORT_SYMBOL(sock_get_timestamp); -int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) -{ - struct timespec ts; + if (timeval) + ts.tv_nsec /= 1000; - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sock_read_timestamp(sk)); - if (ts.tv_sec == -1) - return -ENOENT; - if (ts.tv_sec == 0) { - ktime_t kt = ktime_get_real(); - sock_write_timestamp(sk, kt); - ts = ktime_to_timespec(sk->sk_stamp); +#ifdef CONFIG_COMPAT_32BIT_TIME + if (time32) + return put_old_timespec32(&ts, userstamp); +#endif +#ifdef CONFIG_SPARC64 + /* beware of padding in sparc64 timeval */ + if (timeval && !in_compat_syscall()) { + struct __kernel_old_timeval __user tv = { + .tv_sec = ts.tv_sec, + .tv_usec = ts.tv_nsec, + }; + if (copy_to_user(userstamp, &tv, sizeof(tv))) + return -EFAULT; + return 0; } - return copy_to_user(userstamp, &ts, sizeof(ts)) ? 
-EFAULT : 0; +#endif + return put_timespec64(&ts, userstamp); } -EXPORT_SYMBOL(sock_get_timestampns); +EXPORT_SYMBOL(sock_gettstamp); void sock_enable_timestamp(struct sock *sk, int flag) { diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 26a21d97b6b0..004535e4c070 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -991,6 +991,7 @@ static const struct proto_ops inet_dccp_ops = { /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ .poll = dccp_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, /* FIXME: work on inet_listen to rename it to sock_common_listen */ .listen = inet_dccp_listen, .shutdown = inet_shutdown, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 57d84e9b7b6f..c4e4d1301062 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1075,6 +1075,7 @@ static const struct proto_ops inet6_dccp_ops = { .getname = inet6_getname, .poll = dccp_poll, .ioctl = inet6_ioctl, + .gettstamp = sock_gettstamp, .listen = inet_dccp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 6cd3737593a6..7e47ffdd1412 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -42,7 +42,7 @@ #include <net/dn_fib.h> #include <net/dn_neigh.h> #include <net/dn_dev.h> -#include <net/nexthop.h> +#include <net/rtnh.h> #define RT_MIN_TABLE 1 diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 76338c38738a..19aa32fc1802 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -94,8 +94,6 @@ int dns_query(const char *type, const char *name, size_t namelen, desclen += typelen + 1; } - if (!namelen) - namelen = strnlen(name, 256); if (namelen < 3 || namelen > 255) return -EINVAL; desclen += namelen + 1; diff --git a/net/hsr/Makefile b/net/hsr/Makefile index d74d89d013b0..e45757fc477f 100644 --- a/net/hsr/Makefile +++ b/net/hsr/Makefile @@ -6,4 +6,4 @@ obj-$(CONFIG_HSR) += hsr.o hsr-y := hsr_main.o hsr_framereg.o hsr_device.o \ 
hsr_netlink.o hsr_slave.o hsr_forward.o -hsr-$(CONFIG_DEBUG_FS) += hsr_prp_debugfs.o +hsr-$(CONFIG_DEBUG_FS) += hsr_debugfs.o diff --git a/net/hsr/hsr_prp_debugfs.c b/net/hsr/hsr_debugfs.c index b30e98734c61..94447974a3c0 100644 --- a/net/hsr/hsr_prp_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -1,9 +1,9 @@ /* - * hsr_prp_debugfs code - * Copyright (C) 2017 Texas Instruments Incorporated + * hsr_debugfs code + * Copyright (C) 2019 Texas Instruments Incorporated * * Author(s): - * Murali Karicheri <m-karicheri2@ti.com? + * Murali Karicheri <m-karicheri2@ti.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -26,9 +26,9 @@ static void print_mac_address(struct seq_file *sfp, unsigned char *mac) mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); } -/* hsr_prp_node_table_show - Formats and prints node_table entries */ +/* hsr_node_table_show - Formats and prints node_table entries */ static int -hsr_prp_node_table_show(struct seq_file *sfp, void *data) +hsr_node_table_show(struct seq_file *sfp, void *data) { struct hsr_priv *priv = (struct hsr_priv *)sfp->private; struct hsr_node *node; @@ -52,40 +52,40 @@ hsr_prp_node_table_show(struct seq_file *sfp, void *data) return 0; } -/* hsr_prp_node_table_open - Open the node_table file +/* hsr_node_table_open - Open the node_table file * * Description: * This routine opens a debugfs file node_table of specific hsr device */ static int -hsr_prp_node_table_open(struct inode *inode, struct file *filp) +hsr_node_table_open(struct inode *inode, struct file *filp) { - return single_open(filp, hsr_prp_node_table_show, inode->i_private); + return single_open(filp, hsr_node_table_show, inode->i_private); } -static const struct file_operations hsr_prp_fops = { +static const struct file_operations hsr_fops = { .owner = THIS_MODULE, - .open = hsr_prp_node_table_open, + .open = hsr_node_table_open, .read = seq_read, .llseek = seq_lseek, .release = 
single_release, }; -/* hsr_prp_debugfs_init - create hsr-prp node_table file for dumping +/* hsr_debugfs_init - create hsr node_table file for dumping * the node table * * Description: * When debugfs is configured this routine sets up the node_table file per - * hsr/prp device for dumping the node_table entries + * hsr device for dumping the node_table entries */ -int hsr_prp_debugfs_init(struct hsr_priv *priv) +int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) { int rc = -1; struct dentry *de = NULL; - de = debugfs_create_dir("hsr", NULL); + de = debugfs_create_dir(hsr_dev->name, NULL); if (!de) { - pr_err("Cannot create hsr-prp debugfs root\n"); + pr_err("Cannot create hsr debugfs root\n"); return rc; } @@ -93,25 +93,24 @@ int hsr_prp_debugfs_init(struct hsr_priv *priv) de = debugfs_create_file("node_table", S_IFREG | 0444, priv->node_tbl_root, priv, - &hsr_prp_fops); + &hsr_fops); if (!de) { - pr_err("Cannot create hsr-prp node_table directory\n"); + pr_err("Cannot create hsr node_table directory\n"); return rc; } priv->node_tbl_file = de; - rc = 0; - return rc; + return 0; } -/* hsr_prp_debugfs_term - Tear down debugfs intrastructure +/* hsr_debugfs_term - Tear down debugfs intrastructure * * Description: * When Debufs is configured this routine removes debugfs file system - * elements that are specific to hsr-prp + * elements that are specific to hsr */ void -hsr_prp_debugfs_term(struct hsr_priv *priv) +hsr_debugfs_term(struct hsr_priv *priv) { debugfs_remove(priv->node_tbl_file); priv->node_tbl_file = NULL; diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index b47a621e3f4e..15c72065df79 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -354,7 +354,7 @@ static void hsr_dev_destroy(struct net_device *hsr_dev) hsr = netdev_priv(hsr_dev); - hsr_prp_debugfs_term(hsr); + hsr_debugfs_term(hsr); rtnl_lock(); hsr_for_each_port(hsr, port) @@ -485,7 +485,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device 
*slave[2], goto fail; mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); - res = hsr_prp_debugfs_init(hsr); + res = hsr_debugfs_init(hsr, hsr_dev); if (res) goto fail; diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 0cac992192d0..ddd9605bad04 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -359,6 +359,13 @@ void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port) goto out_drop; hsr_register_frame_in(frame.node_src, port, frame.sequence_nr); hsr_forward_do(&frame); + /* Gets called for ingress frames as well as egress from master port. + * So check and increment stats for master port only here. + */ + if (port->type == HSR_PT_MASTER) { + port->dev->stats.tx_packets++; + port->dev->stats.tx_bytes += skb->len; + } if (frame.skb_hsr) kfree_skb(frame.skb_hsr); diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 778213f07fe0..96fac696a1e1 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -184,15 +184,16 @@ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb) } #if IS_ENABLED(CONFIG_DEBUG_FS) -int hsr_prp_debugfs_init(struct hsr_priv *priv); -void hsr_prp_debugfs_term(struct hsr_priv *priv); +int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); +void hsr_debugfs_term(struct hsr_priv *priv); #else -static inline int hsr_prp_debugfs_init(struct hsr_priv *priv) +static inline int hsr_debugfs_init(struct hsr_priv *priv, + struct net_device *hsr_dev) { return 0; } -static inline void hsr_prp_debugfs_term(struct hsr_priv *priv) +static inline void hsr_debugfs_term(struct hsr_priv *priv) {} #endif diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index bc6b912603f1..ce2dfb997537 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -164,10 +164,6 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, struct sock *sk = sock->sk; switch (cmd) { - case SIOCGSTAMP: - return sock_get_timestamp(sk, (struct timeval __user 
*)arg); - case SIOCGSTAMPNS: - return sock_get_timestampns(sk, (struct timespec __user *)arg); case SIOCGIFADDR: case SIOCSIFADDR: return ieee802154_dev_ioctl(sk, (struct ifreq __user *)arg, @@ -426,6 +422,7 @@ static const struct proto_ops ieee802154_raw_ops = { .getname = sock_no_getname, .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, @@ -988,6 +985,7 @@ static const struct proto_ops ieee802154_dgram_ops = { .getname = sock_no_getname, .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 08a8430f5647..5183a2daba64 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -915,12 +915,6 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct rtentry rt; switch (cmd) { - case SIOCGSTAMP: - err = sock_get_timestamp(sk, (struct timeval __user *)arg); - break; - case SIOCGSTAMPNS: - err = sock_get_timestampns(sk, (struct timespec __user *)arg); - break; case SIOCADDRT: case SIOCDELRT: if (copy_from_user(&rt, p, sizeof(struct rtentry))) @@ -992,6 +986,7 @@ const struct proto_ops inet_stream_ops = { .getname = inet_getname, .poll = tcp_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, .listen = inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, @@ -1027,6 +1022,7 @@ const struct proto_ops inet_dgram_ops = { .getname = inet_getname, .poll = udp_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, @@ -1059,6 +1055,7 @@ static const struct proto_ops inet_sockraw_ops = { .getname = inet_getname, .poll = datagram_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown 
= inet_shutdown, .setsockopt = sock_common_setsockopt, diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 1e976bb93d99..15427163a041 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -77,5 +77,4 @@ static int __init bpfilter_sockopt_init(void) return 0; } - -module_init(bpfilter_sockopt_init); +device_initcall(bpfilter_sockopt_init); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 15f779bd26b3..d4b63f94f7be 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -558,7 +558,8 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, if (rt->rt_gateway.sa_family == AF_INET && addr) { unsigned int addr_type; - cfg->fc_gw = addr; + cfg->fc_gw4 = addr; + cfg->fc_gw_family = AF_INET; addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && addr_type == RTN_UNICAST) @@ -568,7 +569,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, if (cmd == SIOCDELRT) return 0; - if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw) + if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family) return -EINVAL; if (cfg->fc_scope == RT_SCOPE_NOWHERE) @@ -664,10 +665,55 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_DPORT] = { .type = NLA_U16 }, }; +int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + struct rtvia *via; + int alen; + + if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { + NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA"); + return -EINVAL; + } + + via = nla_data(nla); + alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); + + switch (via->rtvia_family) { + case AF_INET: + if (alen != sizeof(__be32)) { + NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA"); + return -EINVAL; + } + cfg->fc_gw_family = AF_INET; + cfg->fc_gw4 = *((__be32 *)via->rtvia_addr); + break; + case AF_INET6: +#ifdef CONFIG_IPV6 + if (alen != 
sizeof(struct in6_addr)) { + NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA"); + return -EINVAL; + } + cfg->fc_gw_family = AF_INET6; + cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr); +#else + NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); + return -EINVAL; +#endif + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA"); + return -EINVAL; + } + + return 0; +} + static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_config *cfg, struct netlink_ext_ack *extack) { + bool has_gw = false, has_via = false; struct nlattr *attr; int err, remaining; struct rtmsg *rtm; @@ -708,12 +754,17 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, cfg->fc_oif = nla_get_u32(attr); break; case RTA_GATEWAY: - cfg->fc_gw = nla_get_be32(attr); + has_gw = true; + cfg->fc_gw4 = nla_get_be32(attr); + if (cfg->fc_gw4) + cfg->fc_gw_family = AF_INET; break; case RTA_VIA: - NL_SET_ERR_MSG(extack, "IPv4 does not support RTA_VIA attribute"); - err = -EINVAL; - goto errout; + has_via = true; + err = fib_gw_from_via(cfg, attr, extack); + if (err) + goto errout; + break; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; @@ -752,6 +803,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, } } + if (has_gw && has_via) { + NL_SET_ERR_MSG(extack, + "Nexthop configuration can not contain both GATEWAY and VIA"); + goto errout; + } + return 0; errout: return err; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 8e0cb1687a74..4336f1ec8ab0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -41,8 +41,9 @@ #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/ip6_fib.h> #include <net/netlink.h> -#include <net/nexthop.h> +#include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/addrconf.h> @@ -276,7 +277,7 @@ static inline int nh_comp(const struct fib_info 
*fi, const struct fib_info *ofi) for_nexthops(fi) { if (nh->fib_nh_oif != onh->fib_nh_oif || - nh->fib_nh_gw4 != onh->fib_nh_gw4 || + nh->fib_nh_gw_family != onh->fib_nh_gw_family || nh->fib_nh_scope != onh->fib_nh_scope || #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->fib_nh_weight != onh->fib_nh_weight || @@ -287,6 +288,15 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) || ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK)) return -1; + + if (nh->fib_nh_gw_family == AF_INET && + nh->fib_nh_gw4 != onh->fib_nh_gw4) + return -1; + + if (nh->fib_nh_gw_family == AF_INET6 && + ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6)) + return -1; + onh++; } endfor_nexthops(fi); return 0; @@ -447,10 +457,18 @@ static int fib_detect_death(struct fib_info *fi, int order, struct fib_info **last_resort, int *last_idx, int dflt) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); struct neighbour *n; int state = NUD_NONE; - n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].fib_nh_gw4, fi->fib_dev); + if (likely(nhc->nhc_gw_family == AF_INET)) + n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev); + else if (nhc->nhc_gw_family == AF_INET6) + n = neigh_lookup(ipv6_stub->nd_tbl, &nhc->nhc_gw.ipv6, + nhc->nhc_dev); + else + n = NULL; + if (n) { state = n->nud_state; neigh_release(n); @@ -511,10 +529,12 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, goto init_failure; nh->fib_nh_oif = cfg->fc_oif; - if (cfg->fc_gw) { - nh->fib_nh_gw4 = cfg->fc_gw; - nh->fib_nh_has_gw = 1; - } + nh->fib_nh_gw_family = cfg->fc_gw_family; + if (cfg->fc_gw_family == AF_INET) + nh->fib_nh_gw4 = cfg->fc_gw4; + else if (cfg->fc_gw_family == AF_INET6) + nh->fib_nh_gw6 = cfg->fc_gw6; + nh->fib_nh_flags = cfg->fc_flags; #ifdef CONFIG_IP_ROUTE_CLASSID @@ -586,11 +606,24 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { - struct nlattr *nla, 
*attrs = rtnh_attrs(rtnh); + struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - if (nla) - fib_cfg.fc_gw = nla_get_in_addr(nla); + nlav = nla_find(attrs, attrlen, RTA_VIA); + if (nla && nlav) { + NL_SET_ERR_MSG(extack, + "Nexthop configuration can not contain both GATEWAY and VIA"); + return -EINVAL; + } + if (nla) { + fib_cfg.fc_gw4 = nla_get_in_addr(nla); + if (fib_cfg.fc_gw4) + fib_cfg.fc_gw_family = AF_INET; + } else if (nlav) { + ret = fib_gw_from_via(&fib_cfg, nlav, extack); + if (ret) + goto errout; + } nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla) @@ -616,10 +649,16 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, "Nexthop device index does not match RTA_OIF"); goto errout; } - if (cfg->fc_gw && fi->fib_nh->fib_nh_gw4 != cfg->fc_gw) { - NL_SET_ERR_MSG(extack, - "Nexthop gateway does not match RTA_GATEWAY"); - goto errout; + if (cfg->fc_gw_family) { + if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family || + (cfg->fc_gw_family == AF_INET && + fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4) || + (cfg->fc_gw_family == AF_INET6 && + ipv6_addr_cmp(&fi->fib_nh->fib_nh_gw6, &cfg->fc_gw6))) { + NL_SET_ERR_MSG(extack, + "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA"); + goto errout; + } } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) { @@ -719,7 +758,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) return 1; - if (cfg->fc_oif || cfg->fc_gw) { + if (cfg->fc_oif || cfg->fc_gw_family) { if (cfg->fc_encap) { if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap, fi->fib_nh, cfg, extack)) @@ -730,10 +769,20 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, cfg->fc_flow != fi->fib_nh->nh_tclassid) return 1; #endif - if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->fib_nh_oif) && - (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->fib_nh_gw4)) - return 0; - return 1; 
+ if ((cfg->fc_oif && cfg->fc_oif != fi->fib_nh->fib_nh_oif) || + (cfg->fc_gw_family && + cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family)) + return 1; + + if (cfg->fc_gw_family == AF_INET && + cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4) + return 1; + + if (cfg->fc_gw_family == AF_INET6 && + ipv6_addr_cmp(&cfg->fc_gw6, &fi->fib_nh->fib_nh_gw6)) + return 1; + + return 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -754,11 +803,43 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { - struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - if (nla && nla_get_in_addr(nla) != nh->fib_nh_gw4) - return 1; + nlav = nla_find(attrs, attrlen, RTA_VIA); + if (nla && nlav) { + NL_SET_ERR_MSG(extack, + "Nexthop configuration can not contain both GATEWAY and VIA"); + return -EINVAL; + } + + if (nla) { + if (nh->fib_nh_gw_family != AF_INET || + nla_get_in_addr(nla) != nh->fib_nh_gw4) + return 1; + } else if (nlav) { + struct fib_config cfg2; + int err; + + err = fib_gw_from_via(&cfg2, nlav, extack); + if (err) + return err; + + switch (nh->fib_nh_gw_family) { + case AF_INET: + if (cfg2.fc_gw_family != AF_INET || + cfg2.fc_gw4 != nh->fib_nh_gw4) + return 1; + break; + case AF_INET6: + if (cfg2.fc_gw_family != AF_INET6 || + ipv6_addr_cmp(&cfg2.fc_gw6, + &nh->fib_nh_gw6)) + return 1; + break; + } + } + #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla && nla_get_u32(nla) != nh->nh_tclassid) @@ -812,6 +893,30 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) return true; } +static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh, + u32 table, struct netlink_ext_ack *extack) +{ + struct fib6_config cfg = { + .fc_table = table, + .fc_flags = nh->fib_nh_flags | RTF_GATEWAY, + .fc_ifindex = nh->fib_nh_oif, + .fc_gateway = nh->fib_nh_gw6, + }; + struct fib6_nh fib6_nh = {}; + int err; 
+ + err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack); + if (!err) { + nh->fib_nh_dev = fib6_nh.fib_nh_dev; + dev_hold(nh->fib_nh_dev); + nh->fib_nh_oif = nh->fib_nh_dev->ifindex; + nh->fib_nh_scope = RT_SCOPE_LINK; + + ipv6_stub->fib6_nh_release(&fib6_nh); + } + + return err; +} /* * Picture @@ -856,134 +961,152 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) * | * |-> {local prefix} (terminal node) */ -static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, - struct netlink_ext_ack *extack) +static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table, + u8 scope, struct netlink_ext_ack *extack) { - int err = 0; - struct net *net; struct net_device *dev; + struct fib_result res; + int err; - net = cfg->fc_nlinfo.nl_net; - if (nh->fib_nh_gw4) { - struct fib_result res; - - if (nh->fib_nh_flags & RTNH_F_ONLINK) { - unsigned int addr_type; + if (nh->fib_nh_flags & RTNH_F_ONLINK) { + unsigned int addr_type; - if (cfg->fc_scope >= RT_SCOPE_LINK) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid scope"); - return -EINVAL; - } - dev = __dev_get_by_index(net, nh->fib_nh_oif); - if (!dev) { - NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); - return -ENODEV; - } - if (!(dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, - "Nexthop device is not up"); - return -ENETDOWN; - } - addr_type = inet_addr_type_dev_table(net, dev, - nh->fib_nh_gw4); - if (addr_type != RTN_UNICAST) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway"); - return -EINVAL; - } - if (!netif_carrier_ok(dev)) - nh->fib_nh_flags |= RTNH_F_LINKDOWN; - nh->fib_nh_dev = dev; - dev_hold(dev); - nh->fib_nh_scope = RT_SCOPE_LINK; - return 0; + if (scope >= RT_SCOPE_LINK) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid scope"); + return -EINVAL; } - rcu_read_lock(); - { - struct fib_table *tbl = NULL; - struct flowi4 fl4 = { - .daddr = nh->fib_nh_gw4, - .flowi4_scope = cfg->fc_scope + 1, - .flowi4_oif = nh->fib_nh_oif, - 
.flowi4_iif = LOOPBACK_IFINDEX, - }; - - /* It is not necessary, but requires a bit of thinking */ - if (fl4.flowi4_scope < RT_SCOPE_LINK) - fl4.flowi4_scope = RT_SCOPE_LINK; - - if (cfg->fc_table) - tbl = fib_get_table(net, cfg->fc_table); - - if (tbl) - err = fib_table_lookup(tbl, &fl4, &res, - FIB_LOOKUP_IGNORE_LINKSTATE | - FIB_LOOKUP_NOREF); - - /* on error or if no table given do full lookup. This - * is needed for example when nexthops are in the local - * table rather than the given table - */ - if (!tbl || err) { - err = fib_lookup(net, &fl4, &res, - FIB_LOOKUP_IGNORE_LINKSTATE); - } - - if (err) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway"); - rcu_read_unlock(); - return err; - } + dev = __dev_get_by_index(net, nh->fib_nh_oif); + if (!dev) { + NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); + return -ENODEV; } - err = -EINVAL; - if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { - NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); - goto out; + if (!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + return -ENETDOWN; } - nh->fib_nh_scope = res.scope; - nh->fib_nh_oif = FIB_RES_OIF(res); - nh->fib_nh_dev = dev = FIB_RES_DEV(res); - if (!dev) { - NL_SET_ERR_MSG(extack, - "No egress device for nexthop gateway"); - goto out; + addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4); + if (addr_type != RTN_UNICAST) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); + return -EINVAL; } - dev_hold(dev); if (!netif_carrier_ok(dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; - err = (dev->flags & IFF_UP) ? 
0 : -ENETDOWN; - } else { - struct in_device *in_dev; - - if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { - NL_SET_ERR_MSG(extack, - "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); - return -EINVAL; + nh->fib_nh_dev = dev; + dev_hold(dev); + nh->fib_nh_scope = RT_SCOPE_LINK; + return 0; + } + rcu_read_lock(); + { + struct fib_table *tbl = NULL; + struct flowi4 fl4 = { + .daddr = nh->fib_nh_gw4, + .flowi4_scope = scope + 1, + .flowi4_oif = nh->fib_nh_oif, + .flowi4_iif = LOOPBACK_IFINDEX, + }; + + /* It is not necessary, but requires a bit of thinking */ + if (fl4.flowi4_scope < RT_SCOPE_LINK) + fl4.flowi4_scope = RT_SCOPE_LINK; + + if (table) + tbl = fib_get_table(net, table); + + if (tbl) + err = fib_table_lookup(tbl, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE | + FIB_LOOKUP_NOREF); + + /* on error or if no table given do full lookup. This + * is needed for example when nexthops are in the local + * table rather than the given table + */ + if (!tbl || err) { + err = fib_lookup(net, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE); } - rcu_read_lock(); - err = -ENODEV; - in_dev = inetdev_by_index(net, nh->fib_nh_oif); - if (!in_dev) - goto out; - err = -ENETDOWN; - if (!(in_dev->dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); + + if (err) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; } - nh->fib_nh_dev = in_dev->dev; - dev_hold(nh->fib_nh_dev); - nh->fib_nh_scope = RT_SCOPE_HOST; - if (!netif_carrier_ok(nh->fib_nh_dev)) - nh->fib_nh_flags |= RTNH_F_LINKDOWN; - err = 0; } + + err = -EINVAL; + if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); + goto out; + } + nh->fib_nh_scope = res.scope; + nh->fib_nh_oif = FIB_RES_OIF(res); + nh->fib_nh_dev = dev = FIB_RES_DEV(res); + if (!dev) { + NL_SET_ERR_MSG(extack, + "No egress device for nexthop gateway"); + goto out; + } + dev_hold(dev); + if (!netif_carrier_ok(dev)) + 
nh->fib_nh_flags |= RTNH_F_LINKDOWN; + err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; +out: + rcu_read_unlock(); + return err; +} + +static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh, + struct netlink_ext_ack *extack) +{ + struct in_device *in_dev; + int err; + + if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { + NL_SET_ERR_MSG(extack, + "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); + return -EINVAL; + } + + rcu_read_lock(); + + err = -ENODEV; + in_dev = inetdev_by_index(net, nh->fib_nh_oif); + if (!in_dev) + goto out; + err = -ENETDOWN; + if (!(in_dev->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); + goto out; + } + + nh->fib_nh_dev = in_dev->dev; + dev_hold(nh->fib_nh_dev); + nh->fib_nh_scope = RT_SCOPE_HOST; + if (!netif_carrier_ok(nh->fib_nh_dev)) + nh->fib_nh_flags |= RTNH_F_LINKDOWN; + err = 0; out: rcu_read_unlock(); return err; } +static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, + struct netlink_ext_ack *extack) +{ + struct net *net = cfg->fc_nlinfo.nl_net; + u32 table = cfg->fc_table; + int err; + + if (nh->fib_nh_gw_family == AF_INET) + err = fib_check_nh_v4_gw(net, nh, table, cfg->fc_scope, extack); + else if (nh->fib_nh_gw_family == AF_INET6) + err = fib_check_nh_v6_gw(net, nh, table, extack); + else + err = fib_check_nh_nongw(net, nh, extack); + + return err; +} + static inline unsigned int fib_laddr_hashfn(__be32 val) { unsigned int mask = (fib_info_hash_size - 1); @@ -1204,7 +1327,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto failure; if (fib_props[cfg->fc_type].error) { - if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) { + if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Gateway, device and multipath can not be specified for this route type"); goto err_inval; @@ -1238,7 +1361,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, "Route with host scope can not have multiple nexthops"); goto 
err_inval; } - if (nh->fib_nh_gw4) { + if (nh->fib_nh_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); goto err_inval; @@ -1269,6 +1392,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg, change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); + if (nexthop_nh->fib_nh_gw_family == AF_INET6) + fi->fib_nh_is_v6 = true; } endfor_nexthops(fi) fib_rebalance(fi); @@ -1319,7 +1444,7 @@ failure: } int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, - unsigned int *flags, bool skip_oif) + unsigned char *flags, bool skip_oif) { if (nhc->nhc_flags & RTNH_F_DEAD) *flags |= RTNH_F_DEAD; @@ -1341,18 +1466,32 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, rcu_read_unlock(); } - if (nhc->nhc_has_gw) { - switch (nhc->nhc_family) { - case AF_INET: - if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4)) - goto nla_put_failure; - break; - case AF_INET6: - if (nla_put_in6_addr(skb, RTA_GATEWAY, - &nhc->nhc_gw.ipv6) < 0) + switch (nhc->nhc_gw_family) { + case AF_INET: + if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4)) + goto nla_put_failure; + break; + case AF_INET6: + /* if gateway family does not match nexthop family + * gateway is encoded as RTA_VIA + */ + if (nhc->nhc_gw_family != nhc->nhc_family) { + int alen = sizeof(struct in6_addr); + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) goto nla_put_failure; - break; + + via = nla_data(nla); + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen); + } else if (nla_put_in6_addr(skb, RTA_GATEWAY, + &nhc->nhc_gw.ipv6) < 0) { + goto nla_put_failure; } + break; } *flags |= (nhc->nhc_flags & RTNH_F_ONLINK); @@ -1364,7 +1503,8 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, goto nla_put_failure; if (nhc->nhc_lwtstate && - lwtunnel_fill_encap(skb, nhc->nhc_lwtstate) < 0) + lwtunnel_fill_encap(skb, nhc->nhc_lwtstate, 
+ RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; return 0; @@ -1380,7 +1520,7 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, { const struct net_device *dev = nhc->nhc_dev; struct rtnexthop *rtnh; - unsigned int flags = 0; + unsigned char flags = 0; rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) @@ -1479,7 +1619,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; if (fi->fib_nhs == 1) { struct fib_nh *nh = &fi->fib_nh[0]; - unsigned int flags = 0; + unsigned char flags = 0; if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0) goto nla_put_failure; @@ -1762,7 +1902,7 @@ out: * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. */ -int fib_sync_up(struct net_device *dev, unsigned int nh_flags) +int fib_sync_up(struct net_device *dev, unsigned char nh_flags) { struct fib_info *prev_fi; unsigned int hash; @@ -1832,8 +1972,14 @@ static bool fib_good_nh(const struct fib_nh *nh) rcu_read_lock_bh(); - n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, - (__force u32)nh->fib_nh_gw4); + if (likely(nh->fib_nh_gw_family == AF_INET)) + n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, + (__force u32)nh->fib_nh_gw4); + else if (nh->fib_nh_gw_family == AF_INET6) + n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, + &nh->fib_nh_gw6); + else + n = NULL; if (n) state = n->nud_state; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 100e63f57ea6..1ca1586a7e46 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -121,6 +121,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) struct guehdr *guehdr; void *data; u16 doffset = 0; + u8 proto_ctype; if (!fou) return 1; @@ -136,7 +137,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) break; case 1: { - /* Direct encasulation of IPv4 or IPv6 */ + /* Direct encapsulation of IPv4 or IPv6 */ int prot; @@ -170,9 +171,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) /* 
guehdr may change after pull */ guehdr = (struct guehdr *)&udp_hdr(skb)[1]; - hdrlen = sizeof(struct guehdr) + optlen; - - if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen)) + if (validate_gue_flags(guehdr, optlen)) goto drop; hdrlen = sizeof(struct guehdr) + optlen; @@ -212,13 +211,14 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) if (unlikely(guehdr->control)) return gue_control_message(skb, guehdr); + proto_ctype = guehdr->proto_ctype; __skb_pull(skb, sizeof(struct udphdr) + hdrlen); skb_reset_transport_header(skb); if (iptunnel_pull_offloads(skb)) goto drop; - return -guehdr->proto_ctype; + return -proto_ctype; drop: kfree_skb(skb); @@ -1137,7 +1137,7 @@ static int gue_err(struct sk_buff *skb, u32 info) case 0: /* Full GUE header present */ break; case 1: { - /* Direct encasulation of IPv4 or IPv6 */ + /* Direct encapsulation of IPv4 or IPv6 */ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); switch (((struct iphdr *)guehdr)->version) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6ea523d71947..a175e3e7ae97 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -564,7 +564,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_gw_family) goto route_err; rcu_read_unlock(); return &rt->dst; @@ -602,7 +602,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_gw_family) goto route_err; return &rt->dst; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 00ec819f949b..06f6f280b9ff 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -123,7 +123,7 
@@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && rt->rt_uses_gateway) + if (opt->is_strictroute && rt->rt_gw_family) goto sr_failed; IPCB(skb)->flags |= IPSKB_FORWARDED; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fd219f7bd3ea..4b0526441476 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -259,7 +259,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; struct erspan_base_hdr *ershdr; - struct erspan_metadata *pkt_md; struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; const struct iphdr *iph; @@ -282,9 +281,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; - ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); - pkt_md = (struct erspan_metadata *)(ershdr + 1); - if (__iptunnel_pull_header(skb, len, htons(ETH_P_TEB), @@ -292,8 +288,9 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, goto drop; if (tunnel->collect_md) { + struct erspan_metadata *pkt_md, *md; struct ip_tunnel_info *info; - struct erspan_metadata *md; + unsigned char *gh; __be64 tun_id; __be16 flags; @@ -306,6 +303,14 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (!tun_dst) return PACKET_REJECT; + /* skb can be uncloned in __iptunnel_pull_header, so + * old pkt_md is no longer valid and we need to reset + * it + */ + gh = skb_network_header(skb) + + skb_network_header_len(skb); + pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len + + sizeof(*ershdr)); md = ip_tunnel_info_opts(&tun_dst->u.tun_info); md->version = ver; md2 = &md->u.md2; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 10b35328cfbc..4e42c1974ba2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -188,7 +188,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s struct net_device 
*dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; - u32 nexthop; + bool is_v6gw = false; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len); @@ -218,16 +218,13 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s } rcu_read_lock_bh(); - nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); - neigh = __ipv4_neigh_lookup_noref(dev, nexthop); - if (unlikely(!neigh)) - neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int res; sock_confirm_neigh(skb, neigh); - res = neigh_output(neigh, skb); - + /* if crossing protocols, can not use the cached header */ + res = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock_bh(); return res; } @@ -472,7 +469,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9a3f13edc98e..a8eb97777c0a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -66,7 +66,7 @@ #include <net/netlink.h> #include <net/fib_rules.h> #include <linux/netconf.h> -#include <net/nexthop.h> +#include <net/rtnh.h> #include <linux/nospec.h> diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index c98391d49200..1412b029f37f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -27,14 +27,6 @@ config NF_TABLES_IPV4 if NF_TABLES_IPV4 -config NFT_CHAIN_ROUTE_IPV4 - tristate "IPv4 nf_tables route chain support" - help - This option enables the "route" chain for IPv4 in nf_tables. 
This - chain type is used to force packet re-routing after mangling header - fields such as the source, destination, type of service and - the packet mark. - config NFT_REJECT_IPV4 select NF_REJECT_IPV4 default NFT_REJECT @@ -232,16 +224,10 @@ if IP_NF_NAT config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" - select NF_NAT_MASQUERADE - default m if NETFILTER_ADVANCED=n + select NETFILTER_XT_TARGET_MASQUERADE help - Masquerading is a special case of NAT: all outgoing connections are - changed to seem to come from a particular interface's address, and - if the interface goes down, those connections are lost. This is - only useful for dialup accounts with dynamic IP address (ie. your IP - address will be different on next dialup). - - To compile it as a module, choose M here. If unsure, say N. + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE. config IP_NF_TARGET_NETMAP tristate "NETMAP target support" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index e241f5188ebe..c50e0ec095d2 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -24,7 +24,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o $(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o -obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o @@ -49,7 +48,6 @@ obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o # targets obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o -obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o diff --git 
a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c deleted file mode 100644 index 7d82934c46f4..000000000000 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> - * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/list.h> -#include <linux/skbuff.h> -#include <linux/netlink.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables_ipv4.h> -#include <net/route.h> -#include <net/ip.h> - -static unsigned int nf_route_table_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - unsigned int ret; - struct nft_pktinfo pkt; - u32 mark; - __be32 saddr, daddr; - u_int8_t tos; - const struct iphdr *iph; - int err; - - nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); - - mark = skb->mark; - iph = ip_hdr(skb); - saddr = iph->saddr; - daddr = iph->daddr; - tos = iph->tos; - - ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_STOLEN) { - iph = ip_hdr(skb); - - if (iph->saddr != saddr || - iph->daddr != daddr || - skb->mark != mark || - iph->tos != tos) { - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); - if (err < 0) - ret = NF_DROP_ERR(err); - } - } - return ret; -} - -static const struct nft_chain_type nft_chain_route_ipv4 = { - .name = "route", - .type = NFT_CHAIN_T_ROUTE, - .family = NFPROTO_IPV4, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_INET_LOCAL_OUT), - .hooks = { - [NF_INET_LOCAL_OUT] = nf_route_table_hook, - }, -}; - -static int __init 
nft_chain_route_init(void) -{ - nft_register_chain_type(&nft_chain_route_ipv4); - - return 0; -} - -static void __exit nft_chain_route_exit(void) -{ - nft_unregister_chain_type(&nft_chain_route_ipv4); -} - -module_init(nft_chain_route_init); -module_exit(nft_chain_route_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_CHAIN(AF_INET, "route"); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f3f2adf630d4..4950adeb05c0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -434,37 +434,46 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { + const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; - const __be32 *pkey = daddr; - const struct rtable *rt; struct neighbour *n; - rt = (const struct rtable *) dst; - if (rt->rt_gateway) - pkey = (const __be32 *) &rt->rt_gateway; - else if (skb) - pkey = &ip_hdr(skb)->daddr; + rcu_read_lock_bh(); + + if (likely(rt->rt_gw_family == AF_INET)) { + n = ip_neigh_gw4(dev, rt->rt_gw4); + } else if (rt->rt_gw_family == AF_INET6) { + n = ip_neigh_gw6(dev, &rt->rt_gw6); + } else { + __be32 pkey; + + pkey = skb ? 
ip_hdr(skb)->daddr : *((__be32 *) daddr); + n = ip_neigh_gw4(dev, pkey); + } + + if (n && !refcount_inc_not_zero(&n->refcnt)) + n = NULL; + + rcu_read_unlock_bh(); - n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); - if (n) - return n; - return neigh_create(&arp_tbl, pkey, dev); + return n; } static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) { + const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; const __be32 *pkey = daddr; - const struct rtable *rt; - rt = (const struct rtable *)dst; - if (rt->rt_gateway) - pkey = (const __be32 *)&rt->rt_gateway; - else if (!daddr || + if (rt->rt_gw_family == AF_INET) { + pkey = (const __be32 *)&rt->rt_gw4; + } else if (rt->rt_gw_family == AF_INET6) { + return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6); + } else if (!daddr || (rt->rt_flags & - (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) + (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) { return; - + } __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } @@ -629,8 +638,8 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; - rt->rt_gateway = fnhe->fnhe_gw; - rt->rt_uses_gateway = 1; + rt->rt_gw_family = AF_INET; + rt->rt_gw4 = fnhe->fnhe_gw; } } @@ -747,7 +756,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow return; } - if (rt->rt_gateway != old_gw) + if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw) return; in_dev = __in_dev_get_rcu(dev); @@ -1189,11 +1198,39 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) return dst; } +static void ipv4_send_dest_unreach(struct sk_buff *skb) +{ + struct ip_options opt; + int res; + + /* Recompile ip options since IPCB may not be valid anymore. + * Also check we have a reasonable ipv4 header. 
+ */ + if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) || + ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) + return; + + memset(&opt, 0, sizeof(opt)); + if (ip_hdr(skb)->ihl > 5) { + if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) + return; + opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); + + rcu_read_lock(); + res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL); + rcu_read_unlock(); + + if (res) + return; + } + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); +} + static void ipv4_link_failure(struct sk_buff *skb) { struct rtable *rt; - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + ipv4_send_dest_unreach(skb); rt = skb_rtable(skb); if (rt) @@ -1282,7 +1319,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = READ_ONCE(dst->dev->mtu); if (unlikely(ip_mtu_locked(dst))) { - if (rt->rt_uses_gateway && mtu > 576) + if (rt->rt_gw_family && mtu > 576) mtu = 576; } @@ -1410,8 +1447,10 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, orig = NULL; } fill_route_from_fnhe(rt, fnhe); - if (!rt->rt_gateway) - rt->rt_gateway = daddr; + if (!rt->rt_gw4) { + rt->rt_gw4 = daddr; + rt->rt_gw_family = AF_INET; + } if (do_cache) { dst_hold(&rt->dst); @@ -1535,14 +1574,20 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); - struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common); + struct fib_nh *nh; - if (nh->fib_nh_gw4 && nh->fib_nh_scope == RT_SCOPE_LINK) { - rt->rt_gateway = nh->fib_nh_gw4; - rt->rt_uses_gateway = 1; + if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { + rt->rt_gw_family = nhc->nhc_gw_family; + /* only INET and INET6 are supported */ + if (likely(nhc->nhc_gw_family == AF_INET)) + rt->rt_gw4 = nhc->nhc_gw.ipv4; + else + rt->rt_gw6 = nhc->nhc_gw.ipv6; } + ip_dst_init_metrics(&rt->dst, fi->fib_metrics); + nh = container_of(nhc, struct fib_nh, nh_common); #ifdef 
CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif @@ -1557,8 +1602,10 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, * However, if we are unsuccessful at storing this * route into the cache we really need to set it. */ - if (!rt->rt_gateway) - rt->rt_gateway = daddr; + if (!rt->rt_gw4) { + rt->rt_gw_family = AF_INET; + rt->rt_gw4 = daddr; + } rt_add_uncached_list(rt); } } else @@ -1591,8 +1638,8 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; - rt->rt_gateway = 0; - rt->rt_uses_gateway = 0; + rt->rt_gw_family = 0; + rt->rt_gw4 = 0; INIT_LIST_HEAD(&rt->rt_uncached); rt->dst.output = ip_output; @@ -1734,8 +1781,9 @@ static int __mkroute_input(struct sk_buff *skb, do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && skb->protocol == htons(ETH_P_IP)) { - __be32 gw = nhc->nhc_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; + __be32 gw; + gw = nhc->nhc_gw_family == AF_INET ? 
nhc->nhc_gw.ipv4 : 0; if (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, gw)) IPCB(skb)->flags |= IPSKB_DOREDIRECT; @@ -2284,7 +2332,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } else { if (unlikely(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH && - !(nhc->nhc_has_gw && + !(nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK))) { do_cache = false; goto add; @@ -2594,8 +2642,11 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; - rt->rt_gateway = ort->rt_gateway; - rt->rt_uses_gateway = ort->rt_uses_gateway; + rt->rt_gw_family = ort->rt_gw_family; + if (rt->rt_gw_family == AF_INET) + rt->rt_gw4 = ort->rt_gw4; + else if (rt->rt_gw_family == AF_INET6) + rt->rt_gw6 = ort->rt_gw6; INIT_LIST_HEAD(&rt->rt_uncached); } @@ -2674,9 +2725,22 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_uses_gateway && - nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway)) + if (rt->rt_gw_family == AF_INET && + nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto nla_put_failure; + } else if (rt->rt_gw_family == AF_INET6) { + int alen = sizeof(struct in6_addr); + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + goto nla_put_failure; + + via = nla_data(nla); + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + } expires = rt->dst.expires; if (expires) { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 2316c08e9591..875867b64d6a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -49,6 +49,7 @@ static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; static u32 u32_max_div_HZ = UINT_MAX / HZ; +static 
int one_day_secs = 24 * 3600; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1160,7 +1161,9 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one_day_secs }, { .procname = "tcp_autocorking", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 603e770d59b3..f7567a3698eb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -868,7 +868,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, if (likely(!size)) { skb = sk->sk_tx_skb_cache; if (skb && !skb_cloned(skb)) { - skb->truesize -= skb->data_len; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); sk->sk_tx_skb_cache = NULL; pskb_trim(skb, 0); INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 359da68d7c06..477cb4aa456c 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -49,9 +49,8 @@ #define DCTCP_MAX_ALPHA 1024U struct dctcp { - u32 acked_bytes_ecn; - u32 acked_bytes_total; - u32 prior_snd_una; + u32 old_delivered; + u32 old_delivered_ce; u32 prior_rcv_nxt; u32 dctcp_alpha; u32 next_seq; @@ -73,8 +72,8 @@ static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) { ca->next_seq = tp->snd_nxt; - ca->acked_bytes_ecn = 0; - ca->acked_bytes_total = 0; + ca->old_delivered = tp->delivered; + ca->old_delivered_ce = tp->delivered_ce; } static void dctcp_init(struct sock *sk) @@ -86,7 +85,6 @@ static void dctcp_init(struct sock *sk) sk->sk_state == TCP_CLOSE)) { struct dctcp *ca = inet_csk_ca(sk); - ca->prior_snd_una = tp->snd_una; ca->prior_rcv_nxt = tp->rcv_nxt; ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); @@ -118,37 +116,25 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); struct dctcp *ca = inet_csk_ca(sk); - u32 acked_bytes = tp->snd_una - 
ca->prior_snd_una; - - /* If ack did not advance snd_una, count dupack as MSS size. - * If ack did update window, do not count it at all. - */ - if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE)) - acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss; - if (acked_bytes) { - ca->acked_bytes_total += acked_bytes; - ca->prior_snd_una = tp->snd_una; - - if (flags & CA_ACK_ECE) - ca->acked_bytes_ecn += acked_bytes; - } /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { - u64 bytes_ecn = ca->acked_bytes_ecn; + u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce; u32 alpha = ca->dctcp_alpha; /* alpha = (1 - g) * alpha + g * F */ alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); - if (bytes_ecn) { + if (delivered_ce) { + u32 delivered = tp->delivered - ca->old_delivered; + /* If dctcp_shift_g == 1, a 32bit value would overflow - * after 8 Mbytes. + * after 8 M packets. */ - bytes_ecn <<= (10 - dctcp_shift_g); - do_div(bytes_ecn, max(1U, ca->acked_bytes_total)); + delivered_ce <<= (10 - dctcp_shift_g); + delivered_ce /= max(1U, delivered); - alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA); + alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA); } /* dctcp_alpha can be read from dctcp_get_info() without * synchro, so we ask compiler to not use dctcp_alpha @@ -200,6 +186,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { const struct dctcp *ca = inet_csk_ca(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* Fill it also in case of VEGASINFO due to req struct limits. * We can still correctly retrieve it later. 
@@ -211,8 +198,10 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, info->dctcp.dctcp_enabled = 1; info->dctcp.dctcp_ce_state = (u16) ca->ce_state; info->dctcp.dctcp_alpha = ca->dctcp_alpha; - info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn; - info->dctcp.dctcp_ab_tot = ca->acked_bytes_total; + info->dctcp.dctcp_ab_ecn = tp->mss_cache * + (tp->delivered_ce - ca->old_delivered_ce); + info->dctcp.dctcp_ab_tot = tp->mss_cache * + (tp->delivered - ca->old_delivered); } *attr = INET_DIAG_DCTCPINFO; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6660ce2a7333..97671bff597a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -402,11 +402,12 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + int room; + + room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh; /* Check #1 */ - if (tp->rcv_ssthresh < tp->window_clamp && - (int)tp->rcv_ssthresh < tcp_space(sk) && - !tcp_under_memory_pressure(sk)) { + if (room > 0 && !tcp_under_memory_pressure(sk)) { int incr; /* Check #2. 
Increase window, if skb with such overhead @@ -419,8 +420,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) if (incr) { incr = max_t(int, incr, 2 * skb->len); - tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, - tp->window_clamp); + tp->rcv_ssthresh += min(room, incr); inet_csk(sk)->icsk_ack.quick |= 1; } } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 372fdc5381a9..3c58ba02af7d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1631,7 +1631,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) EXPORT_SYMBOL(udp_ioctl); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, - int noblock, int *peeked, int *off, int *err) + int noblock, int *off, int *err) { struct sk_buff_head *sk_queue = &sk->sk_receive_queue; struct sk_buff_head *queue; @@ -1650,13 +1650,11 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, break; error = -EAGAIN; - *peeked = 0; do { spin_lock_bh(&queue->lock); skb = __skb_try_recv_from_queue(sk, queue, flags, udp_skb_destructor, - peeked, off, err, - &last); + off, err, &last); if (skb) { spin_unlock_bh(&queue->lock); return skb; @@ -1677,8 +1675,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, skb = __skb_try_recv_from_queue(sk, queue, flags, udp_skb_dtor_locked, - peeked, off, err, - &last); + off, err, &last); spin_unlock(&sk_queue->lock); spin_unlock_bh(&queue->lock); if (skb) @@ -1713,8 +1710,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; unsigned int ulen, copied; - int peeked, peeking, off; - int err; + int off, err, peeking = flags & MSG_PEEK; int is_udplite = IS_UDPLITE(sk); bool checksum_valid = false; @@ -1722,9 +1718,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, return ip_recv_error(sk, msg, len, addr_len); try_again: - peeking = flags & MSG_PEEK; off = sk_peek_offset(sk, flags); - 
skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); + skb = __skb_recv_udp(sk, flags, noblock, &off, &err); if (!skb) return err; @@ -1762,7 +1757,7 @@ try_again: } if (unlikely(err)) { - if (!peeked) { + if (!peeking) { atomic_inc(&sk->sk_drops); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); @@ -1771,7 +1766,7 @@ try_again: return err; } - if (!peeked) + if (!peeking) UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, is_udplite); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index d73a6d6652f6..72d19b1838ed 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -97,8 +97,11 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; - xdst->u.rt.rt_gateway = rt->rt_gateway; - xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; + xdst->u.rt.rt_gw_family = rt->rt_gw_family; + if (rt->rt_gw_family == AF_INET) + xdst->u.rt.rt_gw4 = rt->rt_gw4; + else if (rt->rt_gw_family == AF_INET6) + xdst->u.rt.rt_gw6 = rt->rt_gw6; xdst->u.rt.rt_pmtu = rt->rt_pmtu; xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2e8d1d2d8d3d..340a0f06f974 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2421,7 +2421,7 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, for_each_fib6_node_rt_rcu(fn) { if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex) continue; - if (no_gw && rt->fib6_nh.fib_nh_has_gw) + if (no_gw && rt->fib6_nh.fib_nh_gw_family) continue; if ((rt->fib6_flags & flags) != flags) continue; diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 945b66e3008f..763a947e0d14 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -144,43 +144,53 @@ static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id) 
return NULL; } -static struct fib6_info * +static int eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, int flags) + int oif, struct flowi6 *fl6, + struct fib6_result *res, int flags) { - return NULL; + return -EAFNOSUPPORT; } -static struct fib6_info * +static int eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, - int flags) + struct fib6_result *res, int flags) { - return NULL; + return -EAFNOSUPPORT; } -static struct fib6_info * -eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i, - struct flowi6 *fl6, int oif, - const struct sk_buff *skb, int strict) +static void +eafnosupport_fib6_select_path(const struct net *net, struct fib6_result *res, + struct flowi6 *fl6, int oif, bool have_oif_match, + const struct sk_buff *skb, int strict) { - return f6i; } static u32 -eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, - struct in6_addr *saddr) +eafnosupport_ip6_mtu_from_fib6(const struct fib6_result *res, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { return 0; } +static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, + struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); + return -EAFNOSUPPORT; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, .ipv6_route_input = eafnosupport_ipv6_route_input, .fib6_get_table = eafnosupport_fib6_get_table, .fib6_table_lookup = eafnosupport_fib6_table_lookup, .fib6_lookup = eafnosupport_fib6_lookup, - .fib6_multipath_select = eafnosupport_fib6_multipath_select, + .fib6_select_path = eafnosupport_fib6_select_path, .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, + .fib6_nh_init = eafnosupport_fib6_nh_init, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c 
index d43d076c98f5..1766325423b5 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -476,7 +476,7 @@ static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh, } if (nlmsg_attrlen(nlh, sizeof(*ifal))) { - NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump requewst"); + NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump request"); return -EINVAL; } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 1789bf99c419..c04ae282f4e4 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -547,12 +547,6 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct net *net = sock_net(sk); switch (cmd) { - case SIOCGSTAMP: - return sock_get_timestamp(sk, (struct timeval __user *)arg); - - case SIOCGSTAMPNS: - return sock_get_timestampns(sk, (struct timespec __user *)arg); - case SIOCADDRT: case SIOCDELRT: @@ -585,6 +579,7 @@ const struct proto_ops inet6_stream_ops = { .getname = inet6_getname, .poll = tcp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ + .gettstamp = sock_gettstamp, .listen = inet_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ @@ -618,6 +613,7 @@ const struct proto_ops inet6_dgram_ops = { .getname = inet6_getname, .poll = udp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ + .gettstamp = sock_gettstamp, .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ @@ -850,6 +846,15 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.icmpv6_echo_ignore_all = 0; net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0; net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0; + + /* By default, rate limit error messages. + * Except for pmtu discovery, it would break it. + * proc_do_large_bitmap needs pointer to the bitmap. 
+ */ + bitmap_set(net->ipv6.sysctl.icmpv6_ratemask, 0, ICMPV6_ERRMSG_MAX + 1); + bitmap_clear(net->ipv6.sysctl.icmpv6_ratemask, ICMPV6_PKT_TOOBIG, 1); + net->ipv6.sysctl.icmpv6_ratemask_ptr = net->ipv6.sysctl.icmpv6_ratemask; + net->ipv6.sysctl.flowlabel_consistency = 1; net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS; net->ipv6.sysctl.idgen_retries = 3; @@ -917,8 +922,10 @@ static const struct ipv6_stub ipv6_stub_impl = { .fib6_get_table = fib6_get_table, .fib6_table_lookup = fib6_table_lookup, .fib6_lookup = fib6_lookup, - .fib6_multipath_select = fib6_multipath_select, + .fib6_select_path = fib6_select_path, .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, + .fib6_nh_init = fib6_nh_init, + .fib6_nh_release = fib6_nh_release, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index f590446595d8..06d1b7763600 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -61,16 +61,16 @@ unsigned int fib6_rules_seq_read(struct net *net) } /* called with rcu lock held; no reference taken on fib6_info */ -struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, - int flags) +int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + struct fib6_result *res, int flags) { - struct fib6_info *f6i; int err; if (net->ipv6.fib6_has_custom_rules) { struct fib_lookup_arg arg = { .lookup_ptr = fib6_table_lookup, .lookup_data = &oif, + .result = res, .flags = FIB_LOOKUP_NOREF, }; @@ -78,19 +78,15 @@ struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, err = fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); - if (err) - return ERR_PTR(err); - - f6i = arg.result ? 
: net->ipv6.fib6_null_entry; } else { - f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, - oif, fl6, flags); - if (!f6i || f6i == net->ipv6.fib6_null_entry) - f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl, - oif, fl6, flags); + err = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, oif, + fl6, res, flags); + if (err || res->f6i == net->ipv6.fib6_null_entry) + err = fib6_table_lookup(net, net->ipv6.fib6_main_tbl, + oif, fl6, res, flags); } - return f6i; + return err; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, @@ -98,9 +94,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, int flags, pol_lookup_t lookup) { if (net->ipv6.fib6_has_custom_rules) { + struct fib6_result res = {}; struct fib_lookup_arg arg = { .lookup_ptr = lookup, .lookup_data = skb, + .result = &res, .flags = FIB_LOOKUP_NOREF, }; @@ -110,8 +108,8 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); - if (arg.result) - return arg.result; + if (res.rt6) + return &res.rt6->dst; } else { struct rt6_info *rt; @@ -157,11 +155,11 @@ static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags, static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { + struct fib6_result *res = arg->result; struct flowi6 *flp6 = &flp->u.ip6; struct net *net = rule->fr_net; struct fib6_table *table; - struct fib6_info *f6i; - int err = -EAGAIN, *oif; + int err, *oif; u32 tb_id; switch (rule->action) { @@ -182,14 +180,12 @@ static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp, return -EAGAIN; oif = (int *)arg->lookup_data; - f6i = fib6_table_lookup(net, table, *oif, flp6, flags); - if (f6i != net->ipv6.fib6_null_entry) { + err = fib6_table_lookup(net, table, *oif, flp6, res, flags); + if (!err && res->f6i != net->ipv6.fib6_null_entry) err = fib6_rule_saddr(net, rule, 
flags, flp6, - fib6_info_nh_dev(f6i)); - - if (likely(!err)) - arg->result = f6i; - } + res->nh->fib_nh_dev); + else + err = -EAGAIN; return err; } @@ -197,6 +193,7 @@ static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp, static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { + struct fib6_result *res = arg->result; struct flowi6 *flp6 = &flp->u.ip6; struct rt6_info *rt = NULL; struct fib6_table *table; @@ -251,7 +248,7 @@ again: discard_pkt: dst_hold(&rt->dst); out: - arg->result = rt; + res->rt6 = rt; return err; } @@ -266,9 +263,13 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { - struct rt6_info *rt = (struct rt6_info *) arg->result; + struct fib6_result *res = arg->result; + struct rt6_info *rt = res->rt6; struct net_device *dev = NULL; + if (!rt) + return false; + if (rt->rt6i_idev) dev = rt->rt6i_idev->dev; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index cc14b9998941..afb915807cd0 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -168,22 +168,21 @@ static bool is_ineligible(const struct sk_buff *skb) return false; } -static bool icmpv6_mask_allow(int type) +static bool icmpv6_mask_allow(struct net *net, int type) { - /* Informational messages are not limited. */ - if (type & ICMPV6_INFOMSG_MASK) + if (type > ICMPV6_MSG_MAX) return true; - /* Do not limit pmtu discovery, it would break it. */ - if (type == ICMPV6_PKT_TOOBIG) + /* Limit if icmp type is set in ratemask. 
*/ + if (!test_bit(type, net->ipv6.sysctl.icmpv6_ratemask)) return true; return false; } -static bool icmpv6_global_allow(int type) +static bool icmpv6_global_allow(struct net *net, int type) { - if (icmpv6_mask_allow(type)) + if (icmpv6_mask_allow(net, type)) return true; if (icmp_global_allow()) @@ -202,7 +201,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct dst_entry *dst; bool res = false; - if (icmpv6_mask_allow(type)) + if (icmpv6_mask_allow(net, type)) return true; /* @@ -511,7 +510,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit */ - if (!(skb->dev->flags&IFF_LOOPBACK) && !icmpv6_global_allow(type)) + if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) goto out_bh_enable; mip6_addr_swap(skb); @@ -731,6 +730,11 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if (IS_ERR(dst)) goto out; + /* Check the ratelimit */ + if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY)) || + !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6)) + goto out_dst_release; + idev = __in6_dev_get(skb->dev); msg.skb = skb; @@ -751,6 +755,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); } +out_dst_release: dst_release(dst); out: icmpv6_xmit_unlock(sk); @@ -1137,6 +1142,13 @@ static struct ctl_table ipv6_icmp_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "ratemask", + .data = &init_net.ipv6.sysctl.icmpv6_ratemask_ptr, + .maxlen = ICMPV6_MSG_MAX + 1, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, { }, }; @@ -1153,6 +1165,7 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all; table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast; table[3].data = 
&net->ipv6.sysctl.icmpv6_echo_ignore_anycast; + table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr; } return table; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 8c00609a1513..a8919c217cc2 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -162,7 +162,7 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) } INIT_LIST_HEAD(&f6i->fib6_siblings); - atomic_inc(&f6i->fib6_ref); + refcount_set(&f6i->fib6_ref, 1); return f6i; } @@ -175,10 +175,7 @@ void fib6_info_destroy_rcu(struct rcu_head *head) WARN_ON(f6i->fib6_node); bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); - if (bucket) { - f6i->rt6i_exception_bucket = NULL; - kfree(bucket); - } + kfree(bucket); if (f6i->rt6i_pcpu) { int cpu; @@ -354,10 +351,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, } /* called with rcu lock held; no reference taken on fib6_info */ -struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, - int flags) +int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + struct fib6_result *res, int flags) { - return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags); + return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, + res, flags); } static void __net_init fib6_tables_init(struct net *net) @@ -848,8 +846,8 @@ insert_above: RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; - atomic_inc(&rcu_dereference_protected(in->leaf, - lockdep_is_held(&table->tb6_lock))->fib6_ref); + fib6_info_hold(rcu_dereference_protected(in->leaf, + lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) @@ -931,7 +929,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, { struct fib6_table *table = rt->fib6_table; - if (atomic_read(&rt->fib6_ref) != 1) { + if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. 
It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes @@ -944,7 +942,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); - atomic_inc(&new_leaf->fib6_ref); + fib6_info_hold(new_leaf); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); @@ -1110,7 +1108,7 @@ add: return err; rcu_assign_pointer(rt->fib6_next, iter); - atomic_inc(&rt->fib6_ref); + fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) @@ -1138,7 +1136,7 @@ add: if (err) return err; - atomic_inc(&rt->fib6_ref); + fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); @@ -1280,7 +1278,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, if (!sfn) goto failure; - atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref); + fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; @@ -1323,7 +1321,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { - atomic_inc(&rt->fib6_ref); + fib6_info_hold(rt); rcu_assign_pointer(fn->leaf, rt); } } @@ -2304,7 +2302,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif - if (rt->fib6_nh.fib_nh_has_gw) { + if (rt->fib6_nh.fib_nh_gw_family) { flags |= RTF_GATEWAY; seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6); } else { @@ -2313,7 +2311,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) dev = rt->fib6_nh.fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", - rt->fib6_metric, atomic_read(&rt->fib6_ref), 0, + rt->fib6_metric, refcount_read(&rt->fib6_ref), 
0, flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b32c95f02128..655e46b227f9 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -525,10 +525,10 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) } static int ip6erspan_rcv(struct sk_buff *skb, - struct tnl_ptk_info *tpi) + struct tnl_ptk_info *tpi, + int gre_hdr_len) { struct erspan_base_hdr *ershdr; - struct erspan_metadata *pkt_md; const struct ipv6hdr *ipv6h; struct erspan_md2 *md2; struct ip6_tnl *tunnel; @@ -547,18 +547,16 @@ static int ip6erspan_rcv(struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; - ershdr = (struct erspan_base_hdr *)skb->data; - pkt_md = (struct erspan_metadata *)(ershdr + 1); - if (__iptunnel_pull_header(skb, len, htons(ETH_P_TEB), false, false) < 0) return PACKET_REJECT; if (tunnel->parms.collect_md) { + struct erspan_metadata *pkt_md, *md; struct metadata_dst *tun_dst; struct ip_tunnel_info *info; - struct erspan_metadata *md; + unsigned char *gh; __be64 tun_id; __be16 flags; @@ -571,6 +569,14 @@ static int ip6erspan_rcv(struct sk_buff *skb, if (!tun_dst) return PACKET_REJECT; + /* skb can be uncloned in __iptunnel_pull_header, so + * old pkt_md is no longer valid and we need to reset + * it + */ + gh = skb_network_header(skb) + + skb_network_header_len(skb); + pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len + + sizeof(*ershdr)); info = &tun_dst->u.tun_info; md = ip_tunnel_info_opts(info); md->version = ver; @@ -607,7 +613,7 @@ static int gre_rcv(struct sk_buff *skb) if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || tpi.proto == htons(ETH_P_ERSPAN2))) { - if (ip6erspan_rcv(skb, &tpi) == PACKET_RCVD) + if (ip6erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; goto out; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e51f3c648b09..adef2236abe2 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -117,7 +117,7 
@@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); - ret = neigh_output(neigh, skb); + ret = neigh_output(neigh, skb, false); rcu_read_unlock_bh(); return ret; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 66c8b294e02b..4c8e2ea8bf19 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -77,6 +77,8 @@ static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey); +static bool ndisc_allow_add(const struct net_device *dev, + struct netlink_ext_ack *extack); static int ndisc_constructor(struct neighbour *neigh); static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -117,6 +119,7 @@ struct neigh_table nd_tbl = { .pconstructor = pndisc_constructor, .pdestructor = pndisc_destructor, .proxy_redo = pndisc_redo, + .allow_add = ndisc_allow_add, .id = "ndisc_cache", .parms = { .tbl = &nd_tbl, @@ -392,6 +395,20 @@ static void pndisc_destructor(struct pneigh_entry *n) ipv6_dev_mc_dec(dev, &maddr); } +/* called with rtnl held */ +static bool ndisc_allow_add(const struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct inet6_dev *idev = __in6_dev_get(dev); + + if (!idev || idev->cnf.disable_ipv6) { + NL_SET_ERR_MSG(extack, "IPv6 is disabled on this device"); + return false; + } + + return true; +} + static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, int len) { diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index ddc99a1653aa..086fc669279e 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -23,14 +23,6 @@ config NF_TABLES_IPV6 if NF_TABLES_IPV6 -config NFT_CHAIN_ROUTE_IPV6 - tristate "IPv6 nf_tables route chain support" - help - This option enables the 
"route" chain for IPv6 in nf_tables. This - chain type is used to force packet re-routing after mangling header - fields such as the source, destination, flowlabel, hop-limit and - the packet mark. - config NFT_REJECT_IPV6 select NF_REJECT_IPV6 default NFT_REJECT @@ -278,15 +270,10 @@ if IP6_NF_NAT config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" - select NF_NAT_MASQUERADE + select NETFILTER_XT_TARGET_MASQUERADE help - Masquerading is a special case of NAT: all outgoing connections are - changed to seem to come from a particular interface's address, and - if the interface goes down, those connections are lost. This is - only useful for dialup accounts with dynamic IP address (ie. your IP - address will be different on next dialup). - - To compile it as a module, choose M here. If unsure, say N. + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE. config IP6_NF_TARGET_NPT tristate "NPT (Network Prefix translation) target support" diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 3853c648ebaa..731a74c60dca 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -27,7 +27,6 @@ obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o obj-$(CONFIG_NF_DUP_IPV6) += nf_dup_ipv6.o # nf_tables -obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o @@ -47,7 +46,6 @@ obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o obj-$(CONFIG_IP6_NF_MATCH_SRH) += ip6t_srh.o # targets -obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o obj-$(CONFIG_IP6_NF_TARGET_NPT) += ip6t_NPT.o obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o obj-$(CONFIG_IP6_NF_TARGET_SYNPROXY) += ip6t_SYNPROXY.o diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c deleted file mode 100644 
index 29c7f1915a96..000000000000 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on Rusty Russell's IPv6 MASQUERADE target. Development of IPv6 - * NAT funded by Astaro. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/netdevice.h> -#include <linux/ipv6.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv6.h> -#include <linux/netfilter/x_tables.h> -#include <net/netfilter/nf_nat.h> -#include <net/addrconf.h> -#include <net/ipv6.h> -#include <net/netfilter/ipv6/nf_nat_masquerade.h> - -static unsigned int -masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) -{ - return nf_nat_masquerade_ipv6(skb, par->targinfo, xt_out(par)); -} - -static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) -{ - const struct nf_nat_range2 *range = par->targinfo; - - if (range->flags & NF_NAT_RANGE_MAP_IPS) - return -EINVAL; - return nf_ct_netns_get(par->net, par->family); -} - -static void masquerade_tg6_destroy(const struct xt_tgdtor_param *par) -{ - nf_ct_netns_put(par->net, par->family); -} - -static struct xt_target masquerade_tg6_reg __read_mostly = { - .name = "MASQUERADE", - .family = NFPROTO_IPV6, - .checkentry = masquerade_tg6_checkentry, - .destroy = masquerade_tg6_destroy, - .target = masquerade_tg6, - .targetsize = sizeof(struct nf_nat_range), - .table = "nat", - .hooks = 1 << NF_INET_POST_ROUTING, - .me = THIS_MODULE, -}; - -static int __init masquerade_tg6_init(void) -{ - int err; - - err = xt_register_target(&masquerade_tg6_reg); - if (err) - return err; - - err = nf_nat_masquerade_ipv6_register_notifier(); - if (err) - xt_unregister_target(&masquerade_tg6_reg); - - return err; -} -static void __exit 
masquerade_tg6_exit(void) -{ - nf_nat_masquerade_ipv6_unregister_notifier(); - xt_unregister_target(&masquerade_tg6_reg); -} - -module_init(masquerade_tg6_init); -module_exit(masquerade_tg6_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("Xtables: automatic address SNAT"); diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c deleted file mode 100644 index da3f1f8cb325..000000000000 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> - * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/list.h> -#include <linux/skbuff.h> -#include <linux/netlink.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv6.h> -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables_ipv6.h> -#include <net/route.h> - -static unsigned int nf_route_table_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - unsigned int ret; - struct nft_pktinfo pkt; - struct in6_addr saddr, daddr; - u_int8_t hop_limit; - u32 mark, flowlabel; - int err; - - nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv6(&pkt, skb); - - /* save source/dest address, mark, hoplimit, flowlabel, priority */ - memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); - memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr)); - mark = skb->mark; - hop_limit = ipv6_hdr(skb)->hop_limit; - - /* flowlabel and prio (includes version, which shouldn't change either 
*/ - flowlabel = *((u32 *)ipv6_hdr(skb)); - - ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_STOLEN && - (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || - memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || - skb->mark != mark || - ipv6_hdr(skb)->hop_limit != hop_limit || - flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { - err = ip6_route_me_harder(state->net, skb); - if (err < 0) - ret = NF_DROP_ERR(err); - } - - return ret; -} - -static const struct nft_chain_type nft_chain_route_ipv6 = { - .name = "route", - .type = NFT_CHAIN_T_ROUTE, - .family = NFPROTO_IPV6, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_INET_LOCAL_OUT), - .hooks = { - [NF_INET_LOCAL_OUT] = nf_route_table_hook, - }, -}; - -static int __init nft_chain_route_init(void) -{ - nft_register_chain_type(&nft_chain_route_ipv6); - - return 0; -} - -static void __exit nft_chain_route_exit(void) -{ - nft_unregister_chain_type(&nft_chain_route_ipv6); -} - -module_init(nft_chain_route_init); -module_exit(nft_chain_route_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_CHAIN(AF_INET6, "route"); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 5a426226c762..84dbe21b71e5 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1356,6 +1356,7 @@ const struct proto_ops inet6_sockraw_ops = { .getname = inet6_getname, .poll = datagram_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ + .gettstamp = sock_gettstamp, .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6e89151693d0..9c0127a44f9f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -59,7 +59,7 @@ #include <net/xfrm.h> #include <net/netevent.h> #include <net/netlink.h> -#include <net/nexthop.h> +#include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/ip_tunnels.h> #include <net/l3mdev.h> @@ -102,14 +102,15 @@ static void 
ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); -static int rt6_score_route(struct fib6_info *rt, int oif, int strict); +static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, + int strict); static size_t rt6_nlmsg_size(struct fib6_info *rt); static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); -static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, +static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, struct in6_addr *daddr, struct in6_addr *saddr); @@ -295,7 +296,7 @@ static const struct fib6_info fib6_null_entry_template = { .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), .fib6_protocol = RTPROT_KERNEL, .fib6_metric = ~(u32)0, - .fib6_ref = ATOMIC_INIT(1), + .fib6_ref = REFCOUNT_INIT(1), .fib6_type = RTN_UNREACHABLE, .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, }; @@ -427,13 +428,15 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -struct fib6_info *fib6_multipath_select(const struct net *net, - struct fib6_info *match, - struct flowi6 *fl6, int oif, - const struct sk_buff *skb, - int strict) +void fib6_select_path(const struct net *net, struct fib6_result *res, + struct flowi6 *fl6, int oif, bool have_oif_match, + const struct sk_buff *skb, int strict) { struct fib6_info *sibling, *next_sibling; + struct fib6_info *match = res->f6i; + + if (!match->fib6_nsiblings || have_oif_match) + goto out; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. 
@@ -442,60 +445,88 @@ struct fib6_info *fib6_multipath_select(const struct net *net, fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound)) - return match; + goto out; list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, fib6_siblings) { + const struct fib6_nh *nh = &sibling->fib6_nh; int nh_upper_bound; - nh_upper_bound = atomic_read(&sibling->fib6_nh.fib_nh_upper_bound); + nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); if (fl6->mp_hash > nh_upper_bound) continue; - if (rt6_score_route(sibling, oif, strict) < 0) + if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) break; match = sibling; break; } - return match; +out: + res->f6i = match; + res->nh = &match->fib6_nh; } /* * Route lookup. rcu_read_lock() should be held. */ -static inline struct fib6_info *rt6_device_match(struct net *net, - struct fib6_info *rt, - const struct in6_addr *saddr, - int oif, - int flags) +static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, + const struct in6_addr *saddr, int oif, int flags) { - struct fib6_info *sprt; + const struct net_device *dev; - if (!oif && ipv6_addr_any(saddr) && - !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)) - return rt; + if (nh->fib_nh_flags & RTNH_F_DEAD) + return false; - for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) { - const struct net_device *dev = sprt->fib6_nh.fib_nh_dev; + dev = nh->fib_nh_dev; + if (oif) { + if (dev->ifindex == oif) + return true; + } else { + if (ipv6_chk_addr(net, saddr, dev, + flags & RT6_LOOKUP_F_IFACE)) + return true; + } - if (sprt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) - continue; + return false; +} - if (oif) { - if (dev->ifindex == oif) - return sprt; - } else { - if (ipv6_chk_addr(net, saddr, dev, - flags & RT6_LOOKUP_F_IFACE)) - return sprt; +static void rt6_device_match(struct net *net, struct fib6_result *res, + const struct in6_addr *saddr, int oif, int flags) +{ + struct 
fib6_info *f6i = res->f6i; + struct fib6_info *spf6i; + struct fib6_nh *nh; + + if (!oif && ipv6_addr_any(saddr)) { + nh = &f6i->fib6_nh; + if (!(nh->fib_nh_flags & RTNH_F_DEAD)) + goto out; + } + + for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { + nh = &spf6i->fib6_nh; + if (__rt6_device_match(net, nh, saddr, oif, flags)) { + res->f6i = spf6i; + goto out; } } - if (oif && flags & RT6_LOOKUP_F_IFACE) - return net->ipv6.fib6_null_entry; + if (oif && flags & RT6_LOOKUP_F_IFACE) { + res->f6i = net->ipv6.fib6_null_entry; + nh = &res->f6i->fib6_nh; + goto out; + } - return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt; + nh = &f6i->fib6_nh; + if (nh->fib_nh_flags & RTNH_F_DEAD) { + res->f6i = net->ipv6.fib6_null_entry; + nh = &res->f6i->fib6_nh; + } +out: + res->nh = nh; + res->fib6_type = res->f6i->fib6_type; + res->fib6_flags = res->f6i->fib6_flags; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -517,7 +548,7 @@ static void rt6_probe_deferred(struct work_struct *w) kfree(work); } -static void rt6_probe(struct fib6_info *rt) +static void rt6_probe(struct fib6_nh *fib6_nh) { struct __rt6_probe_work *work = NULL; const struct in6_addr *nh_gw; @@ -533,11 +564,11 @@ static void rt6_probe(struct fib6_info *rt) * Router Reachability Probe MUST be rate-limited * to no more than one per minute. 
*/ - if (!rt || !rt->fib6_nh.fib_nh_has_gw) + if (fib6_nh->fib_nh_gw_family) return; - nh_gw = &rt->fib6_nh.fib_nh_gw6; - dev = rt->fib6_nh.fib_nh_dev; + nh_gw = &fib6_nh->fib_nh_gw6; + dev = fib6_nh->fib_nh_dev; rcu_read_lock_bh(); idev = __in6_dev_get(dev); neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); @@ -554,13 +585,13 @@ static void rt6_probe(struct fib6_info *rt) __neigh_set_probe_once(neigh); } write_unlock(&neigh->lock); - } else if (time_after(jiffies, rt->last_probe + + } else if (time_after(jiffies, fib6_nh->last_probe + idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } if (work) { - rt->last_probe = jiffies; + fib6_nh->last_probe = jiffies; INIT_WORK(&work->work, rt6_probe_deferred); work->target = *nh_gw; dev_hold(dev); @@ -572,7 +603,7 @@ out: rcu_read_unlock_bh(); } #else -static inline void rt6_probe(struct fib6_info *rt) +static inline void rt6_probe(struct fib6_nh *fib6_nh) { } #endif @@ -580,27 +611,14 @@ static inline void rt6_probe(struct fib6_info *rt) /* * Default Router Selection (RFC 2461 6.3.6) */ -static inline int rt6_check_dev(struct fib6_info *rt, int oif) -{ - const struct net_device *dev = rt->fib6_nh.fib_nh_dev; - - if (!oif || dev->ifindex == oif) - return 2; - return 0; -} - -static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) +static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh) { enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; struct neighbour *neigh; - if (rt->fib6_flags & RTF_NONEXTHOP || - !rt->fib6_nh.fib_nh_has_gw) - return RT6_NUD_SUCCEED; - rcu_read_lock_bh(); - neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.fib_nh_dev, - &rt->fib6_nh.fib_nh_gw6); + neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev, + &fib6_nh->fib_nh_gw6); if (neigh) { read_lock(&neigh->lock); if (neigh->nud_state & NUD_VALID) @@ -621,43 +639,44 @@ static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) return ret; } -static int rt6_score_route(struct fib6_info *rt, int 
oif, int strict) +static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, + int strict) { - int m; + int m = 0; + + if (!oif || nh->fib_nh_dev->ifindex == oif) + m = 2; - m = rt6_check_dev(rt, oif); if (!m && (strict & RT6_LOOKUP_F_IFACE)) return RT6_NUD_FAIL_HARD; #ifdef CONFIG_IPV6_ROUTER_PREF - m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; + m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2; #endif - if (strict & RT6_LOOKUP_F_REACHABLE) { - int n = rt6_check_neigh(rt); + if ((strict & RT6_LOOKUP_F_REACHABLE) && + !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) { + int n = rt6_check_neigh(nh); if (n < 0) return n; } return m; } -static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, - int *mpri, struct fib6_info *match, - bool *do_rr) +static bool find_match(struct fib6_nh *nh, u32 fib6_flags, + int oif, int strict, int *mpri, bool *do_rr) { - int m; bool match_do_rr = false; + bool rc = false; + int m; - if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (nh->fib_nh_flags & RTNH_F_DEAD) goto out; - if (ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev) && - rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && + if (ip6_ignore_linkdown(nh->fib_nh_dev) && + nh->fib_nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; - if (fib6_check_expired(rt)) - goto out; - - m = rt6_score_route(rt, oif, strict); + m = rt6_score_route(nh, fib6_flags, oif, strict); if (m == RT6_NUD_FAIL_DO_RR) { match_do_rr = true; m = 0; /* lowest valid score */ @@ -666,67 +685,82 @@ static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, } if (strict & RT6_LOOKUP_F_REACHABLE) - rt6_probe(rt); + rt6_probe(nh); /* note that m can be RT6_NUD_FAIL_PROBE at this point */ if (m > *mpri) { *do_rr = match_do_rr; *mpri = m; - match = rt; + rc = true; } out: - return match; + return rc; } -static struct fib6_info *find_rr_leaf(struct fib6_node *fn, - struct fib6_info *leaf, - struct 
fib6_info *rr_head, - u32 metric, int oif, int strict, - bool *do_rr) +static void __find_rr_leaf(struct fib6_info *f6i_start, + struct fib6_info *nomatch, u32 metric, + struct fib6_result *res, struct fib6_info **cont, + int oif, int strict, bool *do_rr, int *mpri) { - struct fib6_info *rt, *match, *cont; - int mpri = -1; + struct fib6_info *f6i; - match = NULL; - cont = NULL; - for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) { - if (rt->fib6_metric != metric) { - cont = rt; - break; + for (f6i = f6i_start; + f6i && f6i != nomatch; + f6i = rcu_dereference(f6i->fib6_next)) { + struct fib6_nh *nh; + + if (cont && f6i->fib6_metric != metric) { + *cont = f6i; + return; } - match = find_match(rt, oif, strict, &mpri, match, do_rr); - } + if (fib6_check_expired(f6i)) + continue; - for (rt = leaf; rt && rt != rr_head; - rt = rcu_dereference(rt->fib6_next)) { - if (rt->fib6_metric != metric) { - cont = rt; - break; + nh = &f6i->fib6_nh; + if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) { + res->f6i = f6i; + res->nh = nh; + res->fib6_flags = f6i->fib6_flags; + res->fib6_type = f6i->fib6_type; } - - match = find_match(rt, oif, strict, &mpri, match, do_rr); } +} + +static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf, + struct fib6_info *rr_head, int oif, int strict, + bool *do_rr, struct fib6_result *res) +{ + u32 metric = rr_head->fib6_metric; + struct fib6_info *cont = NULL; + int mpri = -1; - if (match || !cont) - return match; + __find_rr_leaf(rr_head, NULL, metric, res, &cont, + oif, strict, do_rr, &mpri); - for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next)) - match = find_match(rt, oif, strict, &mpri, match, do_rr); + __find_rr_leaf(leaf, rr_head, metric, res, &cont, + oif, strict, do_rr, &mpri); - return match; + if (res->f6i || !cont) + return; + + __find_rr_leaf(cont, NULL, metric, res, NULL, + oif, strict, do_rr, &mpri); } -static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, - int oif, int 
strict) +static void rt6_select(struct net *net, struct fib6_node *fn, int oif, + struct fib6_result *res, int strict) { struct fib6_info *leaf = rcu_dereference(fn->leaf); - struct fib6_info *match, *rt0; + struct fib6_info *rt0; bool do_rr = false; int key_plen; + /* make sure this function or its helpers sets f6i */ + res->f6i = NULL; + if (!leaf || leaf == net->ipv6.fib6_null_entry) - return net->ipv6.fib6_null_entry; + goto out; rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) @@ -743,11 +777,9 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, key_plen = rt0->fib6_src.plen; #endif if (fn->fn_bit != key_plen) - return net->ipv6.fib6_null_entry; - - match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, - &do_rr); + goto out; + find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res); if (do_rr) { struct fib6_info *next = rcu_dereference(rt0->fib6_next); @@ -764,12 +796,19 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, } } - return match ? 
match : net->ipv6.fib6_null_entry; +out: + if (!res->f6i) { + res->f6i = net->ipv6.fib6_null_entry; + res->nh = &res->f6i->fib6_nh; + res->fib6_flags = res->f6i->fib6_flags; + res->fib6_type = res->f6i->fib6_type; + } } -static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) +static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res) { - return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_has_gw; + return (res->f6i->fib6_flags & RTF_NONEXTHOP) || + res->nh->fib_nh_gw_family; } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -853,17 +892,17 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, */ /* called with rcu_lock held */ -static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) +static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res) { - struct net_device *dev = rt->fib6_nh.fib_nh_dev; + struct net_device *dev = res->nh->fib_nh_dev; - if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { + if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the * device if it is a master device, the master device if * device is enslaved, and the loopback as the default */ if (netif_is_l3_slave(dev) && - !rt6_need_strict(&rt->fib6_dst.addr)) + !rt6_need_strict(&res->f6i->fib6_dst.addr)) dev = l3mdev_master_dev_rcu(dev); else if (!netif_is_l3_master(dev)) dev = dev_net(dev)->loopback_dev; @@ -909,11 +948,11 @@ static unsigned short fib6_info_dst_flags(struct fib6_info *rt) return flags; } -static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) +static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type) { - rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); + rt->dst.error = ip6_rt_type_to_error(fib6_type); - switch (ort->fib6_type) { + switch (fib6_type) { case RTN_BLACKHOLE: rt->dst.output = dst_discard_out; rt->dst.input = dst_discard; @@ -931,26 +970,28 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) } } 
-static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) +static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res) { - if (ort->fib6_flags & RTF_REJECT) { - ip6_rt_init_dst_reject(rt, ort); + struct fib6_info *f6i = res->f6i; + + if (res->fib6_flags & RTF_REJECT) { + ip6_rt_init_dst_reject(rt, res->fib6_type); return; } rt->dst.error = 0; rt->dst.output = ip6_output; - if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) { + if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) { rt->dst.input = ip6_input; - } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { + } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; } else { rt->dst.input = ip6_forward; } - if (ort->fib6_nh.fib_nh_lws) { - rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.fib_nh_lws); + if (res->nh->fib_nh_lws) { + rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws); lwtunnel_set_redirect(&rt->dst); } @@ -965,23 +1006,25 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) ip_dst_init_metrics(&rt->dst, from->fib6_metrics); } -/* Caller must already hold reference to @ort */ -static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) +/* Caller must already hold reference to f6i in result */ +static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res) { - struct net_device *dev = fib6_info_nh_dev(ort); + const struct fib6_nh *nh = res->nh; + const struct net_device *dev = nh->fib_nh_dev; + struct fib6_info *f6i = res->f6i; - ip6_rt_init_dst(rt, ort); + ip6_rt_init_dst(rt, res); - rt->rt6i_dst = ort->fib6_dst; + rt->rt6i_dst = f6i->fib6_dst; rt->rt6i_idev = dev ? 
in6_dev_get(dev) : NULL; - rt->rt6i_flags = ort->fib6_flags; - if (ort->fib6_nh.fib_nh_has_gw) { - rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6; + rt->rt6i_flags = res->fib6_flags; + if (nh->fib_nh_gw_family) { + rt->rt6i_gateway = nh->fib_nh_gw6; rt->rt6i_flags |= RTF_GATEWAY; } - rt6_set_from(rt, ort); + rt6_set_from(rt, f6i); #ifdef CONFIG_IPV6_SUBTREES - rt->rt6i_src = ort->fib6_src; + rt->rt6i_src = f6i->fib6_src; #endif } @@ -1020,22 +1063,24 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt) } /* called with rcu_lock held */ -static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) +static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res) { - unsigned short flags = fib6_info_dst_flags(rt); - struct net_device *dev = rt->fib6_nh.fib_nh_dev; + struct net_device *dev = res->nh->fib_nh_dev; + struct fib6_info *f6i = res->f6i; + unsigned short flags; struct rt6_info *nrt; - if (!fib6_info_hold_safe(rt)) + if (!fib6_info_hold_safe(f6i)) goto fallback; + flags = fib6_info_dst_flags(f6i); nrt = ip6_dst_alloc(dev_net(dev), dev, flags); if (!nrt) { - fib6_info_release(rt); + fib6_info_release(f6i); goto fallback; } - ip6_rt_copy_init(nrt, rt); + ip6_rt_copy_init(nrt, res); return nrt; fallback: @@ -1050,7 +1095,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, const struct sk_buff *skb, int flags) { - struct fib6_info *f6i; + struct fib6_result res = {}; struct fib6_node *fn; struct rt6_info *rt; @@ -1060,37 +1105,38 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, rcu_read_lock(); fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - f6i = rcu_dereference(fn->leaf); - if (!f6i) { - f6i = net->ipv6.fib6_null_entry; - } else { - f6i = rt6_device_match(net, f6i, &fl6->saddr, - fl6->flowi6_oif, flags); - if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) - f6i = fib6_multipath_select(net, f6i, fl6, - fl6->flowi6_oif, skb, - flags); - } - if (f6i == net->ipv6.fib6_null_entry) { + 
res.f6i = rcu_dereference(fn->leaf); + if (!res.f6i) + res.f6i = net->ipv6.fib6_null_entry; + else + rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif, + flags); + + if (res.f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; + + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + goto out; } - trace_fib6_table_lookup(net, f6i, table, fl6); + fib6_select_path(net, &res, fl6, fl6->flowi6_oif, + fl6->flowi6_oif != 0, skb, flags); /* Search through exception table */ - rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); - } else if (f6i == net->ipv6.fib6_null_entry) { - rt = net->ipv6.ip6_null_entry; - dst_hold(&rt->dst); } else { - rt = ip6_create_rt_rcu(f6i); + rt = ip6_create_rt_rcu(&res); } +out: + trace_fib6_table_lookup(net, &res, table, fl6); + rcu_read_unlock(); return rt; @@ -1156,10 +1202,11 @@ int ip6_ins_rt(struct net *net, struct fib6_info *rt) return __ip6_ins_rt(rt, &info, NULL); } -static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, +static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { + struct fib6_info *f6i = res->f6i; struct net_device *dev; struct rt6_info *rt; @@ -1167,25 +1214,25 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, * Clone the route. 
*/ - if (!fib6_info_hold_safe(ort)) + if (!fib6_info_hold_safe(f6i)) return NULL; - dev = ip6_rt_get_dev_rcu(ort); + dev = ip6_rt_get_dev_rcu(res); rt = ip6_dst_alloc(dev_net(dev), dev, 0); if (!rt) { - fib6_info_release(ort); + fib6_info_release(f6i); return NULL; } - ip6_rt_copy_init(rt, ort); + ip6_rt_copy_init(rt, res); rt->rt6i_flags |= RTF_CACHE; rt->dst.flags |= DST_HOST; rt->rt6i_dst.addr = *daddr; rt->rt6i_dst.plen = 128; - if (!rt6_is_gw_or_nonexthop(ort)) { - if (ort->fib6_dst.plen != 128 && - ipv6_addr_equal(&ort->fib6_dst.addr, daddr)) + if (!rt6_is_gw_or_nonexthop(res)) { + if (f6i->fib6_dst.plen != 128 && + ipv6_addr_equal(&f6i->fib6_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { @@ -1198,34 +1245,35 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, return rt; } -static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) +static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) { - unsigned short flags = fib6_info_dst_flags(rt); + struct fib6_info *f6i = res->f6i; + unsigned short flags = fib6_info_dst_flags(f6i); struct net_device *dev; struct rt6_info *pcpu_rt; - if (!fib6_info_hold_safe(rt)) + if (!fib6_info_hold_safe(f6i)) return NULL; rcu_read_lock(); - dev = ip6_rt_get_dev_rcu(rt); + dev = ip6_rt_get_dev_rcu(res); pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); rcu_read_unlock(); if (!pcpu_rt) { - fib6_info_release(rt); + fib6_info_release(f6i); return NULL; } - ip6_rt_copy_init(pcpu_rt, rt); + ip6_rt_copy_init(pcpu_rt, res); pcpu_rt->rt6i_flags |= RTF_PCPU; return pcpu_rt; } /* It should be called with rcu_read_lock() acquired */ -static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) +static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { struct rt6_info *pcpu_rt, **p; - p = this_cpu_ptr(rt->rt6i_pcpu); + p = this_cpu_ptr(res->f6i->rt6i_pcpu); pcpu_rt = *p; if (pcpu_rt) @@ -1235,18 +1283,18 @@ static struct 
rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) } static struct rt6_info *rt6_make_pcpu_route(struct net *net, - struct fib6_info *rt) + const struct fib6_result *res) { struct rt6_info *pcpu_rt, *prev, **p; - pcpu_rt = ip6_rt_pcpu_alloc(rt); + pcpu_rt = ip6_rt_pcpu_alloc(res); if (!pcpu_rt) { dst_hold(&net->ipv6.ip6_null_entry->dst); return net->ipv6.ip6_null_entry; } dst_hold(&pcpu_rt->dst); - p = this_cpu_ptr(rt->rt6i_pcpu); + p = this_cpu_ptr(res->f6i->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); @@ -1389,14 +1437,15 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, return NULL; } -static unsigned int fib6_mtu(const struct fib6_info *rt) +static unsigned int fib6_mtu(const struct fib6_result *res) { + const struct fib6_nh *nh = res->nh; unsigned int mtu; - if (rt->fib6_pmtu) { - mtu = rt->fib6_pmtu; + if (res->f6i->fib6_pmtu) { + mtu = res->f6i->fib6_pmtu; } else { - struct net_device *dev = fib6_info_nh_dev(rt); + struct net_device *dev = nh->fib_nh_dev; struct inet6_dev *idev; rcu_read_lock(); @@ -1407,26 +1456,27 @@ static unsigned int fib6_mtu(const struct fib6_info *rt) mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); - return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu); + return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } static int rt6_insert_exception(struct rt6_info *nrt, - struct fib6_info *ort) + const struct fib6_result *res) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; + struct fib6_info *f6i = res->f6i; int err = 0; spin_lock_bh(&rt6_exception_lock); - if (ort->exception_bucket_flushed) { + if (f6i->exception_bucket_flushed) { err = -EINVAL; goto out; } - bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, + bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, lockdep_is_held(&rt6_exception_lock)); if (!bucket) { bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), @@ -1435,24 
+1485,24 @@ static int rt6_insert_exception(struct rt6_info *nrt, err = -ENOMEM; goto out; } - rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); + rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket); } #ifdef CONFIG_IPV6_SUBTREES - /* rt6i_src.plen != 0 indicates ort is in subtree + /* fib6_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of - * both rt6i_dst and rt6i_src. + * both fib6_dst and fib6_src. * Otherwise, the exception table is indexed by - * a hash of only rt6i_dst. + * a hash of only fib6_dst. */ - if (ort->fib6_src.plen) + if (f6i->fib6_src.plen) src_key = &nrt->rt6i_src.addr; #endif - /* rt6_mtu_change() might lower mtu on ort. + /* rt6_mtu_change() might lower mtu on f6i. * Only insert this exception route if its mtu - * is less than ort's mtu value. + * is less than f6i's mtu value. */ - if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) { + if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) { err = -EINVAL; goto out; } @@ -1481,9 +1531,9 @@ out: /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { - spin_lock_bh(&ort->fib6_table->tb6_lock); - fib6_update_sernum(net, ort); - spin_unlock_bh(&ort->fib6_table->tb6_lock); + spin_lock_bh(&f6i->fib6_table->tb6_lock); + fib6_update_sernum(net, f6i); + spin_unlock_bh(&f6i->fib6_table->tb6_lock); fib6_force_start_gc(net); } @@ -1520,33 +1570,33 @@ out: /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ -static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, +static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, struct in6_addr *daddr, struct in6_addr *saddr) { struct rt6_exception_bucket *bucket; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct rt6_info *res = NULL; + struct rt6_info *ret = NULL; - bucket = rcu_dereference(rt->rt6i_exception_bucket); + bucket = rcu_dereference(res->f6i->rt6i_exception_bucket); #ifdef CONFIG_IPV6_SUBTREES 
- /* rt6i_src.plen != 0 indicates rt is in subtree + /* fib6i_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of - * both rt6i_dst and rt6i_src. + * both fib6_dst and fib6_src. * Otherwise, the exception table is indexed by - * a hash of only rt6i_dst. + * a hash of only fib6_dst. */ - if (rt->fib6_src.plen) + if (res->f6i->fib6_src.plen) src_key = saddr; #endif rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) - res = rt6_ex->rt6i; + ret = rt6_ex->rt6i; - return res; + return ret; } /* Remove the passed in cached rt from the hash table that contains it */ @@ -1794,11 +1844,10 @@ void rt6_age_exceptions(struct fib6_info *rt, } /* must be called with rcu lock held */ -struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, int strict) +int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, + struct flowi6 *fl6, struct fib6_result *res, int strict) { struct fib6_node *fn, *saved_fn; - struct fib6_info *f6i; fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; @@ -1807,8 +1856,8 @@ struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, oif = 0; redo_rt6_select: - f6i = rt6_select(net, fn, oif, strict); - if (f6i == net->ipv6.fib6_null_entry) { + rt6_select(net, fn, oif, res, strict); + if (res->f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto redo_rt6_select; @@ -1820,16 +1869,16 @@ redo_rt6_select: } } - trace_fib6_table_lookup(net, f6i, table, fl6); + trace_fib6_table_lookup(net, res, table, fl6); - return f6i; + return 0; } struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { - struct fib6_info *f6i; + struct fib6_result res = {}; struct rt6_info *rt; int strict = 0; @@ -1840,19 +1889,18 @@ struct rt6_info 
*ip6_pol_route(struct net *net, struct fib6_table *table, rcu_read_lock(); - f6i = fib6_table_lookup(net, table, oif, fl6, strict); - if (f6i->fib6_nsiblings) - f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); - - if (f6i == net->ipv6.fib6_null_entry) { + fib6_table_lookup(net, table, oif, fl6, &res, strict); + if (res.f6i == net->ipv6.fib6_null_entry) { rt = net->ipv6.ip6_null_entry; rcu_read_unlock(); dst_hold(&rt->dst); return rt; } + fib6_select_path(net, &res, fl6, oif, false, skb, strict); + /*Search through exception table */ - rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); @@ -1860,7 +1908,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, rcu_read_unlock(); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && - !f6i->fib6_nh.fib_nh_has_gw)) { + !res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. 
It is for the special case where * the daddr in the skb during the neighbor look-up is different @@ -1868,7 +1916,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, */ struct rt6_info *uncached_rt; - uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); + uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); rcu_read_unlock(); @@ -1890,10 +1938,10 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, struct rt6_info *pcpu_rt; local_bh_disable(); - pcpu_rt = rt6_get_pcpu_route(f6i); + pcpu_rt = rt6_get_pcpu_route(&res); if (!pcpu_rt) - pcpu_rt = rt6_make_pcpu_route(net, f6i); + pcpu_rt = rt6_make_pcpu_route(net, &res); local_bh_enable(); rcu_read_unlock(); @@ -2312,15 +2360,23 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (rt6->rt6i_flags & RTF_CACHE) rt6_update_exception_stamp_rt(rt6); } else if (daddr) { - struct fib6_info *from; + struct fib6_result res = {}; struct rt6_info *nrt6; rcu_read_lock(); - from = rcu_dereference(rt6->from); - nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); + res.f6i = rcu_dereference(rt6->from); + if (!res.f6i) { + rcu_read_unlock(); + return; + } + res.nh = &res.f6i->fib6_nh; + res.fib6_flags = res.f6i->fib6_flags; + res.fib6_type = res.f6i->fib6_type; + + nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - if (rt6_insert_exception(nrt6, from)) + if (rt6_insert_exception(nrt6, &res)) dst_release_immediate(&nrt6->dst); } rcu_read_unlock(); @@ -2393,6 +2449,36 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, NULL); } +static bool ip6_redirect_nh_match(const struct fib6_result *res, + struct flowi6 *fl6, + const struct in6_addr *gw, + struct rt6_info **ret) +{ + const struct fib6_nh *nh = res->nh; + + if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || + fl6->flowi6_oif != nh->fib_nh_dev->ifindex) + return false; + + /* rt_cache's gateway might be different from its 
'parent' + * in the case of an ip redirect. + * So we keep searching in the exception table if the gateway + * is different. + */ + if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { + struct rt6_info *rt_cache; + + rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); + if (rt_cache && + ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { + *ret = rt_cache; + return true; + } + return false; + } + return true; +} + /* Handle redirects */ struct ip6rd_flowi { struct flowi6 fl6; @@ -2406,7 +2492,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; - struct rt6_info *ret = NULL, *rt_cache; + struct rt6_info *ret = NULL; + struct fib6_result res = {}; struct fib6_info *rt; struct fib6_node *fn; @@ -2424,34 +2511,15 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) - continue; + res.f6i = rt; + res.nh = &rt->fib6_nh; + if (fib6_check_expired(rt)) continue; if (rt->fib6_flags & RTF_REJECT) break; - if (!rt->fib6_nh.fib_nh_has_gw) - continue; - if (fl6->flowi6_oif != rt->fib6_nh.fib_nh_dev->ifindex) - continue; - /* rt_cache's gateway might be different from its 'parent' - * in the case of an ip redirect. - * So we keep searching in the exception table if the gateway - * is different. 
- */ - if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.fib_nh_gw6)) { - rt_cache = rt6_find_cached_rt(rt, - &fl6->daddr, - &fl6->saddr); - if (rt_cache && - ipv6_addr_equal(&rdfl->gateway, - &rt_cache->rt6i_gateway)) { - ret = rt_cache; - break; - } - continue; - } - break; + if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) + goto out; } if (!rt) @@ -2467,15 +2535,20 @@ restart: goto restart; } + res.f6i = rt; + res.nh = &rt->fib6_nh; out: - if (ret) + if (ret) { ip6_hold_safe(net, &ret); - else - ret = ip6_create_rt_rcu(rt); + } else { + res.fib6_flags = res.f6i->fib6_flags; + res.fib6_type = res.f6i->fib6_type; + ret = ip6_create_rt_rcu(&res); + } rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table, fl6); + trace_fib6_table_lookup(net, &res, table, fl6); return ret; }; @@ -2593,12 +2666,15 @@ out: * based on ip6_dst_mtu_forward and exception logic of * rt6_find_cached_rt; called with rcu_read_lock */ -u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, - struct in6_addr *saddr) +u32 ip6_mtu_from_fib6(const struct fib6_result *res, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct rt6_exception_bucket *bucket; + const struct fib6_nh *nh = res->nh; + struct fib6_info *f6i = res->f6i; + const struct in6_addr *src_key; struct rt6_exception *rt6_ex; - struct in6_addr *src_key; struct inet6_dev *idev; u32 mtu = 0; @@ -2620,7 +2696,7 @@ u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); if (likely(!mtu)) { - struct net_device *dev = fib6_info_nh_dev(f6i); + struct net_device *dev = nh->fib_nh_dev; mtu = IPV6_MIN_MTU; idev = __in6_dev_get(dev); @@ -2630,7 +2706,7 @@ u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); out: - return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); + return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } struct dst_entry *icmp6_dst_alloc(struct 
net_device *dev, @@ -2964,7 +3040,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, goto out; fib6_nh->fib_nh_gw6 = cfg->fc_gateway; - fib6_nh->fib_nh_has_gw = 1; + fib6_nh->fib_nh_gw_family = AF_INET6; } err = -ENODEV; @@ -3282,9 +3358,13 @@ static int ip6_route_del(struct fib6_config *cfg, struct fib6_nh *nh; if (cfg->fc_flags & RTF_CACHE) { + struct fib6_result res = { + .f6i = rt, + }; int rc; - rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, + rt_cache = rt6_find_cached_rt(&res, + &cfg->fc_dst, &cfg->fc_src); if (rt_cache) { rc = ip6_del_cached_rt(rt_cache, cfg); @@ -3328,10 +3408,10 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu { struct netevent_redirect netevent; struct rt6_info *rt, *nrt = NULL; + struct fib6_result res = {}; struct ndisc_options ndopts; struct inet6_dev *in6_dev; struct neighbour *neigh; - struct fib6_info *from; struct rd_msg *msg; int optlen, on_link; u8 *lladdr; @@ -3414,14 +3494,17 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu NDISC_REDIRECT, &ndopts); rcu_read_lock(); - from = rcu_dereference(rt->from); + res.f6i = rcu_dereference(rt->from); /* This fib6_info_hold() is safe here because we hold reference to rt * and rt already holds reference to fib6_info. 
*/ - fib6_info_hold(from); + fib6_info_hold(res.f6i); rcu_read_unlock(); - nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); + res.nh = &res.f6i->fib6_nh; + res.fib6_flags = res.f6i->fib6_flags; + res.fib6_type = res.f6i->fib6_type; + nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); if (!nrt) goto out; @@ -3435,7 +3518,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * a cached route because rt6_insert_exception() will * takes care of it */ - if (rt6_insert_exception(nrt, from)) { + if (rt6_insert_exception(nrt, &res)) { dst_release_immediate(&nrt->dst); goto out; } @@ -3447,7 +3530,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); out: - fib6_info_release(from); + fib6_info_release(res.f6i); neigh_release(neigh); } @@ -3476,7 +3559,7 @@ static struct fib6_info *rt6_get_route_info(struct net *net, if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) continue; if (!(rt->fib6_flags & RTF_ROUTEINFO) || - !rt->fib6_nh.fib_nh_has_gw) + !rt->fib6_nh.fib_nh_gw_family) continue; if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) continue; @@ -3807,7 +3890,7 @@ static int fib6_clean_tohost(struct fib6_info *rt, void *arg) struct in6_addr *gateway = (struct in6_addr *)arg; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - rt->fib6_nh.fib_nh_has_gw && + rt->fib6_nh.fib_nh_gw_family && ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { return -1; } @@ -3829,7 +3912,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) struct arg_netdev_event { const struct net_device *dev; union { - unsigned int nh_flags; + unsigned char nh_flags; unsigned long event; }; }; @@ -3942,7 +4025,7 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg) return 0; } -void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) +void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) { struct arg_netdev_event arg = { .dev = dev, @@ 
-3999,7 +4082,7 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, static void rt6_multipath_nh_flags_set(struct fib6_info *rt, const struct net_device *dev, - unsigned int nh_flags) + unsigned char nh_flags) { struct fib6_info *iter; @@ -4711,9 +4794,13 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, nla_nest_end(skb, mp); } else { + unsigned char nh_flags = 0; + if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, - &rtm->rtm_flags, false) < 0) + &nh_flags, false) < 0) goto nla_put_failure; + + rtm->rtm_flags |= nh_flags; } if (rt6_flags & RTF_EXPIRES) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b444483cdb2b..2464fba569b4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -285,8 +285,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; unsigned int ulen, copied; - int peeked, peeking, off; - int err; + int off, err, peeking = flags & MSG_PEEK; int is_udplite = IS_UDPLITE(sk); struct udp_mib __percpu *mib; bool checksum_valid = false; @@ -299,9 +298,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: - peeking = flags & MSG_PEEK; off = sk_peek_offset(sk, flags); - skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); + skb = __skb_recv_udp(sk, flags, noblock, &off, &err); if (!skb) return err; @@ -340,14 +338,14 @@ try_again: goto csum_copy_err; } if (unlikely(err)) { - if (!peeked) { + if (!peeking) { atomic_inc(&sk->sk_drops); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); return err; } - if (!peeked) + if (!peeking) SNMP_INC_STATS(mib, UDP_MIB_INDATAGRAMS); sock_recv_ts_and_drops(msg, sk, skb); @@ -1047,6 +1045,8 @@ static void udp_v6_flush_pending_frames(struct sock *sk) static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { + if (addr_len < offsetofend(struct sockaddr, sa_family)) + return -EINVAL; /* The 
following checks are replicated from __ip6_datagram_connect() * and intended to prevent BPF program called below from accessing * bytes that are out of the bound specified by user in addr_len. diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index d4c60523c549..2cac910c1cd4 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -618,6 +618,7 @@ static const struct proto_ops l2tp_ip_ops = { .getname = l2tp_ip_getname, .poll = datagram_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 37a69df17cab..4ec546cc1dd6 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -752,6 +752,7 @@ static const struct proto_ops l2tp_ip6_ops = { .getname = l2tp_ip6_getname, .poll = datagram_poll, .ioctl = inet6_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 04d9946dcdba..f36cae785e82 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1070,7 +1070,6 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, { struct pppol2tp_ioc_stats stats; struct l2tp_session *session; - int val; switch (cmd) { case PPPIOCGMRU: @@ -1097,7 +1096,7 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, if (!session->session_id && !session->peer_session_id) return -ENOSYS; - if (get_user(val, (int __user *)arg)) + if (!access_ok((int __user *)arg, sizeof(int))) return -EFAULT; break; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index b99e73a7e7e0..2017b7d780f5 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -320,14 +320,13 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) struct llc_sap *sap; int rc = -EINVAL; - dprintk("%s: binding %02X\n", __func__, addr->sllc_sap); - lock_sock(sk); if (unlikely(!sock_flag(sk, 
SOCK_ZAPPED) || addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; if (unlikely(addr->sllc_family != AF_LLC)) goto out; + dprintk("%s: binding %02X\n", __func__, addr->sllc_sap); rc = -ENODEV; rcu_read_lock(); if (sk->sk_bound_dev_if) { diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 62edfa6a73ed..c2d8b5451a5e 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1200,6 +1200,9 @@ static inline void drv_wake_tx_queue(struct ieee80211_local *local, { struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->txq.vif); + if (local->in_reconfig) + return; + if (!check_sdata_in_driver(sdata)) return; diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 42d52cded4c1..20bf9db7a388 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -173,8 +173,10 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) * The driver doesn't know anything about VLAN interfaces. * Hence, don't send GTKs for VLAN interfaces to the driver. 
*/ - if (!(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE)) + if (!(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE)) { + ret = 1; goto out_unsupported; + } } ret = drv_set_key(key->local, SET_KEY, sdata, @@ -219,11 +221,8 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) /* all of these we can do in software - if driver can */ if (ret == 1) return 0; - if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) { - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) - return 0; + if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) return -EINVAL; - } return 0; default: return -EINVAL; diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index a805d2acf0f7..796b069ad251 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -23,7 +23,7 @@ static void mesh_path_free_rcu(struct mesh_table *tbl, struct mesh_path *mpath); static u32 mesh_table_hash(const void *addr, u32 len, u32 seed) { /* Use last four bytes of hw addr as hash index */ - return jhash_1word(*(u32 *)(addr+2), seed); + return jhash_1word(__get_unaligned_cpu32((u8 *)addr + 2), seed); } static const struct rhashtable_params mesh_rht_params = { diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 4a03c18b39a8..25577ede2986 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1588,7 +1588,15 @@ static void sta_ps_start(struct sta_info *sta) return; for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { - if (txq_has_queue(sta->sta.txq[tid])) + struct ieee80211_txq *txq = sta->sta.txq[tid]; + struct txq_info *txqi = to_txq_info(txq); + + spin_lock(&local->active_txq_lock[txq->ac]); + if (!list_empty(&txqi->schedule_order)) + list_del_init(&txqi->schedule_order); + spin_unlock(&local->active_txq_lock[txq->ac]); + + if (txq_has_queue(txq)) set_bit(tid, &sta->txq_buffered_tids); else clear_bit(tid, &sta->txq_buffered_tids); diff --git a/net/mac80211/trace_msg.h b/net/mac80211/trace_msg.h index 366b9e6f043e..40141df09f25 100644 --- 
a/net/mac80211/trace_msg.h +++ b/net/mac80211/trace_msg.h @@ -1,4 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Portions of this file + * Copyright (C) 2019 Intel Corporation + */ + #ifdef CONFIG_MAC80211_MESSAGE_TRACING #if !defined(__MAC80211_MSG_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) @@ -11,7 +16,7 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM mac80211_msg -#define MAX_MSG_LEN 100 +#define MAX_MSG_LEN 120 DECLARE_EVENT_CLASS(mac80211_msg_event, TP_PROTO(struct va_format *vaf), diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8037384fc06e..dd220b977025 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3228,6 +3228,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, u8 max_subframes = sta->sta.max_amsdu_subframes; int max_frags = local->hw.max_tx_fragments; int max_amsdu_len = sta->sta.max_amsdu_len; + int orig_truesize; u32 flow_idx; __be16 len; void *data; @@ -3272,6 +3273,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, if (!head || skb_is_gso(head)) goto out; + orig_truesize = head->truesize; orig_len = head->len; if (skb->len + head->len > max_amsdu_len) @@ -3329,6 +3331,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, *frag_tail = skb; out_recalc: + fq->memory_usage += head->truesize - orig_truesize; if (head->len != orig_len) { flow->backlog += head->len - orig_len; tin->backlog_bytes += head->len - orig_len; @@ -3669,16 +3672,17 @@ EXPORT_SYMBOL(ieee80211_tx_dequeue); struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) { struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_txq *ret = NULL; struct txq_info *txqi = NULL; - lockdep_assert_held(&local->active_txq_lock[ac]); + spin_lock_bh(&local->active_txq_lock[ac]); begin: txqi = list_first_entry_or_null(&local->active_txqs[ac], struct txq_info, schedule_order); if (!txqi) - return NULL; + goto out; if (txqi->txq.sta) { struct sta_info *sta = 
container_of(txqi->txq.sta, @@ -3695,24 +3699,30 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) if (txqi->schedule_round == local->schedule_round[ac]) - return NULL; + goto out; list_del_init(&txqi->schedule_order); txqi->schedule_round = local->schedule_round[ac]; - return &txqi->txq; + ret = &txqi->txq; + +out: + spin_unlock_bh(&local->active_txq_lock[ac]); + return ret; } EXPORT_SYMBOL(ieee80211_next_txq); -void ieee80211_return_txq(struct ieee80211_hw *hw, - struct ieee80211_txq *txq) +void __ieee80211_schedule_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq, + bool force) { struct ieee80211_local *local = hw_to_local(hw); struct txq_info *txqi = to_txq_info(txq); - lockdep_assert_held(&local->active_txq_lock[txq->ac]); + spin_lock_bh(&local->active_txq_lock[txq->ac]); if (list_empty(&txqi->schedule_order) && - (!skb_queue_empty(&txqi->frags) || txqi->tin.backlog_packets)) { + (force || !skb_queue_empty(&txqi->frags) || + txqi->tin.backlog_packets)) { /* If airtime accounting is active, always enqueue STAs at the * head of the list to ensure that they only get moved to the * back by the airtime DRR scheduler once they have a negative @@ -3729,20 +3739,10 @@ void ieee80211_return_txq(struct ieee80211_hw *hw, list_add_tail(&txqi->schedule_order, &local->active_txqs[txq->ac]); } -} -EXPORT_SYMBOL(ieee80211_return_txq); -void ieee80211_schedule_txq(struct ieee80211_hw *hw, - struct ieee80211_txq *txq) - __acquires(txq_lock) __releases(txq_lock) -{ - struct ieee80211_local *local = hw_to_local(hw); - - spin_lock_bh(&local->active_txq_lock[txq->ac]); - ieee80211_return_txq(hw, txq); spin_unlock_bh(&local->active_txq_lock[txq->ac]); } -EXPORT_SYMBOL(ieee80211_schedule_txq); +EXPORT_SYMBOL(__ieee80211_schedule_txq); bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq) @@ -3752,7 +3752,7 @@ bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct sta_info *sta; u8 ac = txq->ac; - 
lockdep_assert_held(&local->active_txq_lock[ac]); + spin_lock_bh(&local->active_txq_lock[ac]); if (!txqi->txq.sta) goto out; @@ -3782,34 +3782,27 @@ bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, sta->airtime[ac].deficit += sta->airtime_weight; list_move_tail(&txqi->schedule_order, &local->active_txqs[ac]); + spin_unlock_bh(&local->active_txq_lock[ac]); return false; out: if (!list_empty(&txqi->schedule_order)) list_del_init(&txqi->schedule_order); + spin_unlock_bh(&local->active_txq_lock[ac]); return true; } EXPORT_SYMBOL(ieee80211_txq_may_transmit); void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac) - __acquires(txq_lock) { struct ieee80211_local *local = hw_to_local(hw); spin_lock_bh(&local->active_txq_lock[ac]); local->schedule_round[ac]++; -} -EXPORT_SYMBOL(ieee80211_txq_schedule_start); - -void ieee80211_txq_schedule_end(struct ieee80211_hw *hw, u8 ac) - __releases(txq_lock) -{ - struct ieee80211_local *local = hw_to_local(hw); - spin_unlock_bh(&local->active_txq_lock[ac]); } -EXPORT_SYMBOL(ieee80211_txq_schedule_end); +EXPORT_SYMBOL(ieee80211_txq_schedule_start); void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 8120e04f15e4..e321a5fafb87 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -23,7 +23,7 @@ #include <net/ipv6.h> #endif #include <net/ipv6_stubs.h> -#include <net/nexthop.h> +#include <net/rtnh.h> #include "internal.h" /* max memory we will use for mpls_route */ diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index f3a8557494d6..2619c2fbea93 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -137,10 +137,14 @@ static int mpls_xmit(struct sk_buff *skb) mpls_stats_inc_outucastpkts(out_dev, skb); - if (rt) - err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, - skb); - else if (rt6) { + if (rt) { + if (rt->rt_gw_family == AF_INET) + err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, 
&rt->rt_gw4, + skb); + else if (rt->rt_gw_family == AF_INET6) + err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt->rt_gw6, + skb); + } else if (rt6) { if (ipv6_addr_v4mapped(&rt6->rt6i_gateway)) { /* 6PE (RFC 4798) */ err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt6->rt6i_gateway.s6_addr32[3], diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index dc07fcc7938e..802db01e3075 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/netdevice.h> +#include <linux/etherdevice.h> #include <linux/skbuff.h> #include <net/ncsi.h> @@ -667,7 +668,10 @@ static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr) ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; memcpy(saddr.sa_data, &rsp->data[BCM_MAC_ADDR_OFFSET], ETH_ALEN); /* Increase mac address by 1 for BMC's address */ - saddr.sa_data[ETH_ALEN - 1]++; + eth_addr_inc((u8 *)saddr.sa_data); + if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) + return -ENXIO; + ret = ops->ndo_set_mac_address(ndev, &saddr); if (ret < 0) netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 6548271209a0..02b281d3c167 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -404,11 +404,6 @@ config NF_NAT forms of full Network Address Port Translation. This can be controlled by iptables, ip6tables or nft. -config NF_NAT_NEEDED - bool - depends on NF_NAT - default y - config NF_NAT_AMANDA tristate depends on NF_CONNTRACK && NF_NAT @@ -1002,6 +997,20 @@ config NETFILTER_XT_TARGET_REDIRECT To compile it as a module, choose M here. If unsure, say N. 
+config NETFILTER_XT_TARGET_MASQUERADE + tristate "MASQUERADE target support" + depends on NF_NAT + default m if NETFILTER_ADVANCED=n + select NF_NAT_MASQUERADE + help + Masquerading is a special case of NAT: all outgoing connections are + changed to seem to come from a particular interface's address, and + if the interface goes down, those connections are lost. This is + only useful for dialup accounts with dynamic IP address (ie. your IP + address will be different on next dialup). + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_TEE tristate '"TEE" - packet cloning to alternate destination' depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 4894a85cdd0b..72cca6b48960 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -77,7 +77,8 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ - nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o + nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \ + nft_chain_route.o nf_tables_set-objs := nf_tables_set_core.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o @@ -147,6 +148,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o obj-$(CONFIG_NETFILTER_XT_TARGET_REDIRECT) += xt_REDIRECT.o +obj-$(CONFIG_NETFILTER_XT_TARGET_MASQUERADE) += xt_MASQUERADE.o obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 93aaec3a54ec..71f06900473e 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -23,6 +23,7 @@ #include <linux/mm.h> #include 
<linux/rcupdate.h> #include <net/net_namespace.h> +#include <net/netfilter/nf_queue.h> #include <net/sock.h> #include "nf_internals.h" diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 43bbaa32b1d6..14457551bcb4 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1678,7 +1678,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, if (!cp) { int v; - if (!sysctl_schedule_icmp(ipvs)) + if (ipip || !sysctl_schedule_icmp(ipvs)) return NF_ACCEPT; if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 4b933669fd83..ab119a7540db 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -831,6 +831,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; conn_flags |= IP_VS_CONN_F_INACTIVE; + /* set the tunnel info */ + dest->tun_type = udest->tun_type; + dest->tun_port = udest->tun_port; + /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; @@ -987,6 +991,13 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } + if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if (udest->tun_port == 0) { + pr_err("%s(): tunnel port is zero\n", __func__); + return -EINVAL; + } + } + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ @@ -1051,6 +1062,13 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } + if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if (udest->tun_port == 0) { + pr_err("%s(): tunnel port is zero\n", __func__); + return -EINVAL; + } + } + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that 
requires RCU lock */ @@ -2333,6 +2351,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, udest->u_threshold = udest_compat->u_threshold; udest->l_threshold = udest_compat->l_threshold; udest->af = AF_INET; + udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP; } static int @@ -2890,6 +2909,8 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, + [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, @@ -3193,6 +3214,10 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) IP_VS_CONN_F_FWD_MASK)) || nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)) || + nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE, + dest->tun_type) || + nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, + dest->tun_port) || nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, @@ -3315,12 +3340,14 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, - *nla_l_thresh; + *nla_l_thresh, *nla_tun_type, *nla_tun_port; nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; + nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; + nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) return -EINVAL; @@ -3330,6 +3357,12 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, udest->weight = 
nla_get_u32(nla_weight); udest->u_threshold = nla_get_u32(nla_u_thresh); udest->l_threshold = nla_get_u32(nla_l_thresh); + + if (nla_tun_type) + udest->tun_type = nla_get_u8(nla_tun_type); + + if (nla_tun_port) + udest->tun_port = nla_get_be16(nla_tun_port); } return 0; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 175349fcf91f..8d6f94b67772 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -32,6 +32,7 @@ #include <linux/slab.h> #include <linux/tcp.h> /* for tcphdr */ #include <net/ip.h> +#include <net/gue.h> #include <net/tcp.h> /* for csum_tcpudp_magic */ #include <net/udp.h> #include <net/icmp.h> /* for icmp_send */ @@ -382,6 +383,10 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst); } else { mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + if (!dest) + goto err_put; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); goto err_put; @@ -533,6 +538,10 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst); else { mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); + if (!dest) + goto err_put; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); if (mtu < IPV6_MIN_MTU) { IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); @@ -989,6 +998,41 @@ static inline int __tun_gso_type_mask(int encaps_af, int orig_af) } } +static int +ipvs_gue_encap(struct net *net, struct sk_buff *skb, + struct ip_vs_conn *cp, __u8 *next_protocol) +{ + __be16 dport; + __be16 sport = udp_flow_src_port(net, skb, 0, 0, false); + struct udphdr *udph; /* Our new UDP header */ + struct guehdr *gueh; /* Our new GUE header */ + + skb_push(skb, sizeof(struct guehdr)); + + gueh = (struct guehdr *)skb->data; + + gueh->control = 0; + 
gueh->version = 0; + gueh->hlen = 0; + gueh->flags = 0; + gueh->proto_ctype = *next_protocol; + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + udph = udp_hdr(skb); + + dport = cp->dest->tun_port; + udph->dest = dport; + udph->source = sport; + udph->len = htons(skb->len); + udph->check = 0; + + *next_protocol = IPPROTO_UDP; + + return 0; +} + /* * IP Tunneling transmitter * @@ -1025,6 +1069,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int ret, local; + int tun_type, gso_type; EnterFunction(10); @@ -1046,6 +1091,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); + tun_type = cp->dest->tun_type; + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ dfp = sysctl_pmtu_disc(ipvs) ? 
&df : NULL; skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, @@ -1054,11 +1104,20 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af))) + gso_type = __tun_gso_type_mask(AF_INET, cp->af); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + gso_type |= SKB_GSO_UDP_TUNNEL; + + if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; skb->transport_header = skb->network_header; + skb_set_inner_ipproto(skb, next_protocol); + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + ipvs_gue_encap(net, skb, cp, &next_protocol); + skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1102,6 +1161,8 @@ int ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { + struct netns_ipvs *ipvs = cp->ipvs; + struct net *net = ipvs->net; struct rt6_info *rt; /* Route to the other host */ struct in6_addr saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ @@ -1112,10 +1173,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int ret, local; + int tun_type, gso_type; EnterFunction(10); - local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest, &cp->daddr.in6, &saddr, ipvsh, 1, IP_VS_RT_MODE_LOCAL | @@ -1134,17 +1196,31 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); + tun_type = cp->dest->tun_type; + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, &next_protocol, &payload_len, &dsfield, &ttl, NULL); 
if (IS_ERR(skb)) goto tx_error; - if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af))) + gso_type = __tun_gso_type_mask(AF_INET6, cp->af); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + gso_type |= SKB_GSO_UDP_TUNNEL; + + if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; skb->transport_header = skb->network_header; + skb_set_inner_ipproto(skb, next_protocol); + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + ipvs_gue_encap(net, skb, cp, &next_protocol); + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1167,7 +1243,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) - ip6_local_out(cp->ipvs->net, skb->sk, skb); + ip6_local_out(net, skb->sk, skb); else if (ret == NF_DROP) kfree_skb(skb); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 82bfbeef46af..2a714527cde1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -25,6 +25,7 @@ #include <linux/slab.h> #include <linux/random.h> #include <linux/jhash.h> +#include <linux/siphash.h> #include <linux/err.h> #include <linux/percpu.h> #include <linux/moduleparam.h> @@ -449,6 +450,40 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, } EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); +/* Generate a almost-unique pseudo-id for a given conntrack. + * + * intentionally doesn't re-use any of the seeds used for hash + * table location, we assume id gets exposed to userspace. + * + * Following nf_conn items do not change throughout lifetime + * of the nf_conn after it has been committed to main hash table: + * + * 1. nf_conn address + * 2. nf_conn->ext address + * 3. nf_conn->master address (normally NULL) + * 4. tuple + * 5. 
the associated net namespace + */ +u32 nf_ct_get_id(const struct nf_conn *ct) +{ + static __read_mostly siphash_key_t ct_id_seed; + unsigned long a, b, c, d; + + net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); + + a = (unsigned long)ct; + b = (unsigned long)ct->master ^ net_hash_mix(nf_ct_net(ct)); + c = (unsigned long)ct->ext; + d = (unsigned long)siphash(&ct->tuplehash, sizeof(ct->tuplehash), + &ct_id_seed); +#ifdef CONFIG_64BIT + return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed); +#else + return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed); +#endif +} +EXPORT_SYMBOL_GPL(nf_ct_get_id); + static void clean_from_lists(struct nf_conn *ct) { @@ -982,12 +1017,9 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* set conntrack timestamp, if enabled. */ tstamp = nf_conn_tstamp_find(ct); - if (tstamp) { - if (skb->tstamp == 0) - __net_timestamp(skb); + if (tstamp) + tstamp->start = ktime_get_real_ns(); - tstamp->start = ktime_to_ns(skb->tstamp); - } /* Since the lookup is lockless, hash insertion must be done after * starting the timer and setting the CONFIRMED bit. 
The RCU barriers * guarantee that no other CPU can find the conntrack before the above @@ -1350,6 +1382,7 @@ __nf_conntrack_alloc(struct net *net, /* save hash for reusing when confirming */ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; ct->status = 0; + ct->timeout = 0; write_pnet(&ct->ct_net, net); memset(&ct->__nfct_init_offset[0], 0, offsetof(struct nf_conn, proto) - diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 334d6e5b7762..59c18804a10a 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -336,7 +336,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, exp->tuple.dst.u.all = *dst; -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) memset(&exp->saved_addr, 0, sizeof(exp->saved_addr)); memset(&exp->saved_proto, 0, sizeof(exp->saved_proto)); #endif diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 66c596d287a5..d547a777192f 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -29,6 +29,7 @@ #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/slab.h> +#include <linux/siphash.h> #include <linux/netfilter.h> #include <net/netlink.h> @@ -45,7 +46,7 @@ #include <net/netfilter/nf_conntrack_timestamp.h> #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_synproxy.h> -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #endif @@ -485,7 +486,9 @@ nla_put_failure: static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) { - if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct))) + __be32 id = (__force __be32)nf_ct_get_id(ct); + + if (nla_put_be32(skb, CTA_ID, id)) goto nla_put_failure; return 0; @@ -655,7 +658,7 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) + 
nla_total_size(0) /* CTA_HELP */ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + ctnetlink_secctx_size(ct) -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ #endif @@ -1286,8 +1289,9 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, } if (cda[CTA_ID]) { - u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID])); - if (id != (u32)(unsigned long)ct) { + __be32 id = nla_get_be32(cda[CTA_ID]); + + if (id != (__force __be32)nf_ct_get_id(ct)) { nf_ct_put(ct); return -ENOENT; } @@ -1494,7 +1498,7 @@ static int ctnetlink_get_ct_unconfirmed(struct net *net, struct sock *ctnl, return -EOPNOTSUPP; } -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) static int ctnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, @@ -1586,7 +1590,7 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) static int ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[]) { -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) int ret; if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) @@ -2369,7 +2373,7 @@ ctnetlink_glue_build_size(const struct nf_conn *ct) + nla_total_size(0) /* CTA_HELP */ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + ctnetlink_secctx_size(ct) -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ #endif @@ -2692,6 +2696,25 @@ nla_put_failure: static const union nf_inet_addr any_addr; +static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp) +{ + static __read_mostly siphash_key_t exp_id_seed; + unsigned long a, b, c, d; + + net_get_random_once(&exp_id_seed, sizeof(exp_id_seed)); + + a = (unsigned long)exp; + b = (unsigned long)exp->helper; + c = (unsigned long)exp->master; + d = (unsigned 
long)siphash(&exp->tuple, sizeof(exp->tuple), &exp_id_seed); + +#ifdef CONFIG_64BIT + return (__force __be32)siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &exp_id_seed); +#else + return (__force __be32)siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &exp_id_seed); +#endif +} + static int ctnetlink_exp_dump_expect(struct sk_buff *skb, const struct nf_conntrack_expect *exp) @@ -2699,7 +2722,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, struct nf_conn *master = exp->master; long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ; struct nf_conn_help *help; -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) struct nlattr *nest_parms; struct nf_conntrack_tuple nat_tuple = {}; #endif @@ -2717,7 +2740,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, CTA_EXPECT_MASTER) < 0) goto nla_put_failure; -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) || exp->saved_proto.all) { nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT | NLA_F_NESTED); @@ -2739,7 +2762,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, } #endif if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) || - nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) || + nla_put_be32(skb, CTA_EXPECT_ID, nf_expect_get_id(exp)) || nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) || nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class))) goto nla_put_failure; @@ -3044,7 +3067,8 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, if (cda[CTA_EXPECT_ID]) { __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); - if (ntohl(id) != (u32)(unsigned long)exp) { + + if (id != nf_expect_get_id(exp)) { nf_ct_expect_put(exp); return -ENOENT; } @@ -3180,7 +3204,7 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr, struct nf_conntrack_expect *exp, u_int8_t u3) { -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) struct nlattr *tb[CTA_EXPECT_NAT_MAX+1]; struct nf_conntrack_tuple nat_tuple = {}; int err; diff 
--git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index b9403a266a2e..37bb530d848f 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -55,7 +55,7 @@ void nf_l4proto_log_invalid(const struct sk_buff *skb, struct va_format vaf; va_list args; - if (net->ct.sysctl_log_invalid != protonum || + if (net->ct.sysctl_log_invalid != protonum && net->ct.sysctl_log_invalid != IPPROTO_RAW) return; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 7df477996b16..9becac953587 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -103,49 +103,94 @@ int nf_conntrack_icmp_packet(struct nf_conn *ct, return NF_ACCEPT; } -/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ -static int -icmp_error_message(struct nf_conn *tmpl, struct sk_buff *skb, - const struct nf_hook_state *state) +/* Check inner header is related to any of the existing connections */ +int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state, + u8 l4proto, union nf_inet_addr *outer_daddr) { struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_zone *zone; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; + union nf_inet_addr *ct_daddr; + enum ip_conntrack_dir dir; + struct nf_conn *ct; WARN_ON(skb_nfct(skb)); zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); /* Are they talking about one of our connections? */ - if (!nf_ct_get_tuplepr(skb, - skb_network_offset(skb) + ip_hdrlen(skb) - + sizeof(struct icmphdr), - PF_INET, state->net, &origtuple)) { - pr_debug("icmp_error_message: failed to get tuple\n"); + if (!nf_ct_get_tuplepr(skb, dataoff, + state->pf, state->net, &origtuple)) return -NF_ACCEPT; - } /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. 
*/ - if (!nf_ct_invert_tuple(&innertuple, &origtuple)) { - pr_debug("icmp_error_message: no match\n"); + if (!nf_ct_invert_tuple(&innertuple, &origtuple)) return -NF_ACCEPT; - } - - ctinfo = IP_CT_RELATED; h = nf_conntrack_find_get(state->net, zone, &innertuple); - if (!h) { - pr_debug("icmp_error_message: no match\n"); + if (!h) + return -NF_ACCEPT; + + /* Consider: A -> T (=This machine) -> B + * Conntrack entry will look like this: + * Original: A->B + * Reply: B->T (SNAT case) OR A + * + * When this function runs, we got packet that looks like this: + * iphdr|icmphdr|inner_iphdr|l4header (tcp, udp, ..). + * + * Above nf_conntrack_find_get() makes lookup based on inner_hdr, + * so we should expect that destination of the found connection + * matches outer header destination address. + * + * In above example, we can consider these two cases: + * 1. Error coming in reply direction from B or M (middle box) to + * T (SNAT case) or A. + * Inner saddr will be B, dst will be T or A. + * The found conntrack will be reply tuple (B->T/A). + * 2. Error coming in original direction from A or M to B. + * Inner saddr will be A, inner daddr will be B. + * The found conntrack will be original tuple (A->B). + * + * In both cases, conntrack[dir].dst == inner.dst. + * + * A bogus packet could look like this: + * Inner: B->T + * Outer: B->X (other machine reachable by T). + * + * In this case, lookup yields connection A->B and will + * set packet from B->X as *RELATED*, even though no connection + * from X was ever seen. 
+ */ + ct = nf_ct_tuplehash_to_ctrack(h); + dir = NF_CT_DIRECTION(h); + ct_daddr = &ct->tuplehash[dir].tuple.dst.u3; + if (!nf_inet_addr_cmp(outer_daddr, ct_daddr)) { + if (state->pf == AF_INET) { + nf_l4proto_log_invalid(skb, state->net, state->pf, + l4proto, + "outer daddr %pI4 != inner %pI4", + &outer_daddr->ip, &ct_daddr->ip); + } else if (state->pf == AF_INET6) { + nf_l4proto_log_invalid(skb, state->net, state->pf, + l4proto, + "outer daddr %pI6 != inner %pI6", + &outer_daddr->ip6, &ct_daddr->ip6); + } + nf_ct_put(ct); return -NF_ACCEPT; } - if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + ctinfo = IP_CT_RELATED; + if (dir == IP_CT_DIR_REPLY) ctinfo += IP_CT_IS_REPLY; /* Update skb to refer to this connection */ - nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); + nf_ct_set(skb, ct, ctinfo); return NF_ACCEPT; } @@ -162,11 +207,12 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { + union nf_inet_addr outer_daddr; const struct icmphdr *icmph; struct icmphdr _ih; /* Not enough header? 
*/ - icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); + icmph = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmph == NULL) { icmp_error_log(skb, state, "short packet"); return -NF_ACCEPT; @@ -199,7 +245,12 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, icmph->type != ICMP_REDIRECT) return NF_ACCEPT; - return icmp_error_message(tmpl, skb, state); + memset(&outer_daddr, 0, sizeof(outer_daddr)); + outer_daddr.ip = ip_hdr(skb)->daddr; + + dataoff += sizeof(*icmph); + return nf_conntrack_inet_error(tmpl, skb, dataoff, state, + IPPROTO_ICMP, &outer_daddr); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index bec4a3211658..c63ee3612855 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -123,51 +123,6 @@ int nf_conntrack_icmpv6_packet(struct nf_conn *ct, return NF_ACCEPT; } -static int -icmpv6_error_message(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, - unsigned int icmp6off) -{ - struct nf_conntrack_tuple intuple, origtuple; - const struct nf_conntrack_tuple_hash *h; - enum ip_conntrack_info ctinfo; - struct nf_conntrack_zone tmp; - - WARN_ON(skb_nfct(skb)); - - /* Are they talking about one of our connections? */ - if (!nf_ct_get_tuplepr(skb, - skb_network_offset(skb) - + sizeof(struct ipv6hdr) - + sizeof(struct icmp6hdr), - PF_INET6, net, &origtuple)) { - pr_debug("icmpv6_error: Can't get tuple\n"); - return -NF_ACCEPT; - } - - /* Ordinarily, we'd expect the inverted tupleproto, but it's - been preserved inside the ICMP. 
*/ - if (!nf_ct_invert_tuple(&intuple, &origtuple)) { - pr_debug("icmpv6_error: Can't invert tuple\n"); - return -NF_ACCEPT; - } - - ctinfo = IP_CT_RELATED; - - h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp), - &intuple); - if (!h) { - pr_debug("icmpv6_error: no match\n"); - return -NF_ACCEPT; - } else { - if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) - ctinfo += IP_CT_IS_REPLY; - } - - /* Update skb to refer to this connection */ - nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); - return NF_ACCEPT; -} static void icmpv6_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, @@ -182,6 +137,7 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, unsigned int dataoff, const struct nf_hook_state *state) { + union nf_inet_addr outer_daddr; const struct icmp6hdr *icmp6h; struct icmp6hdr _ih; int type; @@ -210,7 +166,11 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; - return icmpv6_error_message(state->net, tmpl, skb, dataoff); + memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, + sizeof(outer_daddr.ip6)); + dataoff += sizeof(*icmp6h); + return nf_conntrack_inet_error(tmpl, skb, dataoff, state, + IPPROTO_ICMPV6, &outer_daddr); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 39fcc1ed18f3..d5454d1031a3 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -928,7 +928,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, nfct_help(exp->master)->helper != nfct_help(ct)->helper || exp->class != class) break; -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) if (!direct_rtp && (!nf_inet_addr_cmp(&exp->saved_addr, &exp->tuple.dst.u3) || exp->saved_proto.udp.port != exp->tuple.dst.u.udp.port) && diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 1d291a51cd45..6452550d187f 100644 --- 
a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -235,13 +235,10 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (tuplehash == NULL) return NF_ACCEPT; - outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); - if (!outdev) - return NF_ACCEPT; - dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + outdev = rt->dst.dev; if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) && (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0) @@ -452,13 +449,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (tuplehash == NULL) return NF_ACCEPT; - outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); - if (!outdev) - return NF_ACCEPT; - dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache; + outdev = rt->dst.dev; if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) return NF_ACCEPT; diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index e15779fd58e3..d6c43902ebd7 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -7,9 +7,6 @@ #include <linux/netdevice.h> /* nf_queue.c */ -int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, - const struct nf_hook_entries *entries, unsigned int index, - unsigned int verdict); void nf_queue_nf_hook_drop(struct net *net); /* nf_log.c */ diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index af7dc6537758..715e3d4d761b 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -415,9 +415,14 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, case IPPROTO_ICMPV6: /* id is same for either direction... 
*/ keyptr = &tuple->src.u.icmp.id; - min = range->min_proto.icmp.id; - range_size = ntohs(range->max_proto.icmp.id) - - ntohs(range->min_proto.icmp.id) + 1; + if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { + min = 0; + range_size = 65536; + } else { + min = ntohs(range->min_proto.icmp.id); + range_size = ntohs(range->max_proto.icmp.id) - + ntohs(range->min_proto.icmp.id) + 1; + } goto find_free_id; #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) case IPPROTO_GRE: @@ -1009,7 +1014,7 @@ static struct nf_ct_helper_expectfn follow_master_nat = { .expectfn = nf_nat_follow_master, }; -int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops, +int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); @@ -1019,14 +1024,12 @@ int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops, struct nf_hook_ops *nat_ops; int i, ret; - if (WARN_ON_ONCE(ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net))) + if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net))) return -EINVAL; - nat_proto_net = &nat_net->nat_proto_net[ops->pf]; + nat_proto_net = &nat_net->nat_proto_net[pf]; for (i = 0; i < ops_count; i++) { - if (WARN_ON(orig_nat_ops[i].pf != ops->pf)) - return -EINVAL; if (orig_nat_ops[i].hooknum == hooknum) { hooknum = i; break; @@ -1086,8 +1089,8 @@ int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops, return ret; } -void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops, - unsigned int ops_count) +void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, + unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; @@ -1096,10 +1099,10 @@ void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops, int hooknum = ops->hooknum; int i; - if (ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net)) + 
if (pf >= ARRAY_SIZE(nat_net->nat_proto_net)) return; - nat_proto_net = &nat_net->nat_proto_net[ops->pf]; + nat_proto_net = &nat_net->nat_proto_net[pf]; mutex_lock(&nf_nat_proto_mutex); if (WARN_ON(nat_proto_net->users == 0)) diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index d85c4d902e7b..8e8a65d46345 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -7,12 +7,10 @@ #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> -#include <net/netfilter/ipv4/nf_nat_masquerade.h> -#include <net/netfilter/ipv6/nf_nat_masquerade.h> +#include <net/netfilter/nf_nat_masquerade.h> static DEFINE_MUTEX(masq_mutex); -static unsigned int masq_refcnt4 __read_mostly; -static unsigned int masq_refcnt6 __read_mostly; +static unsigned int masq_refcnt __read_mostly; unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, @@ -137,56 +135,6 @@ static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; -int nf_nat_masquerade_ipv4_register_notifier(void) -{ - int ret = 0; - - mutex_lock(&masq_mutex); - if (WARN_ON_ONCE(masq_refcnt4 == UINT_MAX)) { - ret = -EOVERFLOW; - goto out_unlock; - } - - /* check if the notifier was already set */ - if (++masq_refcnt4 > 1) - goto out_unlock; - - /* Register for device down reports */ - ret = register_netdevice_notifier(&masq_dev_notifier); - if (ret) - goto err_dec; - /* Register IP address change reports */ - ret = register_inetaddr_notifier(&masq_inet_notifier); - if (ret) - goto err_unregister; - - mutex_unlock(&masq_mutex); - return ret; - -err_unregister: - unregister_netdevice_notifier(&masq_dev_notifier); -err_dec: - masq_refcnt4--; -out_unlock: - mutex_unlock(&masq_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier); - -void nf_nat_masquerade_ipv4_unregister_notifier(void) -{ - mutex_lock(&masq_mutex); - /* check if the notifier still has clients */ - if (--masq_refcnt4 > 
0) - goto out_unlock; - - unregister_netdevice_notifier(&masq_dev_notifier); - unregister_inetaddr_notifier(&masq_inet_notifier); -out_unlock: - mutex_unlock(&masq_mutex); -} -EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier); - #if IS_ENABLED(CONFIG_IPV6) static atomic_t v6_worker_count __read_mostly; @@ -322,44 +270,68 @@ static struct notifier_block masq_inet6_notifier = { .notifier_call = masq_inet6_event, }; -int nf_nat_masquerade_ipv6_register_notifier(void) +static int nf_nat_masquerade_ipv6_register_notifier(void) +{ + return register_inet6addr_notifier(&masq_inet6_notifier); +} +#else +static inline int nf_nat_masquerade_ipv6_register_notifier(void) { return 0; } +#endif + +int nf_nat_masquerade_inet_register_notifiers(void) { int ret = 0; mutex_lock(&masq_mutex); - if (WARN_ON_ONCE(masq_refcnt6 == UINT_MAX)) { + if (WARN_ON_ONCE(masq_refcnt == UINT_MAX)) { ret = -EOVERFLOW; goto out_unlock; } - /* check if the notifier is already set */ - if (++masq_refcnt6 > 1) + /* check if the notifier was already set */ + if (++masq_refcnt > 1) goto out_unlock; - ret = register_inet6addr_notifier(&masq_inet6_notifier); + /* Register for device down reports */ + ret = register_netdevice_notifier(&masq_dev_notifier); if (ret) goto err_dec; + /* Register IP address change reports */ + ret = register_inetaddr_notifier(&masq_inet_notifier); + if (ret) + goto err_unregister; + + ret = nf_nat_masquerade_ipv6_register_notifier(); + if (ret) + goto err_unreg_inet; mutex_unlock(&masq_mutex); return ret; +err_unreg_inet: + unregister_inetaddr_notifier(&masq_inet_notifier); +err_unregister: + unregister_netdevice_notifier(&masq_dev_notifier); err_dec: - masq_refcnt6--; + masq_refcnt--; out_unlock: mutex_unlock(&masq_mutex); return ret; } -EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_register_notifier); +EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_register_notifiers); -void nf_nat_masquerade_ipv6_unregister_notifier(void) +void 
nf_nat_masquerade_inet_unregister_notifiers(void) { mutex_lock(&masq_mutex); - /* check if the notifier still has clients */ - if (--masq_refcnt6 > 0) + /* check if the notifiers still have clients */ + if (--masq_refcnt > 0) goto out_unlock; + unregister_netdevice_notifier(&masq_dev_notifier); + unregister_inetaddr_notifier(&masq_inet_notifier); +#if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&masq_inet6_notifier); +#endif out_unlock: mutex_unlock(&masq_mutex); } -EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier); -#endif +EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_unregister_notifiers); diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 62743da3004f..84f5c90a7f21 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -725,7 +725,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, return ret; } -static const struct nf_hook_ops nf_nat_ipv4_ops[] = { +const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_in, @@ -758,13 +758,14 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops) { - return nf_nat_register_fn(net, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); + return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv4_ops, + ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_register_fn); void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { - nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); + nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_unregister_fn); @@ -925,20 +926,6 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb, return ret; } -static int nat_route_me_harder(struct net *net, struct sk_buff *skb) -{ -#ifdef CONFIG_IPV6_MODULE - const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); - - if (!v6_ops) - return -EHOSTUNREACH; - - 
return v6_ops->route_me_harder(net, skb); -#else - return ip6_route_me_harder(net, skb); -#endif -} - static unsigned int nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -958,7 +945,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &ct->tuplehash[!dir].tuple.src.u3)) { - err = nat_route_me_harder(state->net, skb); + err = nf_ip6_route_me_harder(state->net, skb); if (err < 0) ret = NF_DROP_ERR(err); } @@ -977,7 +964,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, return ret; } -static const struct nf_hook_ops nf_nat_ipv6_ops[] = { +const struct nf_hook_ops nf_nat_ipv6_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_in, @@ -1010,14 +997,44 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops) { - return nf_nat_register_fn(net, ops, nf_nat_ipv6_ops, + return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv6_register_fn); void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { - nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); + nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv6_unregister_fn); #endif /* CONFIG_IPV6 */ + +#if defined(CONFIG_NF_TABLES_INET) && IS_ENABLED(CONFIG_NFT_NAT) +int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops) +{ + int ret; + + if (WARN_ON_ONCE(ops->pf != NFPROTO_INET)) + return -EINVAL; + + ret = nf_nat_register_fn(net, NFPROTO_IPV6, ops, nf_nat_ipv6_ops, + ARRAY_SIZE(nf_nat_ipv6_ops)); + if (ret) + return ret; + + ret = nf_nat_register_fn(net, NFPROTO_IPV4, ops, nf_nat_ipv4_ops, + ARRAY_SIZE(nf_nat_ipv4_ops)); + if (ret) + nf_nat_ipv6_unregister_fn(net, ops); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_inet_register_fn); + +void 
nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops) +{ + nf_nat_unregister_fn(net, NFPROTO_IPV4, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); + nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); +} +EXPORT_SYMBOL_GPL(nf_nat_inet_unregister_fn); +#endif /* NFT INET NAT */ diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index a36a77bae1d6..9dc1d6e04946 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -240,6 +240,7 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, return 0; } +EXPORT_SYMBOL_GPL(nf_queue); static unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 90e6b09ef2af..9d888dc6be38 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1544,7 +1544,7 @@ static int nft_chain_parse_hook(struct net *net, if (IS_ERR(type)) return PTR_ERR(type); } - if (!(type->hook_mask & (1 << hook->num))) + if (hook->num > NF_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) return -EOPNOTSUPP; if (type->type == NFT_CHAIN_T_NAT && @@ -3193,9 +3193,7 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) static __be64 nf_jiffies64_to_msecs(u64 input) { - u64 ms = jiffies64_to_nsecs(input); - - return cpu_to_be64(div_u64(ms, NSEC_PER_MSEC)); + return cpu_to_be64(jiffies64_to_msecs(input)); } static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, @@ -3438,8 +3436,7 @@ err: return err; } -static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, - struct nft_set_desc *desc, +static int nf_tables_set_desc_parse(struct nft_set_desc *desc, const struct nlattr *nla) { struct nlattr *da[NFTA_SET_DESC_MAX + 1]; @@ -3565,7 +3562,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); if (nla[NFTA_SET_DESC] != NULL) { - err = nf_tables_set_desc_parse(&ctx, 
&desc, nla[NFTA_SET_DESC]); + err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]); if (err < 0) return err; } @@ -3785,8 +3782,8 @@ bind: } EXPORT_SYMBOL_GPL(nf_tables_bind_set); -void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_binding *binding, bool event) +static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_binding *binding, bool event) { list_del_rcu(&binding->list); @@ -3797,7 +3794,6 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, GFP_KERNEL); } } -EXPORT_SYMBOL_GPL(nf_tables_unbind_set); void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, @@ -7533,6 +7529,7 @@ static int __init nf_tables_module_init(void) if (err < 0) goto err5; + nft_chain_route_init(); return err; err5: rhltable_destroy(&nft_objname_ht); @@ -7552,6 +7549,7 @@ static void __exit nf_tables_module_exit(void) nfnetlink_subsys_unregister(&nf_tables_subsys); unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); + nft_chain_route_fini(); unregister_pernet_subsys(&nf_tables_net_ops); cancel_work_sync(&trans_destroy_work); rcu_barrier(); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b1f9c5303f02..0b3347570265 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -540,7 +540,7 @@ __build_packet_message(struct nfnl_log_net *log, goto nla_put_failure; } - if (skb->tstamp) { + if (hooknum <= NF_INET_FORWARD && skb->tstamp) { struct nfulnl_msg_packet_timestamp ts; struct timespec64 kts = ktime_to_timespec64(skb->tstamp); ts.sec = cpu_to_be64(kts.tv_sec); diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 1f1d90c1716b..7b827bcb412c 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -255,9 +255,9 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family, } 
EXPORT_SYMBOL_GPL(nf_osf_match); -const char *nf_osf_find(const struct sk_buff *skb, - const struct list_head *nf_osf_fingers, - const int ttl_check) +bool nf_osf_find(const struct sk_buff *skb, + const struct list_head *nf_osf_fingers, + const int ttl_check, struct nf_osf_data *data) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; @@ -265,24 +265,24 @@ const char *nf_osf_find(const struct sk_buff *skb, const struct nf_osf_finger *kf; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; - const char *genre = NULL; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); if (!tcp) - return NULL; + return false; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; - genre = f->genre; + data->genre = f->genre; + data->version = f->version; break; } - return genre; + return true; } EXPORT_SYMBOL_GPL(nf_osf_find); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 0dcc3592d053..e057b2961d31 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -582,7 +582,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, if (nfqnl_put_bridge(entry, skb) < 0) goto nla_put_failure; - if (entskb->tstamp) { + if (entry->state.hook <= NF_INET_FORWARD && entskb->tstamp) { struct nfqnl_msg_packet_timestamp ts; struct timespec64 kts = ktime_to_timespec64(entskb->tstamp); diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c index ee4852088d50..2f89bde3c61c 100644 --- a/net/netfilter/nft_chain_nat.c +++ b/net/netfilter/nft_chain_nat.c @@ -74,6 +74,36 @@ static const struct nft_chain_type nft_chain_nat_ipv6 = { }; #endif +#ifdef CONFIG_NF_TABLES_INET +static int nft_nat_inet_reg(struct net *net, const struct nf_hook_ops *ops) +{ + return nf_nat_inet_register_fn(net, ops); +} + +static void nft_nat_inet_unreg(struct net *net, const struct 
nf_hook_ops *ops) +{ + nf_nat_inet_unregister_fn(net, ops); +} + +static const struct nft_chain_type nft_chain_nat_inet = { + .name = "nat", + .type = NFT_CHAIN_T_NAT, + .family = NFPROTO_INET, + .hook_mask = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_PRE_ROUTING] = nft_nat_do_chain, + [NF_INET_LOCAL_IN] = nft_nat_do_chain, + [NF_INET_LOCAL_OUT] = nft_nat_do_chain, + [NF_INET_POST_ROUTING] = nft_nat_do_chain, + }, + .ops_register = nft_nat_inet_reg, + .ops_unregister = nft_nat_inet_unreg, +}; +#endif + static int __init nft_chain_nat_init(void) { #ifdef CONFIG_NF_TABLES_IPV6 @@ -82,6 +112,9 @@ static int __init nft_chain_nat_init(void) #ifdef CONFIG_NF_TABLES_IPV4 nft_register_chain_type(&nft_chain_nat_ipv4); #endif +#ifdef CONFIG_NF_TABLES_INET + nft_register_chain_type(&nft_chain_nat_inet); +#endif return 0; } @@ -94,6 +127,9 @@ static void __exit nft_chain_nat_exit(void) #ifdef CONFIG_NF_TABLES_IPV6 nft_unregister_chain_type(&nft_chain_nat_ipv6); #endif +#ifdef CONFIG_NF_TABLES_INET + nft_unregister_chain_type(&nft_chain_nat_inet); +#endif } module_init(nft_chain_nat_init); diff --git a/net/netfilter/nft_chain_route.c b/net/netfilter/nft_chain_route.c new file mode 100644 index 000000000000..8826bbe71136 --- /dev/null +++ b/net/netfilter/nft_chain_route.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/netfilter/nf_tables_ipv6.h> +#include <net/route.h> +#include <net/ip.h> + +#ifdef CONFIG_NF_TABLES_IPV4 +static unsigned int nf_route_table_hook4(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + const struct iphdr *iph; + struct 
nft_pktinfo pkt; + __be32 saddr, daddr; + unsigned int ret; + u32 mark; + int err; + u8 tos; + + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb); + + mark = skb->mark; + iph = ip_hdr(skb); + saddr = iph->saddr; + daddr = iph->daddr; + tos = iph->tos; + + ret = nft_do_chain(&pkt, priv); + if (ret == NF_ACCEPT) { + iph = ip_hdr(skb); + + if (iph->saddr != saddr || + iph->daddr != daddr || + skb->mark != mark || + iph->tos != tos) { + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + if (err < 0) + ret = NF_DROP_ERR(err); + } + } + return ret; +} + +static const struct nft_chain_type nft_chain_route_ipv4 = { + .name = "route", + .type = NFT_CHAIN_T_ROUTE, + .family = NFPROTO_IPV4, + .hook_mask = (1 << NF_INET_LOCAL_OUT), + .hooks = { + [NF_INET_LOCAL_OUT] = nf_route_table_hook4, + }, +}; +#endif + +#ifdef CONFIG_NF_TABLES_IPV6 +static unsigned int nf_route_table_hook6(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct in6_addr saddr, daddr; + struct nft_pktinfo pkt; + u32 mark, flowlabel; + unsigned int ret; + u8 hop_limit; + int err; + + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb); + + /* save source/dest address, mark, hoplimit, flowlabel, priority */ + memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); + memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr)); + mark = skb->mark; + hop_limit = ipv6_hdr(skb)->hop_limit; + + /* flowlabel and prio (includes version, which shouldn't change either)*/ + flowlabel = *((u32 *)ipv6_hdr(skb)); + + ret = nft_do_chain(&pkt, priv); + if (ret == NF_ACCEPT && + (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || + memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || + skb->mark != mark || + ipv6_hdr(skb)->hop_limit != hop_limit || + flowlabel != *((u32 *)ipv6_hdr(skb)))) { + err = nf_ip6_route_me_harder(state->net, skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } + + return ret; +} + +static const struct nft_chain_type 
nft_chain_route_ipv6 = { + .name = "route", + .type = NFT_CHAIN_T_ROUTE, + .family = NFPROTO_IPV6, + .hook_mask = (1 << NF_INET_LOCAL_OUT), + .hooks = { + [NF_INET_LOCAL_OUT] = nf_route_table_hook6, + }, +}; +#endif + +#ifdef CONFIG_NF_TABLES_INET +static unsigned int nf_route_table_inet(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + switch (state->pf) { + case NFPROTO_IPV4: + return nf_route_table_hook4(priv, skb, state); + case NFPROTO_IPV6: + return nf_route_table_hook6(priv, skb, state); + default: + nft_set_pktinfo(&pkt, skb, state); + break; + } + + return nft_do_chain(&pkt, priv); +} + +static const struct nft_chain_type nft_chain_route_inet = { + .name = "route", + .type = NFT_CHAIN_T_ROUTE, + .family = NFPROTO_INET, + .hook_mask = (1 << NF_INET_LOCAL_OUT), + .hooks = { + [NF_INET_LOCAL_OUT] = nf_route_table_inet, + }, +}; +#endif + +void __init nft_chain_route_init(void) +{ +#ifdef CONFIG_NF_TABLES_IPV6 + nft_register_chain_type(&nft_chain_route_ipv6); +#endif +#ifdef CONFIG_NF_TABLES_IPV4 + nft_register_chain_type(&nft_chain_route_ipv4); +#endif +#ifdef CONFIG_NF_TABLES_INET + nft_register_chain_type(&nft_chain_route_inet); +#endif +} + +void __exit nft_chain_route_fini(void) +{ +#ifdef CONFIG_NF_TABLES_IPV6 + nft_unregister_chain_type(&nft_chain_route_ipv6); +#endif +#ifdef CONFIG_NF_TABLES_IPV4 + nft_unregister_chain_type(&nft_chain_route_ipv4); +#endif +#ifdef CONFIG_NF_TABLES_INET + nft_unregister_chain_type(&nft_chain_route_inet); +#endif +} diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index bee156eaa400..86fd90085eaf 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -14,8 +14,7 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_nat.h> -#include <net/netfilter/ipv4/nf_nat_masquerade.h> -#include <net/netfilter/ipv6/nf_nat_masquerade.h> +#include <net/netfilter/nf_nat_masquerade.h> struct nft_masq { 
u32 flags; @@ -196,28 +195,73 @@ static struct nft_expr_type nft_masq_ipv6_type __read_mostly = { static int __init nft_masq_module_init_ipv6(void) { - int ret = nft_register_expr(&nft_masq_ipv6_type); - - if (ret) - return ret; - - ret = nf_nat_masquerade_ipv6_register_notifier(); - if (ret < 0) - nft_unregister_expr(&nft_masq_ipv6_type); - - return ret; + return nft_register_expr(&nft_masq_ipv6_type); } static void nft_masq_module_exit_ipv6(void) { nft_unregister_expr(&nft_masq_ipv6_type); - nf_nat_masquerade_ipv6_unregister_notifier(); } #else static inline int nft_masq_module_init_ipv6(void) { return 0; } static inline void nft_masq_module_exit_ipv6(void) {} #endif +#ifdef CONFIG_NF_TABLES_INET +static void nft_masq_inet_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + return nft_masq_ipv4_eval(expr, regs, pkt); + case NFPROTO_IPV6: + return nft_masq_ipv6_eval(expr, regs, pkt); + } + + WARN_ON_ONCE(1); +} + +static void +nft_masq_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nf_ct_netns_put(ctx->net, NFPROTO_INET); +} + +static struct nft_expr_type nft_masq_inet_type; +static const struct nft_expr_ops nft_masq_inet_ops = { + .type = &nft_masq_inet_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), + .eval = nft_masq_inet_eval, + .init = nft_masq_init, + .destroy = nft_masq_inet_destroy, + .dump = nft_masq_dump, + .validate = nft_masq_validate, +}; + +static struct nft_expr_type nft_masq_inet_type __read_mostly = { + .family = NFPROTO_INET, + .name = "masq", + .ops = &nft_masq_inet_ops, + .policy = nft_masq_policy, + .maxattr = NFTA_MASQ_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_masq_module_init_inet(void) +{ + return nft_register_expr(&nft_masq_inet_type); +} + +static void nft_masq_module_exit_inet(void) +{ + nft_unregister_expr(&nft_masq_inet_type); +} +#else +static inline int nft_masq_module_init_inet(void) { return 
0; } +static inline void nft_masq_module_exit_inet(void) {} +#endif + static int __init nft_masq_module_init(void) { int ret; @@ -226,15 +270,23 @@ static int __init nft_masq_module_init(void) if (ret < 0) return ret; + ret = nft_masq_module_init_inet(); + if (ret < 0) { + nft_masq_module_exit_ipv6(); + return ret; + } + ret = nft_register_expr(&nft_masq_ipv4_type); if (ret < 0) { + nft_masq_module_exit_inet(); nft_masq_module_exit_ipv6(); return ret; } - ret = nf_nat_masquerade_ipv4_register_notifier(); + ret = nf_nat_masquerade_inet_register_notifiers(); if (ret < 0) { nft_masq_module_exit_ipv6(); + nft_masq_module_exit_inet(); nft_unregister_expr(&nft_masq_ipv4_type); return ret; } @@ -245,8 +297,9 @@ static int __init nft_masq_module_init(void) static void __exit nft_masq_module_exit(void) { nft_masq_module_exit_ipv6(); + nft_masq_module_exit_inet(); nft_unregister_expr(&nft_masq_ipv4_type); - nf_nat_masquerade_ipv4_unregister_notifier(); + nf_nat_masquerade_inet_unregister_notifiers(); } module_init(nft_masq_module_init); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index e93aed9bda88..d90d421826aa 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -140,7 +140,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); - if (family != ctx->family) + if (ctx->family != NFPROTO_INET && ctx->family != family) return -EOPNOTSUPP; switch (family) { @@ -278,13 +278,67 @@ static struct nft_expr_type nft_nat_type __read_mostly = { .owner = THIS_MODULE, }; +#ifdef CONFIG_NF_TABLES_INET +static void nft_nat_inet_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_nat *priv = nft_expr_priv(expr); + + if (priv->family == nft_pf(pkt)) + nft_nat_eval(expr, regs, pkt); +} + +static const struct nft_expr_ops nft_nat_inet_ops = { + .type = &nft_nat_type, + .size = 
NFT_EXPR_SIZE(sizeof(struct nft_nat)), + .eval = nft_nat_inet_eval, + .init = nft_nat_init, + .destroy = nft_nat_destroy, + .dump = nft_nat_dump, + .validate = nft_nat_validate, +}; + +static struct nft_expr_type nft_inet_nat_type __read_mostly = { + .name = "nat", + .family = NFPROTO_INET, + .ops = &nft_nat_inet_ops, + .policy = nft_nat_policy, + .maxattr = NFTA_NAT_MAX, + .owner = THIS_MODULE, +}; + +static int nft_nat_inet_module_init(void) +{ + return nft_register_expr(&nft_inet_nat_type); +} + +static void nft_nat_inet_module_exit(void) +{ + nft_unregister_expr(&nft_inet_nat_type); +} +#else +static int nft_nat_inet_module_init(void) { return 0; } +static void nft_nat_inet_module_exit(void) { } +#endif + static int __init nft_nat_module_init(void) { - return nft_register_expr(&nft_nat_type); + int ret = nft_nat_inet_module_init(); + + if (ret) + return ret; + + ret = nft_register_expr(&nft_nat_type); + if (ret) + nft_nat_inet_module_exit(); + + return ret; } static void __exit nft_nat_module_exit(void) { + nft_nat_inet_module_exit(); nft_unregister_expr(&nft_nat_type); } diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index b13618c764ec..87b60d6617ef 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -7,11 +7,13 @@ struct nft_osf { enum nft_registers dreg:8; u8 ttl; + u32 flags; }; static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = { [NFTA_OSF_DREG] = { .type = NLA_U32 }, [NFTA_OSF_TTL] = { .type = NLA_U8 }, + [NFTA_OSF_FLAGS] = { .type = NLA_U32 }, }; static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, @@ -20,9 +22,10 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nft_osf *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; struct sk_buff *skb = pkt->skb; + char os_match[NFT_OSF_MAXGENRELEN + 1]; const struct tcphdr *tcp; + struct nf_osf_data data; struct tcphdr _tcph; - const char *os_name; tcp = skb_header_pointer(skb, ip_hdrlen(skb), 
sizeof(struct tcphdr), &_tcph); @@ -35,11 +38,17 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, return; } - os_name = nf_osf_find(skb, nf_osf_fingers, priv->ttl); - if (!os_name) + if (!nf_osf_find(skb, nf_osf_fingers, priv->ttl, &data)) { strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); - else - strncpy((char *)dest, os_name, NFT_OSF_MAXGENRELEN); + } else { + if (priv->flags & NFT_OSF_F_VERSION) + snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s", + data.genre, data.version); + else + strlcpy(os_match, data.genre, NFT_OSF_MAXGENRELEN); + + strncpy((char *)dest, os_match, NFT_OSF_MAXGENRELEN); + } } static int nft_osf_init(const struct nft_ctx *ctx, @@ -47,6 +56,7 @@ static int nft_osf_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_osf *priv = nft_expr_priv(expr); + u32 flags; int err; u8 ttl; @@ -57,6 +67,13 @@ static int nft_osf_init(const struct nft_ctx *ctx, priv->ttl = ttl; } + if (tb[NFTA_OSF_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFTA_OSF_FLAGS])); + if (flags != NFT_OSF_F_VERSION) + return -EINVAL; + priv->flags = flags; + } + priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); err = nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN); @@ -73,6 +90,9 @@ static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl)) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_OSF_FLAGS, ntohl(priv->flags))) + goto nla_put_failure; + if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg)) goto nla_put_failure; diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index a340cd8a751b..da74fdc4a684 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -82,7 +82,7 @@ static int nft_redir_init(const struct nft_ctx *ctx, return nf_ct_netns_get(ctx->net, ctx->family); } -int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_redir_dump(struct sk_buff 
*skb, const struct nft_expr *expr) { const struct nft_redir *priv = nft_expr_priv(expr); @@ -202,6 +202,55 @@ static struct nft_expr_type nft_redir_ipv6_type __read_mostly = { }; #endif +#ifdef CONFIG_NF_TABLES_INET +static void nft_redir_inet_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + return nft_redir_ipv4_eval(expr, regs, pkt); + case NFPROTO_IPV6: + return nft_redir_ipv6_eval(expr, regs, pkt); + } + + WARN_ON_ONCE(1); +} + +static void +nft_redir_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nf_ct_netns_put(ctx->net, NFPROTO_INET); +} + +static struct nft_expr_type nft_redir_inet_type; +static const struct nft_expr_ops nft_redir_inet_ops = { + .type = &nft_redir_inet_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), + .eval = nft_redir_inet_eval, + .init = nft_redir_init, + .destroy = nft_redir_inet_destroy, + .dump = nft_redir_dump, + .validate = nft_redir_validate, +}; + +static struct nft_expr_type nft_redir_inet_type __read_mostly = { + .family = NFPROTO_INET, + .name = "redir", + .ops = &nft_redir_inet_ops, + .policy = nft_redir_policy, + .maxattr = NFTA_MASQ_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_redir_module_init_inet(void) +{ + return nft_register_expr(&nft_redir_inet_type); +} +#else +static inline int nft_redir_module_init_inet(void) { return 0; } +#endif + static int __init nft_redir_module_init(void) { int ret = nft_register_expr(&nft_redir_ipv4_type); @@ -217,6 +266,15 @@ static int __init nft_redir_module_init(void) } #endif + ret = nft_redir_module_init_inet(); + if (ret < 0) { + nft_unregister_expr(&nft_redir_ipv4_type); +#ifdef CONFIG_NF_TABLES_IPV6 + nft_unregister_expr(&nft_redir_ipv6_type); +#endif + return ret; + } + return ret; } @@ -226,6 +284,9 @@ static void __exit nft_redir_module_exit(void) #ifdef CONFIG_NF_TABLES_IPV6 nft_unregister_expr(&nft_redir_ipv6_type); #endif +#ifdef 
CONFIG_NF_TABLES_INET + nft_unregister_expr(&nft_redir_inet_type); +#endif } module_init(nft_redir_module_init); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index e5e5c64df8d1..0a6656ed1534 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -227,7 +227,7 @@ xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision) EXPORT_SYMBOL_GPL(xt_request_find_match); /* Find target, grabs ref. Returns ERR_PTR() on error. */ -struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) +static struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) { struct xt_target *t; int err = -ENOENT; @@ -255,7 +255,6 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) return ERR_PTR(err); } -EXPORT_SYMBOL(xt_find_target); struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision) { diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/netfilter/xt_MASQUERADE.c index fd3f9e8a74da..ece20d832adc 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/netfilter/xt_MASQUERADE.c @@ -9,20 +9,10 @@ * published by the Free Software Foundation. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/types.h> -#include <linux/inetdevice.h> -#include <linux/ip.h> -#include <linux/timer.h> #include <linux/module.h> -#include <linux/netfilter.h> -#include <net/protocol.h> -#include <net/ip.h> -#include <net/checksum.h> -#include <net/route.h> -#include <linux/netfilter_ipv4.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_nat.h> -#include <net/netfilter/ipv4/nf_nat_masquerade.h> +#include <net/netfilter/nf_nat_masquerade.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -64,38 +54,78 @@ static void masquerade_tg_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static struct xt_target masquerade_tg_reg __read_mostly = { - .name = "MASQUERADE", - .family = NFPROTO_IPV4, - .target = masquerade_tg, - .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), - .table = "nat", - .hooks = 1 << NF_INET_POST_ROUTING, - .checkentry = masquerade_tg_check, - .destroy = masquerade_tg_destroy, - .me = THIS_MODULE, +#if IS_ENABLED(CONFIG_IPV6) +static unsigned int +masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + return nf_nat_masquerade_ipv6(skb, par->targinfo, xt_out(par)); +} + +static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) +{ + const struct nf_nat_range2 *range = par->targinfo; + + if (range->flags & NF_NAT_RANGE_MAP_IPS) + return -EINVAL; + + return nf_ct_netns_get(par->net, par->family); +} +#endif + +static struct xt_target masquerade_tg_reg[] __read_mostly = { + { +#if IS_ENABLED(CONFIG_IPV6) + .name = "MASQUERADE", + .family = NFPROTO_IPV6, + .target = masquerade_tg6, + .targetsize = sizeof(struct nf_nat_range), + .table = "nat", + .hooks = 1 << NF_INET_POST_ROUTING, + .checkentry = masquerade_tg6_checkentry, + .destroy = masquerade_tg_destroy, + .me = THIS_MODULE, + }, { +#endif + .name = "MASQUERADE", + .family = NFPROTO_IPV4, + .target = masquerade_tg, + 
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), + .table = "nat", + .hooks = 1 << NF_INET_POST_ROUTING, + .checkentry = masquerade_tg_check, + .destroy = masquerade_tg_destroy, + .me = THIS_MODULE, + } }; static int __init masquerade_tg_init(void) { int ret; - ret = xt_register_target(&masquerade_tg_reg); + ret = xt_register_targets(masquerade_tg_reg, + ARRAY_SIZE(masquerade_tg_reg)); if (ret) return ret; - ret = nf_nat_masquerade_ipv4_register_notifier(); - if (ret) - xt_unregister_target(&masquerade_tg_reg); + ret = nf_nat_masquerade_inet_register_notifiers(); + if (ret) { + xt_unregister_targets(masquerade_tg_reg, + ARRAY_SIZE(masquerade_tg_reg)); + return ret; + } return ret; } static void __exit masquerade_tg_exit(void) { - xt_unregister_target(&masquerade_tg_reg); - nf_nat_masquerade_ipv4_unregister_notifier(); + xt_unregister_targets(masquerade_tg_reg, ARRAY_SIZE(masquerade_tg_reg)); + nf_nat_masquerade_inet_unregister_notifiers(); } module_init(masquerade_tg_init); module_exit(masquerade_tg_exit); +#if IS_ENABLED(CONFIG_IPV6) +MODULE_ALIAS("ip6t_MASQUERADE"); +#endif +MODULE_ALIAS("ipt_MASQUERADE"); diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index c13bcd0ab491..8dbb4d48f2ed 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -163,19 +163,24 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) s64 stamp; /* - * We cannot use get_seconds() instead of __net_timestamp() here. + * We need real time here, but we can neither use skb->tstamp + * nor __net_timestamp(). + * + * skb->tstamp and skb->skb_mstamp_ns overlap, however, they + * use different clock types (real vs monotonic). + * * Suppose you have two rules: - * 1. match before 13:00 - * 2. match after 13:00 + * 1. match before 13:00 + * 2. match after 13:00 + * * If you match against processing time (get_seconds) it * may happen that the same packet matches both rules if - * it arrived at the right moment before 13:00. 
+ * it arrived at the right moment before 13:00, so it would be + * better to check skb->tstamp and set it via __net_timestamp() + * if needed. This however breaks outgoing packets tx timestamp, + * and causes them to get delayed forever by fq packet scheduler. */ - if (skb->tstamp == 0) - __net_timestamp((struct sk_buff *)skb); - - stamp = ktime_to_ns(skb->tstamp); - stamp = div_s64(stamp, NSEC_PER_SEC); + stamp = get_seconds(); if (info->flags & XT_TIME_LOCAL_TZ) /* Adjust for local timezone */ diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f28e937320a3..216ab915dd54 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -988,7 +988,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err = 0; - unsigned long groups = nladdr->nl_groups; + unsigned long groups; bool bound; if (addr_len < sizeof(struct sockaddr_nl)) @@ -996,6 +996,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (nladdr->nl_family != AF_NETLINK) return -EINVAL; + groups = nladdr->nl_groups; /* Only superuser is allowed to listen multicasts */ if (groups) { diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 1d3144d19903..167c09e1ea90 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1199,7 +1199,6 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; void __user *argp = (void __user *)arg; - int ret; switch (cmd) { case TIOCOUTQ: { @@ -1225,18 +1224,6 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return put_user(amount, (int __user *)argp); } - case SIOCGSTAMP: - lock_sock(sk); - ret = sock_get_timestamp(sk, argp); - release_sock(sk); - return ret; - - case SIOCGSTAMPNS: - lock_sock(sk); - ret = sock_get_timestampns(sk, argp); - release_sock(sk); - return ret; - case SIOCGIFADDR: case 
SIOCSIFADDR: case SIOCGIFDSTADDR: @@ -1362,6 +1349,7 @@ static const struct proto_ops nr_proto_ops = { .getname = nr_getname, .poll = datagram_poll, .ioctl = nr_ioctl, + .gettstamp = sock_gettstamp, .listen = nr_listen, .shutdown = sock_no_shutdown, .setsockopt = nr_setsockopt, @@ -1392,18 +1380,22 @@ static int __init nr_proto_init(void) int i; int rc = proto_register(&nr_proto, 0); - if (rc != 0) - goto out; + if (rc) + return rc; if (nr_ndevs > 0x7fffffff/sizeof(struct net_device *)) { - printk(KERN_ERR "NET/ROM: nr_proto_init - nr_ndevs parameter to large\n"); - return -1; + pr_err("NET/ROM: %s - nr_ndevs parameter too large\n", + __func__); + rc = -EINVAL; + goto unregister_proto; } dev_nr = kcalloc(nr_ndevs, sizeof(struct net_device *), GFP_KERNEL); - if (dev_nr == NULL) { - printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device array\n"); - return -1; + if (!dev_nr) { + pr_err("NET/ROM: %s - unable to allocate device array\n", + __func__); + rc = -ENOMEM; + goto unregister_proto; } for (i = 0; i < nr_ndevs; i++) { @@ -1413,13 +1405,13 @@ static int __init nr_proto_init(void) sprintf(name, "nr%d", i); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, nr_setup); if (!dev) { - printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device structure\n"); + rc = -ENOMEM; goto fail; } dev->base_addr = i; - if (register_netdev(dev)) { - printk(KERN_ERR "NET/ROM: nr_proto_init - unable to register network device\n"); + rc = register_netdev(dev); + if (rc) { free_netdev(dev); goto fail; } @@ -1427,36 +1419,64 @@ static int __init nr_proto_init(void) dev_nr[i] = dev; } - if (sock_register(&nr_family_ops)) { - printk(KERN_ERR "NET/ROM: nr_proto_init - unable to register socket family\n"); + rc = sock_register(&nr_family_ops); + if (rc) goto fail; - } - register_netdevice_notifier(&nr_dev_notifier); + rc = register_netdevice_notifier(&nr_dev_notifier); + if (rc) + goto out_sock; ax25_register_pid(&nr_pid); ax25_linkfail_register(&nr_linkfail_notifier); 
#ifdef CONFIG_SYSCTL - nr_register_sysctl(); + rc = nr_register_sysctl(); + if (rc) + goto out_sysctl; #endif nr_loopback_init(); - proc_create_seq("nr", 0444, init_net.proc_net, &nr_info_seqops); - proc_create_seq("nr_neigh", 0444, init_net.proc_net, &nr_neigh_seqops); - proc_create_seq("nr_nodes", 0444, init_net.proc_net, &nr_node_seqops); -out: - return rc; + rc = -ENOMEM; + if (!proc_create_seq("nr", 0444, init_net.proc_net, &nr_info_seqops)) + goto proc_remove1; + if (!proc_create_seq("nr_neigh", 0444, init_net.proc_net, + &nr_neigh_seqops)) + goto proc_remove2; + if (!proc_create_seq("nr_nodes", 0444, init_net.proc_net, + &nr_node_seqops)) + goto proc_remove3; + + return 0; + +proc_remove3: + remove_proc_entry("nr_neigh", init_net.proc_net); +proc_remove2: + remove_proc_entry("nr", init_net.proc_net); +proc_remove1: + + nr_loopback_clear(); + nr_rt_free(); + +#ifdef CONFIG_SYSCTL + nr_unregister_sysctl(); +out_sysctl: +#endif + ax25_linkfail_release(&nr_linkfail_notifier); + ax25_protocol_release(AX25_P_NETROM); + unregister_netdevice_notifier(&nr_dev_notifier); +out_sock: + sock_unregister(PF_NETROM); fail: while (--i >= 0) { unregister_netdev(dev_nr[i]); free_netdev(dev_nr[i]); } kfree(dev_nr); +unregister_proto: proto_unregister(&nr_proto); - rc = -1; - goto out; + return rc; } module_init(nr_proto_init); diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c index 215ad22a9647..93d13f019981 100644 --- a/net/netrom/nr_loopback.c +++ b/net/netrom/nr_loopback.c @@ -70,7 +70,7 @@ static void nr_loopback_timer(struct timer_list *unused) } } -void __exit nr_loopback_clear(void) +void nr_loopback_clear(void) { del_timer_sync(&loopback_timer); skb_queue_purge(&loopback_queue); diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 6485f593e2f0..b76aa668a94b 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -953,7 +953,7 @@ const struct seq_operations nr_neigh_seqops = { /* * Free all memory associated with the nodes and routes 
lists. */ -void __exit nr_rt_free(void) +void nr_rt_free(void) { struct nr_neigh *s = NULL; struct nr_node *t = NULL; diff --git a/net/netrom/sysctl_net_netrom.c b/net/netrom/sysctl_net_netrom.c index ba1c368b3f18..771011b84270 100644 --- a/net/netrom/sysctl_net_netrom.c +++ b/net/netrom/sysctl_net_netrom.c @@ -146,9 +146,12 @@ static struct ctl_table nr_table[] = { { } }; -void __init nr_register_sysctl(void) +int __init nr_register_sysctl(void) { nr_table_header = register_net_sysctl(&init_net, "net/netrom", nr_table); + if (!nr_table_header) + return -ENOMEM; + return 0; } void nr_unregister_sysctl(void) diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index ddfc52ac1f9b..c0d323b58e73 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -312,6 +312,10 @@ static void nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe, create_info = (struct nci_hci_create_pipe_resp *)skb->data; dest_gate = create_info->dest_gate; new_pipe = create_info->pipe; + if (new_pipe >= NCI_HCI_MAX_PIPES) { + status = NCI_HCI_ANY_E_NOK; + goto exit; + } /* Save the new created pipe and bind with local gate, * the description for skb->data[3] is destination gate id @@ -336,6 +340,10 @@ static void nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe, goto exit; } delete_info = (struct nci_hci_delete_pipe_noti *)skb->data; + if (delete_info->pipe >= NCI_HCI_MAX_PIPES) { + status = NCI_HCI_ANY_E_NOK; + goto exit; + } ndev->hci_dev->pipes[delete_info->pipe].gate = NCI_HCI_INVALID_GATE; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 0be3ab5bde26..626629944450 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -29,7 +29,7 @@ #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <net/ipv6_frag.h> -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_nat.h> #endif @@ -75,7 +75,7 @@ struct ovs_conntrack_info { struct md_mark mark; struct md_labels labels; char timeout[CTNL_TIMEOUT_NAME_MAX]; -#ifdef 
CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */ #endif }; @@ -721,7 +721,7 @@ static bool skb_nfct_cached(struct net *net, return ct_executed; } -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) /* Modelled after nf_nat_ipv[46]_fn(). * range is only used for new, uninitialized NAT state. * Returns either NF_ACCEPT or NF_DROP. @@ -903,7 +903,7 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, return err; } -#else /* !CONFIG_NF_NAT_NEEDED */ +#else /* !CONFIG_NF_NAT */ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb, struct nf_conn *ct, @@ -1330,7 +1330,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, return 0; } -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) static int parse_nat(const struct nlattr *attr, struct ovs_conntrack_info *info, bool log) { @@ -1467,7 +1467,7 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { .maxlen = sizeof(struct md_labels) }, [OVS_CT_ATTR_HELPER] = { .minlen = 1, .maxlen = NF_CT_HELPER_NAME_LEN }, -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) /* NAT length is checked when parsing the nested attributes. 
*/ [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX }, #endif @@ -1547,7 +1547,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, return -EINVAL; } break; -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) case OVS_CT_ATTR_NAT: { int err = parse_nat(a, info, log); @@ -1677,7 +1677,7 @@ err_free_ct: return err; } -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info, struct sk_buff *skb) { @@ -1783,7 +1783,7 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, return -EMSGSIZE; } -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb)) return -EMSGSIZE; #endif diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 08fe8b79c0bf..5c4a118d6f96 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4075,11 +4075,6 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } - case SIOCGSTAMP: - return sock_get_timestamp(sk, (struct timeval __user *)arg); - case SIOCGSTAMPNS: - return sock_get_timestampns(sk, (struct timespec __user *)arg); - #ifdef CONFIG_INET case SIOCADDRT: case SIOCDELRT: @@ -4455,6 +4450,7 @@ static const struct proto_ops packet_ops_spkt = { .getname = packet_getname_spkt, .poll = datagram_poll, .ioctl = packet_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, @@ -4476,6 +4472,7 @@ static const struct proto_ops packet_ops = { .getname = packet_getname, .poll = packet_poll, .ioctl = packet_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index b37e6e0a1026..7c5e8292cc0a 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -968,9 +968,6 
@@ static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; } break; - case SIOCGSTAMP: - rc = sock_get_timestamp(sk, argp); - break; case SIOCADDRT: case SIOCDELRT: case SIOCSIFADDR: @@ -1033,6 +1030,7 @@ static const struct proto_ops qrtr_proto_ops = { .recvmsg = qrtr_recvmsg, .getname = qrtr_getname, .ioctl = qrtr_ioctl, + .gettstamp = sock_gettstamp, .poll = datagram_poll, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index d6cc97fbbbb0..2b969f99ef13 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -543,6 +543,9 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, struct rds_sock *rs = rds_sk_to_rs(sk); int ret = 0; + if (addr_len < offsetofend(struct sockaddr, sa_family)) + return -EINVAL; + lock_sock(sk); switch (uaddr->sa_family) { diff --git a/net/rds/bind.c b/net/rds/bind.c index 17c9d9f0c848..0f4398e7f2a7 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -173,6 +173,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* We allow an RDS socket to be bound to either IPv4 or IPv6 * address. 
*/ + if (addr_len < offsetofend(struct sockaddr, sa_family)) + return -EINVAL; if (uaddr->sa_family == AF_INET) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c index 31cf37da4510..93c0437e6a5f 100644 --- a/net/rds/ib_fmr.c +++ b/net/rds/ib_fmr.c @@ -44,6 +44,17 @@ struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages) else pool = rds_ibdev->mr_1m_pool; + if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); + + /* Switch pools if one of the pool is reaching upper limit */ + if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) { + if (pool->pool_type == RDS_IB_MR_8K_POOL) + pool = rds_ibdev->mr_1m_pool; + else + pool = rds_ibdev->mr_8k_pool; + } + ibmr = rds_ib_try_reuse_ibmr(pool); if (ibmr) return ibmr; diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 63c8d107adcf..d664e9ade74d 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -454,9 +454,6 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) struct rds_ib_mr *ibmr = NULL; int iter = 0; - if (atomic_read(&pool->dirty_count) >= pool->max_items_soft / 10) - queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); - while (1) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index c96f63ffe31e..e274bc6e1458 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1301,12 +1301,6 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return put_user(amount, (unsigned int __user *) argp); } - case SIOCGSTAMP: - return sock_get_timestamp(sk, (struct timeval __user *) argp); - - case SIOCGSTAMPNS: - return sock_get_timestampns(sk, (struct timespec __user *) argp); - case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: @@ -1474,6 +1468,7 @@ static const struct proto_ops rose_proto_ops = { .getname = rose_getname, .poll = datagram_poll, .ioctl = 
rose_ioctl, + .gettstamp = sock_gettstamp, .listen = rose_listen, .shutdown = sock_no_shutdown, .setsockopt = rose_setsockopt, diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index 7af4f99c4a93..094a6621f8e8 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -16,6 +16,7 @@ #include <linux/init.h> static struct sk_buff_head loopback_queue; +#define ROSE_LOOPBACK_LIMIT 1000 static struct timer_list loopback_timer; static void rose_set_loopback_timer(void); @@ -35,29 +36,27 @@ static int rose_loopback_running(void) int rose_loopback_queue(struct sk_buff *skb, struct rose_neigh *neigh) { - struct sk_buff *skbn; + struct sk_buff *skbn = NULL; - skbn = skb_clone(skb, GFP_ATOMIC); + if (skb_queue_len(&loopback_queue) < ROSE_LOOPBACK_LIMIT) + skbn = skb_clone(skb, GFP_ATOMIC); - kfree_skb(skb); - - if (skbn != NULL) { + if (skbn) { + consume_skb(skb); skb_queue_tail(&loopback_queue, skbn); if (!rose_loopback_running()) rose_set_loopback_timer(); + } else { + kfree_skb(skb); } return 1; } - static void rose_set_loopback_timer(void) { - del_timer(&loopback_timer); - - loopback_timer.expires = jiffies + 10; - add_timer(&loopback_timer); + mod_timer(&loopback_timer, jiffies + 10); } static void rose_loopback_timer(struct timer_list *unused) @@ -68,8 +67,12 @@ static void rose_loopback_timer(struct timer_list *unused) struct sock *sk; unsigned short frametype; unsigned int lci_i, lci_o; + int count; - while ((skb = skb_dequeue(&loopback_queue)) != NULL) { + for (count = 0; count < ROSE_LOOPBACK_LIMIT; count++) { + skb = skb_dequeue(&loopback_queue); + if (!skb) + return; if (skb->len < ROSE_MIN_LEN) { kfree_skb(skb); continue; @@ -106,6 +109,8 @@ static void rose_loopback_timer(struct timer_list *unused) kfree_skb(skb); } } + if (!skb_queue_empty(&loopback_queue)) + mod_timer(&loopback_timer, jiffies + 1); } void __exit rose_loopback_clear(void) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 96f2952bbdfd..ae8c5d7f3bf1 100644 
--- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -135,7 +135,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr; struct rxrpc_local *local; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - u16 service_id = srx->srx_service; + u16 service_id; int ret; _enter("%p,%p,%d", rx, saddr, len); @@ -143,6 +143,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) ret = rxrpc_validate_address(rx, srx, len); if (ret < 0) goto error; + service_id = srx->srx_service; lock_sock(&rx->sk); @@ -370,18 +371,22 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); * rxrpc_kernel_check_life - Check to see whether a call is still alive * @sock: The socket the call is on * @call: The call to check + * @_life: Where to store the life value * * Allow a kernel service to find out whether a call is still alive - ie. we're - * getting ACKs from the server. Returns a number representing the life state - * which can be compared to that returned by a previous call. + * getting ACKs from the server. Passes back in *_life a number representing + * the life state which can be compared to that returned by a previous call and + * return true if the call is still alive. * * If the life state stalls, rxrpc_kernel_probe_life() should be called and * then 2RTT waited. 
*/ -u32 rxrpc_kernel_check_life(const struct socket *sock, - const struct rxrpc_call *call) +bool rxrpc_kernel_check_life(const struct socket *sock, + const struct rxrpc_call *call, + u32 *_life) { - return call->acks_latest; + *_life = call->acks_latest; + return call->state != RXRPC_CALL_COMPLETE; } EXPORT_SYMBOL(rxrpc_kernel_check_life); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 4b1a534d290a..062ca9dc29b8 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -654,6 +654,7 @@ struct rxrpc_call { u8 ackr_reason; /* reason to ACK */ u16 ackr_skew; /* skew on packet being ACK'd */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ + rxrpc_serial_t ackr_first_seq; /* first sequence number received */ rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ rxrpc_seq_t ackr_consumed; /* Highest packet shown consumed */ rxrpc_seq_t ackr_seen; /* Highest packet shown seen */ diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index b6fca8ebb117..8d31fb4c51e1 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -153,7 +153,8 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, * pass a connection-level abort onto all calls on that connection */ static void rxrpc_abort_calls(struct rxrpc_connection *conn, - enum rxrpc_call_completion compl) + enum rxrpc_call_completion compl, + rxrpc_serial_t serial) { struct rxrpc_call *call; int i; @@ -173,6 +174,9 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, call->call_id, 0, conn->abort_code, conn->error); + else + trace_rxrpc_rx_abort(call, serial, + conn->abort_code); if (rxrpc_set_call_completion(call, compl, conn->abort_code, conn->error)) @@ -213,8 +217,6 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, conn->state = RXRPC_CONN_LOCALLY_ABORTED; spin_unlock_bh(&conn->state_lock); - rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED); - msg.msg_name = &conn->params.peer->srx.transport; 
msg.msg_namelen = conn->params.peer->srx.transport_len; msg.msg_control = NULL; @@ -242,6 +244,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, len = iov[0].iov_len + iov[1].iov_len; serial = atomic_inc_return(&conn->serial); + rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, serial); whdr.serial = htonl(serial); _proto("Tx CONN ABORT %%%u { %d }", serial, conn->abort_code); @@ -321,7 +324,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, conn->error = -ECONNABORTED; conn->abort_code = abort_code; conn->state = RXRPC_CONN_REMOTELY_ABORTED; - rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED); + rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial); return -ECONNABORTED; case RXRPC_PACKET_TYPE_CHALLENGE: diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 9128aa0e40aa..c2c35cf4e308 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -837,7 +837,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, u8 acks[RXRPC_MAXACKS]; } buf; rxrpc_serial_t acked_serial; - rxrpc_seq_t first_soft_ack, hard_ack; + rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt; int nr_acks, offset, ioffset; _enter(""); @@ -851,13 +851,14 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, acked_serial = ntohl(buf.ack.serial); first_soft_ack = ntohl(buf.ack.firstPacket); + prev_pkt = ntohl(buf.ack.previousPacket); hard_ack = first_soft_ack - 1; nr_acks = buf.ack.nAcks; summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? buf.ack.reason : RXRPC_ACK__INVALID); trace_rxrpc_rx_ack(call, sp->hdr.serial, acked_serial, - first_soft_ack, ntohl(buf.ack.previousPacket), + first_soft_ack, prev_pkt, summary.ack_reason, nr_acks); if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) @@ -878,8 +879,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_propose_ack_respond_to_ack); } - /* Discard any out-of-order or duplicate ACKs. 
*/ - if (before_eq(sp->hdr.serial, call->acks_latest)) + /* Discard any out-of-order or duplicate ACKs (outside lock). */ + if (before(first_soft_ack, call->ackr_first_seq) || + before(prev_pkt, call->ackr_prev_seq)) return; buf.info.rxMTU = 0; @@ -890,12 +892,16 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, spin_lock(&call->input_lock); - /* Discard any out-of-order or duplicate ACKs. */ - if (before_eq(sp->hdr.serial, call->acks_latest)) + /* Discard any out-of-order or duplicate ACKs (inside lock). */ + if (before(first_soft_ack, call->ackr_first_seq) || + before(prev_pkt, call->ackr_prev_seq)) goto out; call->acks_latest_ts = skb->tstamp; call->acks_latest = sp->hdr.serial; + call->ackr_first_seq = first_soft_ack; + call->ackr_prev_seq = prev_pkt; + /* Parse rwind and mtu sizes if provided. */ if (buf.info.rxMTU) rxrpc_input_ackinfo(call, skb, &buf.info); @@ -1155,19 +1161,19 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) * handle data received on the local endpoint * - may be called in interrupt context * - * The socket is locked by the caller and this prevents the socket from being - * shut down and the local endpoint from going away, thus sk_user_data will not - * be cleared until this function returns. + * [!] Note that as this is called from the encap_rcv hook, the socket is not + * held locked by the caller and nothing prevents sk_user_data on the UDP from + * being cleared in the middle of processing this function. * * Called with the RCU read lock held from the IP layer via UDP. 
*/ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) { + struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk); struct rxrpc_connection *conn; struct rxrpc_channel *chan; struct rxrpc_call *call = NULL; struct rxrpc_skb_priv *sp; - struct rxrpc_local *local = udp_sk->sk_user_data; struct rxrpc_peer *peer = NULL; struct rxrpc_sock *rx = NULL; unsigned int channel; @@ -1175,6 +1181,10 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) _enter("%p", udp_sk); + if (unlikely(!local)) { + kfree_skb(skb); + return 0; + } if (skb->tstamp == 0) skb->tstamp = ktime_get_real(); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 9157fd00dce3..b67dec945498 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -304,7 +304,8 @@ nomem: ret = -ENOMEM; sock_error: mutex_unlock(&rxnet->local_mutex); - kfree(local); + if (local) + call_rcu(&local->rcu, rxrpc_local_rcu); _leave(" = %d", ret); return ERR_PTR(ret); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index bc05af89fc38..6e84d878053c 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -157,6 +157,11 @@ void rxrpc_error_report(struct sock *sk) _enter("%p{%d}", sk, local->debug_id); + /* Clear the outstanding error value on the socket so that it doesn't + * cause kernel_sendmsg() to return it later. + */ + sock_error(sk); + skb = sock_dequeue_err_skb(sk); if (!skb) { _leave("UDP socket errqueue empty"); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 46c9312085b1..bec64deb7b0a 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -152,12 +152,13 @@ static void rxrpc_notify_end_tx(struct rxrpc_sock *rx, struct rxrpc_call *call, } /* - * Queue a DATA packet for transmission, set the resend timeout and send the - * packet immediately + * Queue a DATA packet for transmission, set the resend timeout and send + * the packet immediately. 
Returns the error from rxrpc_send_data_packet() + * in case the caller wants to do something with it. */ -static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, - struct sk_buff *skb, bool last, - rxrpc_notify_end_tx_t notify_end_tx) +static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, + struct sk_buff *skb, bool last, + rxrpc_notify_end_tx_t notify_end_tx) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned long now; @@ -250,7 +251,8 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, out: rxrpc_free_skb(skb, rxrpc_skb_tx_freed); - _leave(""); + _leave(" = %d", ret); + return ret; } /* @@ -423,9 +425,10 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, if (ret < 0) goto out; - rxrpc_queue_packet(rx, call, skb, - !msg_data_left(msg) && !more, - notify_end_tx); + ret = rxrpc_queue_packet(rx, call, skb, + !msg_data_left(msg) && !more, + notify_end_tx); + /* Should check for failure here */ skb = NULL; } } while (msg_data_left(msg) > 0); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 2763176e369c..0d8968803e98 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -90,6 +90,7 @@ struct cls_fl_head { struct rhashtable ht; spinlock_t masks_lock; /* Protect masks list */ struct list_head masks; + struct list_head hw_filters; struct rcu_work rwork; struct idr handle_idr; }; @@ -102,6 +103,7 @@ struct cls_fl_filter { struct tcf_result res; struct fl_flow_key key; struct list_head list; + struct list_head hw_list; u32 handle; u32 flags; u32 in_hw_count; @@ -315,6 +317,7 @@ static int fl_init(struct tcf_proto *tp) spin_lock_init(&head->masks_lock); INIT_LIST_HEAD_RCU(&head->masks); + INIT_LIST_HEAD(&head->hw_filters); rcu_assign_pointer(tp->root, head); idr_init(&head->handle_idr); @@ -336,8 +339,7 @@ static void fl_mask_free_work(struct work_struct *work) fl_mask_free(mask); } -static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask, - 
bool async) +static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask) { if (!refcount_dec_and_test(&mask->refcnt)) return false; @@ -348,14 +350,21 @@ static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask, list_del_rcu(&mask->list); spin_unlock(&head->masks_lock); - if (async) - tcf_queue_work(&mask->rwork, fl_mask_free_work); - else - fl_mask_free(mask); + tcf_queue_work(&mask->rwork, fl_mask_free_work); return true; } +static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp) +{ + /* Flower classifier only changes root pointer during init and destroy. + * Users must obtain reference to tcf_proto instance before calling its + * API, so tp->root pointer is protected from concurrent call to + * fl_destroy() by reference counting. + */ + return rcu_dereference_raw(tp->root); +} + static void __fl_destroy_filter(struct cls_fl_filter *f) { tcf_exts_destroy(&f->exts); @@ -386,6 +395,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f, tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); spin_lock(&tp->lock); + list_del_init(&f->hw_list); tcf_block_offload_dec(block, &f->flags); spin_unlock(&tp->lock); @@ -397,6 +407,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool rtnl_held, struct netlink_ext_ack *extack) { + struct cls_fl_head *head = fl_head_dereference(tp); struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = tp->chain->block; bool skip_sw = tc_skip_sw(f->flags); @@ -448,6 +459,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, goto errout; } + spin_lock(&tp->lock); + list_add(&f->hw_list, &head->hw_filters); + spin_unlock(&tp->lock); errout: if (!rtnl_held) rtnl_unlock(); @@ -479,23 +493,11 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f, rtnl_unlock(); } -static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp) -{ - /* Flower classifier only changes root 
pointer during init and destroy. - * Users must obtain reference to tcf_proto instance before calling its - * API, so tp->root pointer is protected from concurrent call to - * fl_destroy() by reference counting. - */ - return rcu_dereference_raw(tp->root); -} - static void __fl_put(struct cls_fl_filter *f) { if (!refcount_dec_and_test(&f->refcnt)) return; - WARN_ON(!f->deleted); - if (tcf_exts_get_net(&f->exts)) tcf_queue_work(&f->rwork, fl_destroy_filter_work); else @@ -538,7 +540,6 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, struct netlink_ext_ack *extack) { struct cls_fl_head *head = fl_head_dereference(tp); - bool async = tcf_exts_get_net(&f->exts); *last = false; @@ -555,7 +556,7 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, list_del_rcu(&f->list); spin_unlock(&tp->lock); - *last = fl_mask_put(head, f->mask, async); + *last = fl_mask_put(head, f->mask); if (!tc_skip_hw(f->flags)) fl_hw_destroy_filter(tp, f, rtnl_held, extack); tcf_unbind_filter(tp, &f->res); @@ -1466,9 +1467,9 @@ static int fl_ht_insert_unique(struct cls_fl_filter *fnew, struct fl_flow_mask *mask = fnew->mask; int err; - err = rhashtable_insert_fast(&mask->ht, - &fnew->ht_node, - mask->filter_ht_params); + err = rhashtable_lookup_insert_fast(&mask->ht, + &fnew->ht_node, + mask->filter_ht_params); if (err) { *in_ht = false; /* It is okay if filter with same key exists when @@ -1527,6 +1528,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, err = -ENOBUFS; goto errout_tb; } + INIT_LIST_HEAD(&fnew->hw_list); refcount_set(&fnew->refcnt, 1); err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0); @@ -1574,7 +1576,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, goto errout_hw; } - refcount_inc(&fnew->refcnt); if (fold) { /* Fold filter was deleted concurrently. Retry lookup. 
*/ if (fold->deleted) { @@ -1596,6 +1597,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, in_ht = true; } + refcount_inc(&fnew->refcnt); rhashtable_remove_fast(&fold->mask->ht, &fold->ht_node, fold->mask->filter_ht_params); @@ -1605,11 +1607,10 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, spin_unlock(&tp->lock); - fl_mask_put(head, fold->mask, true); + fl_mask_put(head, fold->mask); if (!tc_skip_hw(fold->flags)) fl_hw_destroy_filter(tp, fold, rtnl_held, NULL); tcf_unbind_filter(tp, &fold->res); - tcf_exts_get_net(&fold->exts); /* Caller holds reference to fold, so refcnt is always > 0 * after this. */ @@ -1637,6 +1638,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (err) goto errout_hw; + refcount_inc(&fnew->refcnt); fnew->handle = handle; list_add_tail_rcu(&fnew->list, &fnew->mask->filters); spin_unlock(&tp->lock); @@ -1648,18 +1650,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, kfree(mask); return 0; +errout_ht: + spin_lock(&tp->lock); errout_hw: + fnew->deleted = true; spin_unlock(&tp->lock); if (!tc_skip_hw(fnew->flags)) fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL); -errout_ht: if (in_ht) rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node, fnew->mask->filter_ht_params); errout_mask: - fl_mask_put(head, fnew->mask, true); + fl_mask_put(head, fnew->mask); errout: - tcf_queue_work(&fnew->rwork, fl_destroy_filter_work); + __fl_put(fnew); errout_tb: kfree(tb); errout_mask_alloc: @@ -1704,19 +1708,46 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg, } } +static struct cls_fl_filter * +fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add) +{ + struct cls_fl_head *head = fl_head_dereference(tp); + + spin_lock(&tp->lock); + if (list_empty(&head->hw_filters)) { + spin_unlock(&tp->lock); + return NULL; + } + + if (!f) + f = list_entry(&head->hw_filters, struct cls_fl_filter, + hw_list); + list_for_each_entry_continue(f, &head->hw_filters, 
hw_list) { + if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) { + spin_unlock(&tp->lock); + return f; + } + } + + spin_unlock(&tp->lock); + return NULL; +} + static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = tp->chain->block; - unsigned long handle = 0; - struct cls_fl_filter *f; + struct cls_fl_filter *f = NULL; int err; - while ((f = fl_get_next_filter(tp, &handle))) { - if (tc_skip_hw(f->flags)) - goto next_flow; + /* hw_filters list can only be changed by hw offload functions after + * obtaining rtnl lock. Make sure it is not changed while reoffload is + * iterating it. + */ + ASSERT_RTNL(); + while ((f = fl_get_next_hw_filter(tp, f, add))) { cls_flower.rule = flow_rule_alloc(tcf_exts_num_actions(&f->exts)); if (!cls_flower.rule) { @@ -1762,7 +1793,6 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, add); spin_unlock(&tp->lock); next_flow: - handle++; __fl_put(f); } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index fb8f138b9776..c126b9f78d6e 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -998,6 +998,19 @@ static void notify_and_destroy(struct net *net, struct sk_buff *skb, qdisc_put(old); } +static void qdisc_clear_nolock(struct Qdisc *sch) +{ + sch->flags &= ~TCQ_F_NOLOCK; + if (!(sch->flags & TCQ_F_CPUSTATS)) + return; + + free_percpu(sch->cpu_bstats); + free_percpu(sch->cpu_qstats); + sch->cpu_bstats = NULL; + sch->cpu_qstats = NULL; + sch->flags &= ~TCQ_F_CPUSTATS; +} + /* Graft qdisc "new" to class "classid" of qdisc "parent" or * to device "dev". 
* @@ -1076,7 +1089,7 @@ skip: /* Only support running class lockless if parent is lockless */ if (new && (new->flags & TCQ_F_NOLOCK) && parent && !(parent->flags & TCQ_F_NOLOCK)) - new->flags &= ~TCQ_F_NOLOCK; + qdisc_clear_nolock(new); if (!cops || !cops->graft) return -EOPNOTSUPP; diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index c6a502933fe7..f68fd7a0e038 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -61,16 +61,20 @@ #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> +#include <net/netevent.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> +static LIST_HEAD(cbs_list); +static DEFINE_SPINLOCK(cbs_list_lock); + #define BYTES_PER_KBIT (1000LL / 8) struct cbs_sched_data { bool offload; int queue; - s64 port_rate; /* in bytes/s */ + atomic64_t port_rate; /* in bytes/s */ s64 last; /* timestamp in ns */ s64 credits; /* in bytes */ s32 locredit; /* in bytes */ @@ -82,6 +86,7 @@ struct cbs_sched_data { struct sk_buff **to_free); struct sk_buff *(*dequeue)(struct Qdisc *sch); struct Qdisc *qdisc; + struct list_head cbs_list; }; static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -181,6 +186,11 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) s64 credits; int len; + if (atomic64_read(&q->port_rate) == -1) { + WARN_ONCE(1, "cbs: dequeue() called with unknown port rate."); + return NULL; + } + if (q->credits < 0) { credits = timediff_to_credits(now - q->last, q->idleslope); @@ -207,7 +217,8 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) /* As sendslope is a negative number, this will decrease the * amount of q->credits. 
*/ - credits = credits_from_len(len, q->sendslope, q->port_rate); + credits = credits_from_len(len, q->sendslope, + atomic64_read(&q->port_rate)); credits += q->credits; q->credits = max_t(s64, credits, q->locredit); @@ -294,6 +305,50 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, return 0; } +static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q) +{ + struct ethtool_link_ksettings ecmd; + int port_rate = -1; + + if (!__ethtool_get_link_ksettings(dev, &ecmd) && + ecmd.base.speed != SPEED_UNKNOWN) + port_rate = ecmd.base.speed * 1000 * BYTES_PER_KBIT; + + atomic64_set(&q->port_rate, port_rate); + netdev_dbg(dev, "cbs: set %s's port_rate to: %lld, linkspeed: %d\n", + dev->name, (long long)atomic64_read(&q->port_rate), + ecmd.base.speed); +} + +static int cbs_dev_notifier(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct cbs_sched_data *q; + struct net_device *qdev; + bool found = false; + + ASSERT_RTNL(); + + if (event != NETDEV_UP && event != NETDEV_CHANGE) + return NOTIFY_DONE; + + spin_lock(&cbs_list_lock); + list_for_each_entry(q, &cbs_list, cbs_list) { + qdev = qdisc_dev(q->qdisc); + if (qdev == dev) { + found = true; + break; + } + } + spin_unlock(&cbs_list_lock); + + if (found) + cbs_set_port_rate(dev, q); + + return NOTIFY_DONE; +} + static int cbs_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -315,16 +370,7 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt, qopt = nla_data(tb[TCA_CBS_PARMS]); if (!qopt->offload) { - struct ethtool_link_ksettings ecmd; - s64 link_speed; - - if (!__ethtool_get_link_ksettings(dev, &ecmd)) - link_speed = ecmd.base.speed; - else - link_speed = SPEED_1000; - - q->port_rate = link_speed * 1000 * BYTES_PER_KBIT; - + cbs_set_port_rate(dev, q); cbs_disable_offload(dev, q); } else { err = cbs_enable_offload(dev, q, qopt, extack); @@ -347,6 
+393,7 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + int err; if (!opt) { NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory"); @@ -367,7 +414,17 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, qdisc_watchdog_init(&q->watchdog, sch); - return cbs_change(sch, opt, extack); + err = cbs_change(sch, opt, extack); + if (err) + return err; + + if (!q->offload) { + spin_lock(&cbs_list_lock); + list_add(&q->cbs_list, &cbs_list); + spin_unlock(&cbs_list_lock); + } + + return 0; } static void cbs_destroy(struct Qdisc *sch) @@ -375,8 +432,11 @@ static void cbs_destroy(struct Qdisc *sch) struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - qdisc_watchdog_cancel(&q->watchdog); + spin_lock(&cbs_list_lock); + list_del(&q->cbs_list); + spin_unlock(&cbs_list_lock); + qdisc_watchdog_cancel(&q->watchdog); cbs_disable_offload(dev, q); if (q->qdisc) @@ -487,14 +547,24 @@ static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { .owner = THIS_MODULE, }; +static struct notifier_block cbs_device_notifier = { + .notifier_call = cbs_dev_notifier, +}; + static int __init cbs_module_init(void) { + int err = register_netdevice_notifier(&cbs_device_notifier); + + if (err) + return err; + return register_qdisc(&cbs_qdisc_ops); } static void __exit cbs_module_exit(void) { unregister_qdisc(&cbs_qdisc_ops); + unregister_netdevice_notifier(&cbs_device_notifier); } module_init(cbs_module_init) module_exit(cbs_module_exit) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 81356ef38d1d..848aab3693bd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -68,7 +68,7 @@ static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) skb = __skb_dequeue(&q->skb_bad_txq); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); - qdisc_qstats_atomic_qlen_dec(q); + 
qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; @@ -108,7 +108,7 @@ static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_inc(q, skb); - qdisc_qstats_atomic_qlen_inc(q); + qdisc_qstats_cpu_qlen_inc(q); } else { qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; @@ -118,52 +118,36 @@ static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, spin_unlock(lock); } -static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { - while (skb) { - struct sk_buff *next = skb->next; - - __skb_queue_tail(&q->gso_skb, skb); - q->qstats.requeues++; - qdisc_qstats_backlog_inc(q, skb); - q->q.qlen++; /* it's still part of the queue */ + spinlock_t *lock = NULL; - skb = next; + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); } - __netif_schedule(q); - - return 0; -} -static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q) -{ - spinlock_t *lock = qdisc_lock(q); - - spin_lock(lock); while (skb) { struct sk_buff *next = skb->next; __skb_queue_tail(&q->gso_skb, skb); - qdisc_qstats_cpu_requeues_inc(q); - qdisc_qstats_cpu_backlog_inc(q, skb); - qdisc_qstats_atomic_qlen_inc(q); + /* it's still part of the queue */ + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_requeues_inc(q); + qdisc_qstats_cpu_backlog_inc(q, skb); + qdisc_qstats_cpu_qlen_inc(q); + } else { + q->qstats.requeues++; + qdisc_qstats_backlog_inc(q, skb); + q->q.qlen++; + } skb = next; } - spin_unlock(lock); - + if (lock) + spin_unlock(lock); __netif_schedule(q); - - return 0; -} - -static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) -{ - if (q->flags & TCQ_F_NOLOCK) - return dev_requeue_skb_locked(skb, q); - else - return __dev_requeue_skb(skb, q); } static void try_bulk_dequeue_skb(struct Qdisc *q, @@ -252,7 +236,7 @@ static struct sk_buff *dequeue_skb(struct 
Qdisc *q, bool *validate, skb = __skb_dequeue(&q->gso_skb); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); - qdisc_qstats_atomic_qlen_dec(q); + qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; @@ -645,11 +629,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, if (unlikely(err)) return qdisc_drop_cpu(skb, qdisc, to_free); - qdisc_qstats_atomic_qlen_inc(qdisc); - /* Note: skb can not be used after skb_array_produce(), - * so we better not use qdisc_qstats_cpu_backlog_inc() - */ - this_cpu_add(qdisc->cpu_qstats->backlog, pkt_len); + qdisc_update_stats_at_enqueue(qdisc, pkt_len); return NET_XMIT_SUCCESS; } @@ -668,9 +648,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) skb = __skb_array_consume(q); } if (likely(skb)) { - qdisc_qstats_cpu_backlog_dec(qdisc, skb); - qdisc_bstats_cpu_update(qdisc, skb); - qdisc_qstats_atomic_qlen_dec(qdisc); + qdisc_update_stats_at_dequeue(qdisc, skb); } else { qdisc->empty = true; } @@ -716,6 +694,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc) struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i); q->backlog = 0; + q->qlen = 0; } } diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index c7041999eb5d..df848a36b222 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -13,6 +13,7 @@ #include <linux/list.h> #include <linux/errno.h> #include <linux/skbuff.h> +#include <linux/math64.h> #include <linux/module.h> #include <linux/spinlock.h> #include <net/netlink.h> @@ -20,6 +21,9 @@ #include <net/pkt_cls.h> #include <net/sch_generic.h> +static LIST_HEAD(taprio_list); +static DEFINE_SPINLOCK(taprio_list_lock); + #define TAPRIO_ALL_GATES_OPEN -1 struct sched_entry { @@ -42,9 +46,9 @@ struct taprio_sched { struct Qdisc *root; s64 base_time; int clockid; - int picos_per_byte; /* Using picoseconds because for 10Gbps+ - * speeds it's sub-nanoseconds per byte - */ + atomic64_t picos_per_byte; /* Using 
picoseconds because for 10Gbps+ + * speeds it's sub-nanoseconds per byte + */ size_t num_entries; /* Protects the update side of the RCU protected current_entry */ @@ -53,6 +57,7 @@ struct taprio_sched { struct list_head entries; ktime_t (*get_time)(void); struct hrtimer advance_timer; + struct list_head taprio_list; }; static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -85,7 +90,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) rcu_read_lock(); entry = rcu_dereference(q->current_entry); - gate_mask = entry ? entry->gate_mask : -1; + gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; rcu_read_unlock(); if (!gate_mask) @@ -107,7 +112,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) tc = netdev_get_prio_tc_map(dev, prio); if (!(gate_mask & BIT(tc))) - return NULL; + continue; return skb; } @@ -117,7 +122,14 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) static inline int length_to_duration(struct taprio_sched *q, int len) { - return (len * q->picos_per_byte) / 1000; + return div_u64(len * atomic64_read(&q->picos_per_byte), 1000); +} + +static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) +{ + atomic_set(&entry->budget, + div64_u64((u64)entry->interval * 1000, + atomic64_read(&q->picos_per_byte))); } static struct sk_buff *taprio_dequeue(struct Qdisc *sch) @@ -129,6 +141,11 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) u32 gate_mask; int i; + if (atomic64_read(&q->picos_per_byte) == -1) { + WARN_ONCE(1, "taprio: dequeue() called with unknown picos per byte."); + return NULL; + } + rcu_read_lock(); entry = rcu_dereference(q->current_entry); /* if there's no entry, it means that the schedule didn't @@ -171,12 +188,12 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && ktime_after(guard, entry->close_time)) - return NULL; + continue; /* ... and no budget. 
*/ if (gate_mask != TAPRIO_ALL_GATES_OPEN && atomic_sub_return(len, &entry->budget) < 0) - return NULL; + continue; skb = child->ops->dequeue(child); if (unlikely(!skb)) @@ -192,14 +209,6 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) return NULL; } -static bool should_restart_cycle(const struct taprio_sched *q, - const struct sched_entry *entry) -{ - WARN_ON(!entry); - - return list_is_last(&entry->list, &q->entries); -} - static enum hrtimer_restart advance_sched(struct hrtimer *timer) { struct taprio_sched *q = container_of(timer, struct taprio_sched, @@ -223,7 +232,7 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer) goto first_run; } - if (should_restart_cycle(q, entry)) + if (list_is_last(&entry->list, &q->entries)) next = list_first_entry(&q->entries, struct sched_entry, list); else @@ -232,8 +241,7 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer) close_time = ktime_add_ns(entry->close_time, next->interval); next->close_time = close_time; - atomic_set(&next->budget, - (next->interval * 1000) / q->picos_per_byte); + taprio_set_budget(q, next); first_run: rcu_assign_pointer(q->current_entry, next); @@ -523,7 +531,7 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, return 0; } -static ktime_t taprio_get_start_time(struct Qdisc *sch) +static int taprio_get_start_time(struct Qdisc *sch, ktime_t *start) { struct taprio_sched *q = qdisc_priv(sch); struct sched_entry *entry; @@ -531,27 +539,33 @@ static ktime_t taprio_get_start_time(struct Qdisc *sch) s64 n; base = ns_to_ktime(q->base_time); - cycle = 0; + now = q->get_time(); + + if (ktime_after(base, now)) { + *start = base; + return 0; + } /* Calculate the cycle_time, by summing all the intervals. 
*/ + cycle = 0; list_for_each_entry(entry, &q->entries, list) cycle = ktime_add_ns(cycle, entry->interval); - if (!cycle) - return base; - - now = q->get_time(); - - if (ktime_after(base, now)) - return base; + /* The qdisc is expected to have at least one sched_entry. Moreover, + * any entry must have 'interval' > 0. Thus if the cycle time is zero, + * something went really wrong. In that case, we should warn about this + * inconsistent state and return error. + */ + if (WARN_ON(!cycle)) + return -EFAULT; /* Schedule the start time for the beginning of the next * cycle. */ n = div64_s64(ktime_sub_ns(now, base), cycle); - - return ktime_add_ns(base, (n + 1) * cycle); + *start = ktime_add_ns(base, (n + 1) * cycle); + return 0; } static void taprio_start_sched(struct Qdisc *sch, ktime_t start) @@ -566,8 +580,7 @@ static void taprio_start_sched(struct Qdisc *sch, ktime_t start) list); first->close_time = ktime_add_ns(start, first->interval); - atomic_set(&first->budget, - (first->interval * 1000) / q->picos_per_byte); + taprio_set_budget(q, first); rcu_assign_pointer(q->current_entry, NULL); spin_unlock_irqrestore(&q->current_entry_lock, flags); @@ -575,6 +588,52 @@ static void taprio_start_sched(struct Qdisc *sch, ktime_t start) hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); } +static void taprio_set_picos_per_byte(struct net_device *dev, + struct taprio_sched *q) +{ + struct ethtool_link_ksettings ecmd; + int picos_per_byte = -1; + + if (!__ethtool_get_link_ksettings(dev, &ecmd) && + ecmd.base.speed != SPEED_UNKNOWN) + picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8, + ecmd.base.speed * 1000 * 1000); + + atomic64_set(&q->picos_per_byte, picos_per_byte); + netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", + dev->name, (long long)atomic64_read(&q->picos_per_byte), + ecmd.base.speed); +} + +static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct net_device *dev = 
netdev_notifier_info_to_dev(ptr); + struct net_device *qdev; + struct taprio_sched *q; + bool found = false; + + ASSERT_RTNL(); + + if (event != NETDEV_UP && event != NETDEV_CHANGE) + return NOTIFY_DONE; + + spin_lock(&taprio_list_lock); + list_for_each_entry(q, &taprio_list, taprio_list) { + qdev = qdisc_dev(q->root); + if (qdev == dev) { + found = true; + break; + } + } + spin_unlock(&taprio_list_lock); + + if (found) + taprio_set_picos_per_byte(dev, q); + + return NOTIFY_DONE; +} + static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -582,9 +641,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; - struct ethtool_link_ksettings ecmd; int i, err, size; - s64 link_speed; ktime_t start; err = nla_parse_nested(tb, TCA_TAPRIO_ATTR_MAX, opt, @@ -592,7 +649,6 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; - err = -EINVAL; if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); @@ -657,17 +713,13 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, mqprio->prio_tc_map[i]); } - if (!__ethtool_get_link_ksettings(dev, &ecmd)) - link_speed = ecmd.base.speed; - else - link_speed = SPEED_1000; - - q->picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8, - link_speed * 1000 * 1000); + taprio_set_picos_per_byte(dev, q); - start = taprio_get_start_time(sch); - if (!start) - return 0; + err = taprio_get_start_time(sch, &start); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Internal error: failed get start time"); + return err; + } taprio_start_sched(sch, start); @@ -681,6 +733,10 @@ static void taprio_destroy(struct Qdisc *sch) struct sched_entry *entry, *n; unsigned int i; + spin_lock(&taprio_list_lock); + list_del(&q->taprio_list); + spin_unlock(&taprio_list_lock); + hrtimer_cancel(&q->advance_timer); if (q->qdiscs) 
{ @@ -735,6 +791,10 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, if (!opt) return -EINVAL; + spin_lock(&taprio_list_lock); + list_add(&q->taprio_list, &taprio_list); + spin_unlock(&taprio_list_lock); + return taprio_change(sch, opt, extack); } @@ -947,14 +1007,24 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .owner = THIS_MODULE, }; +static struct notifier_block taprio_device_notifier = { + .notifier_call = taprio_dev_notifier, +}; + static int __init taprio_module_init(void) { + int err = register_netdevice_notifier(&taprio_device_notifier); + + if (err) + return err; + return register_qdisc(&taprio_qdisc_ops); } static void __exit taprio_module_exit(void) { unregister_qdisc(&taprio_qdisc_ops); + unregister_netdevice_notifier(&taprio_device_notifier); } module_init(taprio_module_init); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 6200cd2b4b99..188c47eb206e 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -1030,6 +1030,7 @@ static const struct proto_ops inet6_seqpacket_ops = { .getname = sctp_getname, .poll = sctp_poll, .ioctl = inet6_ioctl, + .gettstamp = sock_gettstamp, .listen = sctp_inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 951afdeea5e9..f0631bf486b6 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1026,6 +1026,7 @@ static const struct proto_ops inet_seqpacket_ops = { .getname = inet_getname, /* Semantics are different. */ .poll = sctp_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, .listen = sctp_inet_listen, .shutdown = inet_shutdown, /* Looks harmless. 
*/ .setsockopt = sock_common_setsockopt, /* IP_SOL IP_OPTION is a problem */ diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index c9ae3404b1bb..7dfc34b28f4f 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6412,13 +6412,15 @@ static int sctp_eat_data(const struct sctp_association *asoc, * in sctp_ulpevent_make_rcvmsg will drop the frame if we grow our * memory usage too much */ - if (*sk->sk_prot_creator->memory_pressure) { + if (sk_under_memory_pressure(sk)) { if (sctp_tsnmap_has_gap(map) && (sctp_tsnmap_get_ctsn(map) + 1) == tsn) { pr_debug("%s: under pressure, reneging for tsn:%u\n", __func__, tsn); deliver = SCTP_CMD_RENEGE; - } + } else { + sk_mem_reclaim(sk); + } } /* diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9874e60c9b0d..e4e892cc5644 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1913,7 +1913,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, if (sctp_wspace(asoc) < (int)msg_len) sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); - if (sctp_wspace(asoc) <= 0) { + if (sk_under_memory_pressure(sk)) + sk_mem_reclaim(sk); + + if (sctp_wspace(asoc) <= 0 || !sk_wmem_schedule(sk, msg_len)) { timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); if (err) @@ -4847,7 +4850,8 @@ static int sctp_connect(struct sock *sk, struct sockaddr *addr, } /* Validate addr_len before calling common connect/connectx routine. */ - af = sctp_get_af_specific(addr->sa_family); + af = addr_len < offsetofend(struct sockaddr, sa_family) ? 
NULL : + sctp_get_af_specific(addr->sa_family); if (!af || addr_len < af->sockaddr_len) { err = -EINVAL; } else { @@ -8930,7 +8934,10 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, goto do_error; if (signal_pending(current)) goto do_interrupted; - if ((int)msg_len <= sctp_wspace(asoc)) + if (sk_under_memory_pressure(sk)) + sk_mem_reclaim(sk); + if ((int)msg_len <= sctp_wspace(asoc) && + sk_wmem_schedule(sk, msg_len)) break; /* Let another process have a go. Since we are going diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 102c6fefe38c..25e0b7e5189c 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -484,14 +484,15 @@ static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq, } static int sctp_enqueue_event(struct sctp_ulpq *ulpq, - struct sctp_ulpevent *event) + struct sk_buff_head *skb_list) { - struct sk_buff *skb = sctp_event2skb(event); struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); - struct sk_buff_head *skb_list; + struct sctp_ulpevent *event; + struct sk_buff *skb; - skb_list = (struct sk_buff_head *)skb->prev; + skb = __skb_peek(skb_list); + event = sctp_skb2event(skb); if (sk->sk_shutdown & RCV_SHUTDOWN && (sk->sk_shutdown & SEND_SHUTDOWN || @@ -858,19 +859,24 @@ static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq, if (!(event->msg_flags & SCTP_DATA_UNORDERED)) { event = sctp_intl_reasm(ulpq, event); - if (event && event->msg_flags & MSG_EOR) { + if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); - event = sctp_intl_order(ulpq, event); + if (event->msg_flags & MSG_EOR) + event = sctp_intl_order(ulpq, event); } } else { event = sctp_intl_reasm_uo(ulpq, event); + if (event) { + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + } } if (event) { event_eor = (event->msg_flags & MSG_EOR) ? 
1 : 0; - sctp_enqueue_event(ulpq, event); + sctp_enqueue_event(ulpq, &temp); } return event_eor; @@ -944,20 +950,27 @@ out: static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *event; + struct sk_buff_head temp; if (!skb_queue_empty(&ulpq->reasm)) { do { event = sctp_intl_retrieve_first(ulpq); - if (event) - sctp_enqueue_event(ulpq, event); + if (event) { + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + sctp_enqueue_event(ulpq, &temp); + } } while (event); } if (!skb_queue_empty(&ulpq->reasm_uo)) { do { event = sctp_intl_retrieve_first_uo(ulpq); - if (event) - sctp_enqueue_event(ulpq, event); + if (event) { + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + sctp_enqueue_event(ulpq, &temp); + } } while (event); } } @@ -1059,7 +1072,7 @@ static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) if (event) { sctp_intl_retrieve_ordered(ulpq, event); - sctp_enqueue_event(ulpq, event); + sctp_enqueue_event(ulpq, &temp); } } @@ -1298,6 +1311,15 @@ static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) ntohl(skip->mid), skip->flags); } +static int do_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) +{ + struct sk_buff_head temp; + + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + return sctp_ulpq_tail_event(ulpq, &temp); +} + static struct sctp_stream_interleave sctp_stream_interleave_0 = { .data_chunk_len = sizeof(struct sctp_data_chunk), .ftsn_chunk_len = sizeof(struct sctp_fwdtsn_chunk), @@ -1306,7 +1328,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = { .assign_number = sctp_chunk_assign_ssn, .validate_data = sctp_validate_data, .ulpevent_data = sctp_ulpq_tail_data, - .enqueue_event = sctp_ulpq_tail_event, + .enqueue_event = do_ulpq_tail_event, .renege_events = sctp_ulpq_renege, .start_pd = sctp_ulpq_partial_delivery, .abort_pd = sctp_ulpq_abort_pd, @@ -1317,6 
+1339,16 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = { .handle_ftsn = sctp_handle_fwdtsn, }; +static int do_sctp_enqueue_event(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff_head temp; + + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + return sctp_enqueue_event(ulpq, &temp); +} + static struct sctp_stream_interleave sctp_stream_interleave_1 = { .data_chunk_len = sizeof(struct sctp_idata_chunk), .ftsn_chunk_len = sizeof(struct sctp_ifwdtsn_chunk), @@ -1325,7 +1357,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = { .assign_number = sctp_chunk_assign_mid, .validate_data = sctp_validate_idata, .ulpevent_data = sctp_ulpevent_idata, - .enqueue_event = sctp_enqueue_event, + .enqueue_event = do_sctp_enqueue_event, .renege_events = sctp_renege_events, .start_pd = sctp_intl_start_pd, .abort_pd = sctp_intl_abort_pd, diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 8cb7d9858270..c2a7478587ab 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -634,8 +634,9 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, gfp_t gfp) { struct sctp_ulpevent *event = NULL; - struct sk_buff *skb; - size_t padding, len; + struct sk_buff *skb = chunk->skb; + struct sock *sk = asoc->base.sk; + size_t padding, datalen; int rx_count; /* @@ -646,15 +647,12 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, if (asoc->ep->rcvbuf_policy) rx_count = atomic_read(&asoc->rmem_alloc); else - rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); + rx_count = atomic_read(&sk->sk_rmem_alloc); - if (rx_count >= asoc->base.sk->sk_rcvbuf) { + datalen = ntohs(chunk->chunk_hdr->length); - if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) || - (!sk_rmem_schedule(asoc->base.sk, chunk->skb, - chunk->skb->truesize))) - goto fail; - } + if (rx_count >= sk->sk_rcvbuf || !sk_rmem_schedule(sk, skb, datalen)) + goto fail; /* Clone the 
original skb, sharing the data. */ skb = skb_clone(chunk->skb, gfp); @@ -681,8 +679,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, * The sender should never pad with more than 3 bytes. The receiver * MUST ignore the padding bytes. */ - len = ntohs(chunk->chunk_hdr->length); - padding = SCTP_PAD4(len) - len; + padding = SCTP_PAD4(datalen) - datalen; /* Fixup cloned skb with just this chunks data. */ skb_trim(skb, chunk->chunk_end - padding - skb->data); diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 5dde92101743..a212fe079c07 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -116,12 +116,13 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, event = sctp_ulpq_reasm(ulpq, event); /* Do ordering if needed. */ - if ((event) && (event->msg_flags & MSG_EOR)) { + if (event) { /* Create a temporary list to collect chunks on. */ skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); - event = sctp_ulpq_order(ulpq, event); + if (event->msg_flags & MSG_EOR) + event = sctp_ulpq_order(ulpq, event); } /* Send event to the ULP. 'event' is the sctp_ulpevent for @@ -129,7 +130,7 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, */ if (event) { event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0; - sctp_ulpq_tail_event(ulpq, event); + sctp_ulpq_tail_event(ulpq, &temp); } return event_eor; @@ -193,18 +194,17 @@ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq) return sctp_clear_pd(ulpq->asoc->base.sk, ulpq->asoc); } -/* If the SKB of 'event' is on a list, it is the first such member - * of that list. 
- */ -int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) +int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); - struct sk_buff_head *queue, *skb_list; - struct sk_buff *skb = sctp_event2skb(event); + struct sctp_ulpevent *event; + struct sk_buff_head *queue; + struct sk_buff *skb; int clear_pd = 0; - skb_list = (struct sk_buff_head *) skb->prev; + skb = __skb_peek(skb_list); + event = sctp_skb2event(skb); /* If the socket is just going to throw this away, do not * even try to deliver it. @@ -257,13 +257,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) } } - /* If we are harvesting multiple skbs they will be - * collected on a list. - */ - if (skb_list) - skb_queue_splice_tail_init(skb_list, queue); - else - __skb_queue_tail(queue, skb); + skb_queue_splice_tail_init(skb_list, queue); /* Did we just complete partial delivery and need to get * rolling again? Move pending data to the receive @@ -738,25 +732,25 @@ void sctp_ulpq_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 fwd_tsn) static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq) { struct sctp_ulpevent *event = NULL; - struct sk_buff_head temp; if (skb_queue_empty(&ulpq->reasm)) return; while ((event = sctp_ulpq_retrieve_reassembled(ulpq)) != NULL) { - /* Do ordering if needed. */ - if ((event) && (event->msg_flags & MSG_EOR)) { - skb_queue_head_init(&temp); - __skb_queue_tail(&temp, sctp_event2skb(event)); + struct sk_buff_head temp; + + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + /* Do ordering if needed. */ + if (event->msg_flags & MSG_EOR) event = sctp_ulpq_order(ulpq, event); - } /* Send event to the ULP. 'event' is the * sctp_ulpevent for very first SKB on the temp' list. 
*/ if (event) - sctp_ulpq_tail_event(ulpq, event); + sctp_ulpq_tail_event(ulpq, &temp); } } @@ -956,7 +950,7 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) if (event) { /* see if we have more ordered that we can deliver */ sctp_ulpq_retrieve_ordered(ulpq, event); - sctp_ulpq_tail_event(ulpq, event); + sctp_ulpq_tail_event(ulpq, &temp); } } @@ -1082,7 +1076,11 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, event = sctp_ulpq_retrieve_first(ulpq); /* Send event to the ULP. */ if (event) { - sctp_ulpq_tail_event(ulpq, event); + struct sk_buff_head temp; + + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + sctp_ulpq_tail_event(ulpq, &temp); sctp_ulpq_set_pd(ulpq); return; } @@ -1106,7 +1104,8 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, freed += sctp_ulpq_renege_frags(ulpq, needed - freed); } /* If able to free enough room, accept this chunk. */ - if (freed >= needed) { + if (sk_rmem_schedule(asoc->base.sk, chunk->skb, needed) && + freed >= needed) { int retval = sctp_ulpq_tail_data(ulpq, chunk, gfp); /* * Enter partial delivery if chunk has not been diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 77ef53596d18..086d9913975d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -134,11 +134,9 @@ static int smc_release(struct socket *sock) smc = smc_sk(sk); /* cleanup for a dangling non-blocking connect */ - if (smc->connect_info && sk->sk_state == SMC_INIT) + if (smc->connect_nonblock && sk->sk_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); flush_work(&smc->connect_work); - kfree(smc->connect_info); - smc->connect_info = NULL; if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires @@ -167,10 +165,9 @@ static int smc_release(struct socket *sock) if (sk->sk_state == SMC_CLOSED) { if (smc->clcsock) { - mutex_lock(&smc->clcsock_release_lock); - sock_release(smc->clcsock); - smc->clcsock = NULL; - 
mutex_unlock(&smc->clcsock_release_lock); + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); } if (!smc->use_fallback) smc_conn_free(&smc->conn); @@ -446,12 +443,22 @@ static void smc_link_save_peer_info(struct smc_link *link, link->peer_mtu = clc->qp_mtu; } +static void smc_switch_to_fallback(struct smc_sock *smc) +{ + smc->use_fallback = true; + if (smc->sk.sk_socket && smc->sk.sk_socket->file) { + smc->clcsock->file = smc->sk.sk_socket->file; + smc->clcsock->file->private_data = smc->clcsock; + } +} + /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { - smc->use_fallback = true; + smc_switch_to_fallback(smc); smc->fallback_rsn = reason_code; smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; @@ -491,46 +498,41 @@ static int smc_connect_abort(struct smc_sock *smc, int reason_code, mutex_unlock(&smc_client_lgr_pending); smc_conn_free(&smc->conn); + smc->connect_nonblock = 0; return reason_code; } /* check if there is a rdma device available for this connection. */ /* called for connect and listen */ -static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, - u8 *ibport, unsigned short vlan_id, u8 gid[]) +static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini) { - int reason_code = 0; - /* PNET table look up: search active ib_device and port * within same PNETID that also contains the ethernet device * used for the internal TCP socket */ - smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id, - gid); - if (!(*ibdev)) - reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - - return reason_code; + smc_pnet_find_roce_resource(smc->clcsock->sk, ini); + if (!ini->ib_dev) + return SMC_CLC_DECL_NOSMCRDEV; + return 0; } /* check if there is an ISM device available for this connection. 
*/ /* called for connect and listen */ -static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev) +static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini) { /* Find ISM device with same PNETID as connecting interface */ - smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev); - if (!(*ismdev)) - return SMC_CLC_DECL_CNFERR; /* configuration error */ + smc_pnet_find_ism_resource(smc->clcsock->sk, ini); + if (!ini->ism_dev) + return SMC_CLC_DECL_NOSMCDDEV; return 0; } /* Check for VLAN ID and register it on ISM device just for CLC handshake */ static int smc_connect_ism_vlan_setup(struct smc_sock *smc, - struct smcd_dev *ismdev, - unsigned short vlan_id) + struct smc_init_info *ini) { - if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id)) - return SMC_CLC_DECL_CNFERR; + if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) + return SMC_CLC_DECL_ISMVLANERR; return 0; } @@ -538,12 +540,11 @@ static int smc_connect_ism_vlan_setup(struct smc_sock *smc, * used, the VLAN ID will be registered again during the connection setup. 
*/ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd, - struct smcd_dev *ismdev, - unsigned short vlan_id) + struct smc_init_info *ini) { if (!is_smcd) return 0; - if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id)) + if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id)) return SMC_CLC_DECL_CNFERR; return 0; } @@ -551,13 +552,12 @@ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd, /* CLC handshake during connect */ static int smc_connect_clc(struct smc_sock *smc, int smc_type, struct smc_clc_msg_accept_confirm *aclc, - struct smc_ib_device *ibdev, u8 ibport, - u8 gid[], struct smcd_dev *ismdev) + struct smc_init_info *ini) { int rc = 0; /* do inband token exchange */ - rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev); + rc = smc_clc_send_proposal(smc, smc_type, ini); if (rc) return rc; /* receive SMC Accept CLC message */ @@ -568,23 +568,19 @@ static int smc_connect_clc(struct smc_sock *smc, int smc_type, /* setup for RDMA connection of client */ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, - struct smc_ib_device *ibdev, u8 ibport) + struct smc_init_info *ini) { - int local_contact = SMC_FIRST_CONTACT; struct smc_link *link; int reason_code = 0; + ini->is_smcd = false; + ini->ib_lcl = &aclc->lcl; + ini->ib_clcqpn = ntoh24(aclc->qpn); + ini->srv_first_contact = aclc->hdr.flag; + mutex_lock(&smc_client_lgr_pending); - local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev, - ibport, ntoh24(aclc->qpn), &aclc->lcl, - NULL, 0); - if (local_contact < 0) { - if (local_contact == -ENOMEM) - reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - else if (local_contact == -ENOLINK) - reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. 
error */ - else - reason_code = SMC_CLC_DECL_INTERR; /* other error */ + reason_code = smc_conn_create(smc, ini); + if (reason_code) { mutex_unlock(&smc_client_lgr_pending); return reason_code; } @@ -594,45 +590,48 @@ static int smc_connect_rdma(struct smc_sock *smc, /* create send buffer and rmb */ if (smc_buf_create(smc, false)) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, + ini->cln_first_contact); - if (local_contact == SMC_FIRST_CONTACT) + if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, aclc); if (smc_rmb_rtoken_handling(&smc->conn, aclc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, - local_contact); + ini->cln_first_contact); smc_close_init(smc); smc_rx_init(smc); - if (local_contact == SMC_FIRST_CONTACT) { + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { if (smc_ib_ready_link(link)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, - local_contact); + ini->cln_first_contact); } else { if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, - local_contact); + ini->cln_first_contact); } smc_rmb_sync_sg_for_device(&smc->conn); reason_code = smc_clc_send_confirm(smc); if (reason_code) - return smc_connect_abort(smc, reason_code, local_contact); + return smc_connect_abort(smc, reason_code, + ini->cln_first_contact); smc_tx_init(smc); - if (local_contact == SMC_FIRST_CONTACT) { + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ reason_code = smc_clnt_conf_first_link(smc); if (reason_code) return smc_connect_abort(smc, reason_code, - local_contact); + ini->cln_first_contact); } mutex_unlock(&smc_client_lgr_pending); smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; @@ -642,23 +641,26 @@ static int smc_connect_rdma(struct smc_sock *smc, /* setup for ISM connection of client */ 
static int smc_connect_ism(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, - struct smcd_dev *ismdev) + struct smc_init_info *ini) { - int local_contact = SMC_FIRST_CONTACT; int rc = 0; + ini->is_smcd = true; + ini->ism_gid = aclc->gid; + ini->srv_first_contact = aclc->hdr.flag; + /* there is only one lgr role for SMC-D; use server lock */ mutex_lock(&smc_server_lgr_pending); - local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, 0, - NULL, ismdev, aclc->gid); - if (local_contact < 0) { + rc = smc_conn_create(smc, ini); + if (rc) { mutex_unlock(&smc_server_lgr_pending); - return SMC_CLC_DECL_MEM; + return rc; } /* Create send and receive buffers */ if (smc_buf_create(smc, true)) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, + ini->cln_first_contact); smc_conn_save_peer_info(smc, aclc); smc_close_init(smc); @@ -667,10 +669,11 @@ static int smc_connect_ism(struct smc_sock *smc, rc = smc_clc_send_confirm(smc); if (rc) - return smc_connect_abort(smc, rc, local_contact); + return smc_connect_abort(smc, rc, ini->cln_first_contact); mutex_unlock(&smc_server_lgr_pending); smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; @@ -682,13 +685,9 @@ static int __smc_connect(struct smc_sock *smc) { bool ism_supported = false, rdma_supported = false; struct smc_clc_msg_accept_confirm aclc; - struct smc_ib_device *ibdev; - struct smcd_dev *ismdev; - u8 gid[SMC_GID_SIZE]; - unsigned short vlan; + struct smc_init_info ini = {0}; int smc_type; int rc = 0; - u8 ibport; sock_hold(&smc->sk); /* sock put in passive closing */ @@ -703,20 +702,21 @@ static int __smc_connect(struct smc_sock *smc) if (using_ipsec(smc)) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC); - /* check for VLAN ID */ - if (smc_vlan_by_tcpsk(smc->clcsock, &vlan)) - return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); + 
/* get vlan id from IP device */ + if (smc_vlan_by_tcpsk(smc->clcsock, &ini)) + return smc_connect_decline_fallback(smc, + SMC_CLC_DECL_GETVLANERR); /* check if there is an ism device available */ - if (!smc_check_ism(smc, &ismdev) && - !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) { + if (!smc_find_ism_device(smc, &ini) && + !smc_connect_ism_vlan_setup(smc, &ini)) { /* ISM is supported for this connection */ ism_supported = true; smc_type = SMC_TYPE_D; } /* check if there is a rdma device available */ - if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) { + if (!smc_find_rdma_device(smc, &ini)) { /* RDMA is supported for this connection */ rdma_supported = true; if (ism_supported) @@ -730,25 +730,25 @@ static int __smc_connect(struct smc_sock *smc) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV); /* perform CLC handshake */ - rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev); + rc = smc_connect_clc(smc, smc_type, &aclc, &ini); if (rc) { - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); + smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); return smc_connect_decline_fallback(smc, rc); } /* depending on previous steps, connect using rdma or ism */ if (rdma_supported && aclc.hdr.path == SMC_TYPE_R) - rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); + rc = smc_connect_rdma(smc, &aclc, &ini); else if (ism_supported && aclc.hdr.path == SMC_TYPE_D) - rc = smc_connect_ism(smc, &aclc, ismdev); + rc = smc_connect_ism(smc, &aclc, &ini); else rc = SMC_CLC_DECL_MODEUNSUPP; if (rc) { - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); + smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); return smc_connect_decline_fallback(smc, rc); } - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); + smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); return 0; } @@ -756,17 +756,30 @@ static void smc_connect_work(struct work_struct *work) { struct smc_sock *smc = container_of(work, struct 
smc_sock, connect_work); - int rc; + long timeo = smc->sk.sk_sndtimeo; + int rc = 0; - lock_sock(&smc->sk); - rc = kernel_connect(smc->clcsock, &smc->connect_info->addr, - smc->connect_info->alen, smc->connect_info->flags); + if (!timeo) + timeo = MAX_SCHEDULE_TIMEOUT; + lock_sock(smc->clcsock->sk); if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; - goto out; - } - if (rc < 0) { - smc->sk.sk_err = -rc; + } else if ((1 << smc->clcsock->sk->sk_state) & + (TCPF_SYN_SENT | TCP_SYN_RECV)) { + rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo); + if ((rc == -EPIPE) && + ((1 << smc->clcsock->sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))) + rc = 0; + } + release_sock(smc->clcsock->sk); + lock_sock(&smc->sk); + if (rc != 0 || smc->sk.sk_err) { + smc->sk.sk_state = SMC_CLOSED; + if (rc == -EPIPE || rc == -EAGAIN) + smc->sk.sk_err = EPIPE; + else if (signal_pending(current)) + smc->sk.sk_err = -sock_intr_errno(timeo); goto out; } @@ -775,12 +788,14 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_err = -rc; out: - if (smc->sk.sk_err) - smc->sk.sk_state_change(&smc->sk); - else - smc->sk.sk_write_space(&smc->sk); - kfree(smc->connect_info); - smc->connect_info = NULL; + if (!sock_flag(&smc->sk, SOCK_DEAD)) { + if (smc->sk.sk_err) { + smc->sk.sk_state_change(&smc->sk); + } else { /* allow polling before and after fallback decision */ + smc->clcsock->sk->sk_write_space(smc->clcsock->sk); + smc->sk.sk_write_space(&smc->sk); + } + } release_sock(&smc->sk); } @@ -813,26 +828,18 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, smc_copy_sock_settings_to_clc(smc); tcp_sk(smc->clcsock->sk)->syn_smc = 1; + if (smc->connect_nonblock) { + rc = -EALREADY; + goto out; + } + rc = kernel_connect(smc->clcsock, addr, alen, flags); + if (rc && rc != -EINPROGRESS) + goto out; if (flags & O_NONBLOCK) { - if (smc->connect_info) { - rc = -EALREADY; - goto out; - } - smc->connect_info = kzalloc(alen + 2 * sizeof(int), 
GFP_KERNEL); - if (!smc->connect_info) { - rc = -ENOMEM; - goto out; - } - smc->connect_info->alen = alen; - smc->connect_info->flags = flags ^ O_NONBLOCK; - memcpy(&smc->connect_info->addr, addr, alen); - schedule_work(&smc->connect_work); + if (schedule_work(&smc->connect_work)) + smc->connect_nonblock = 1; rc = -EINPROGRESS; } else { - rc = kernel_connect(smc->clcsock, addr, alen, flags); - if (rc) - goto out; - rc = __smc_connect(smc); if (rc < 0) goto out; @@ -872,11 +879,11 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) if (rc < 0) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) { + new_sk->sk_prot->unhash(new_sk); if (new_clcsock) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); - new_sk->sk_prot->unhash(new_sk); sock_put(new_sk); /* final */ *new_smc = NULL; goto out; @@ -927,16 +934,21 @@ struct sock *smc_accept_dequeue(struct sock *parent, smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { + new_sk->sk_prot->unhash(new_sk); if (isk->clcsock) { sock_release(isk->clcsock); isk->clcsock = NULL; } - new_sk->sk_prot->unhash(new_sk); sock_put(new_sk); /* final */ continue; } - if (new_sock) + if (new_sock) { sock_graft(new_sk, new_sock); + if (isk->use_fallback) { + smc_sk(new_sk)->clcsock->file = new_sock->file; + isk->clcsock->file->private_data = isk->clcsock; + } + } return new_sk; } return NULL; @@ -956,6 +968,7 @@ void smc_close_non_accepted(struct sock *sk) sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } + sk->sk_prot->unhash(sk); if (smc->clcsock) { struct socket *tcp; @@ -971,7 +984,6 @@ void smc_close_non_accepted(struct sock *sk) smc_conn_free(&smc->conn); } release_sock(sk); - sk->sk_prot->unhash(sk); sock_put(sk); /* final sock_put */ } @@ -1037,13 +1049,13 @@ static void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; - lock_sock_nested(&lsmc->sk, 
SINGLE_DEPTH_NESTING); if (lsmc->sk.sk_state == SMC_LISTEN) { + lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); + release_sock(&lsmc->sk); } else { /* no longer listening */ smc_close_non_accepted(newsmcsk); } - release_sock(&lsmc->sk); /* Wake up accept */ lsmc->sk.sk_data_ready(&lsmc->sk); @@ -1087,7 +1099,7 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, return; } smc_conn_free(&new_smc->conn); - new_smc->use_fallback = true; + smc_switch_to_fallback(new_smc); new_smc->fallback_rsn = reason_code; if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code) < 0) { @@ -1099,7 +1111,7 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, } /* listen worker: check prefixes */ -static int smc_listen_rdma_check(struct smc_sock *new_smc, +static int smc_listen_prfx_check(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc) { struct smc_clc_msg_proposal_prefix *pclc_prfx; @@ -1107,25 +1119,21 @@ static int smc_listen_rdma_check(struct smc_sock *new_smc, pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (smc_clc_prfx_match(newclcsock, pclc_prfx)) - return SMC_CLC_DECL_CNFERR; + return SMC_CLC_DECL_DIFFPREFIX; return 0; } /* listen worker: initialize connection and buffers */ static int smc_listen_rdma_init(struct smc_sock *new_smc, - struct smc_clc_msg_proposal *pclc, - struct smc_ib_device *ibdev, u8 ibport, - int *local_contact) + struct smc_init_info *ini) { + int rc; + /* allocate connection / link group */ - *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, 0, - &pclc->lcl, NULL, 0); - if (*local_contact < 0) { - if (*local_contact == -ENOMEM) - return SMC_CLC_DECL_MEM;/* insufficient memory*/ - return SMC_CLC_DECL_INTERR; /* other error */ - } + rc = smc_conn_create(new_smc, ini); + if (rc) + return rc; /* create send buffer and rmb */ if (smc_buf_create(new_smc, false)) @@ -1137,33 +1145,30 @@ static 
int smc_listen_rdma_init(struct smc_sock *new_smc, /* listen worker: initialize connection and buffers for SMC-D */ static int smc_listen_ism_init(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, - struct smcd_dev *ismdev, - int *local_contact) + struct smc_init_info *ini) { struct smc_clc_msg_smcd *pclc_smcd; + int rc; pclc_smcd = smc_get_clc_msg_smcd(pclc); - *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, 0, NULL, - ismdev, pclc_smcd->gid); - if (*local_contact < 0) { - if (*local_contact == -ENOMEM) - return SMC_CLC_DECL_MEM;/* insufficient memory*/ - return SMC_CLC_DECL_INTERR; /* other error */ - } + ini->ism_gid = pclc_smcd->gid; + rc = smc_conn_create(new_smc, ini); + if (rc) + return rc; /* Check if peer can be reached via ISM device */ if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid, new_smc->conn.lgr->vlan_id, new_smc->conn.lgr->smcd)) { - if (*local_contact == SMC_FIRST_CONTACT) + if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_lgr_forget(new_smc->conn.lgr); smc_conn_free(&new_smc->conn); - return SMC_CLC_DECL_CNFERR; + return SMC_CLC_DECL_SMCDNOTALK; } /* Create send and receive buffers */ if (smc_buf_create(new_smc, true)) { - if (*local_contact == SMC_FIRST_CONTACT) + if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_lgr_forget(new_smc->conn.lgr); smc_conn_free(&new_smc->conn); return SMC_CLC_DECL_MEM; @@ -1227,15 +1232,13 @@ static void smc_listen_work(struct work_struct *work) struct socket *newclcsock = new_smc->clcsock; struct smc_clc_msg_accept_confirm cclc; struct smc_clc_msg_proposal *pclc; - struct smc_ib_device *ibdev; + struct smc_init_info ini = {0}; bool ism_supported = false; - struct smcd_dev *ismdev; u8 buf[SMC_CLC_MAX_LEN]; - int local_contact = 0; - unsigned short vlan; - int reason_code = 0; int rc = 0; - u8 ibport; + + if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) + return smc_listen_out_err(new_smc); if (new_smc->use_fallback) { smc_listen_out_connected(new_smc); @@ -1244,7 +1247,7 @@ static 
void smc_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { - new_smc->use_fallback = true; + smc_switch_to_fallback(new_smc); new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; smc_listen_out_connected(new_smc); return; @@ -1254,17 +1257,26 @@ static void smc_listen_work(struct work_struct *work) * wait for and receive SMC Proposal CLC message */ pclc = (struct smc_clc_msg_proposal *)&buf; - reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, - SMC_CLC_PROPOSAL, CLC_WAIT_TIME); - if (reason_code) { - smc_listen_decline(new_smc, reason_code, 0); - return; - } + rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, + SMC_CLC_PROPOSAL, CLC_WAIT_TIME); + if (rc) + goto out_decl; /* IPSec connections opt out of SMC-R optimizations */ if (using_ipsec(new_smc)) { - smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0); - return; + rc = SMC_CLC_DECL_IPSEC; + goto out_decl; + } + + /* check for matching IP prefix and subnet length */ + rc = smc_listen_prfx_check(new_smc, pclc); + if (rc) + goto out_decl; + + /* get vlan id from IP device */ + if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) { + rc = SMC_CLC_DECL_GETVLANERR; + goto out_decl; } mutex_lock(&smc_server_lgr_pending); @@ -1273,59 +1285,73 @@ static void smc_listen_work(struct work_struct *work) smc_tx_init(new_smc); /* check if ISM is available */ - if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) && - !smc_check_ism(new_smc, &ismdev) && - !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) { - ism_supported = true; + if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) { + ini.is_smcd = true; /* prepare ISM check */ + rc = smc_find_ism_device(new_smc, &ini); + if (!rc) + rc = smc_listen_ism_init(new_smc, pclc, &ini); + if (!rc) + ism_supported = true; + else if (pclc->hdr.path == SMC_TYPE_D) + goto out_unlock; /* skip RDMA and decline */ } /* check if RDMA is available */ - if (!ism_supported && - 
((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) || - smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) || - smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) || - smc_listen_rdma_check(new_smc, pclc) || - smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, - &local_contact) || - smc_listen_rdma_reg(new_smc, local_contact))) { - /* SMC not supported, decline */ - mutex_unlock(&smc_server_lgr_pending); - smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP, - local_contact); - return; + if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */ + /* prepare RDMA check */ + memset(&ini, 0, sizeof(ini)); + ini.is_smcd = false; + ini.ib_lcl = &pclc->lcl; + rc = smc_find_rdma_device(new_smc, &ini); + if (rc) { + /* no RDMA device found */ + if (pclc->hdr.path == SMC_TYPE_B) + /* neither ISM nor RDMA device found */ + rc = SMC_CLC_DECL_NOSMCDEV; + goto out_unlock; + } + rc = smc_listen_rdma_init(new_smc, &ini); + if (rc) + goto out_unlock; + rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact); + if (rc) + goto out_unlock; } /* send SMC Accept CLC message */ - rc = smc_clc_send_accept(new_smc, local_contact); - if (rc) { - mutex_unlock(&smc_server_lgr_pending); - smc_listen_decline(new_smc, rc, local_contact); - return; - } + rc = smc_clc_send_accept(new_smc, ini.cln_first_contact); + if (rc) + goto out_unlock; /* SMC-D does not need this lock any more */ if (ism_supported) mutex_unlock(&smc_server_lgr_pending); /* receive SMC Confirm CLC message */ - reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), - SMC_CLC_CONFIRM, CLC_WAIT_TIME); - if (reason_code) { + rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), + SMC_CLC_CONFIRM, CLC_WAIT_TIME); + if (rc) { if (!ism_supported) - mutex_unlock(&smc_server_lgr_pending); - smc_listen_decline(new_smc, reason_code, local_contact); - return; + goto out_unlock; + goto out_decl; } /* finish worker */ if (!ism_supported) { - rc = smc_listen_rdma_finish(new_smc, &cclc, local_contact); + rc = 
smc_listen_rdma_finish(new_smc, &cclc, + ini.cln_first_contact); mutex_unlock(&smc_server_lgr_pending); if (rc) return; } smc_conn_save_peer_info(new_smc, &cclc); smc_listen_out_connected(new_smc); + return; + +out_unlock: + mutex_unlock(&smc_server_lgr_pending); +out_decl: + smc_listen_decline(new_smc, rc, ini.cln_first_contact); } static void smc_tcp_listen_work(struct work_struct *work) @@ -1501,7 +1527,7 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_FASTOPEN) { if (sk->sk_state == SMC_INIT) { - smc->use_fallback = true; + smc_switch_to_fallback(smc); smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { rc = -EINVAL; @@ -1571,8 +1597,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - __poll_t mask = 0; struct smc_sock *smc; + __poll_t mask = 0; if (!sk) return EPOLLNVAL; @@ -1582,8 +1608,6 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, /* delegate to CLC child sock */ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; - if (sk->sk_err) - mask |= EPOLLERR; } else { if (sk->sk_state != SMC_CLOSED) sock_poll_wait(file, sock, wait); @@ -1594,9 +1618,14 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, mask |= EPOLLHUP; if (sk->sk_state == SMC_LISTEN) { /* woken up by sk_data_ready in smc_listen_work() */ - mask = smc_accept_poll(sk); + mask |= smc_accept_poll(sk); + } else if (smc->use_fallback) { /* as result of connect_work()*/ + mask |= smc->clcsock->ops->poll(file, smc->clcsock, + wait); + sk->sk_err = smc->clcsock->sk->sk_err; } else { - if (atomic_read(&smc->conn.sndbuf_space) || + if ((sk->sk_state != SMC_INIT && + atomic_read(&smc->conn.sndbuf_space)) || sk->sk_shutdown & SEND_SHUTDOWN) { mask |= EPOLLOUT | EPOLLWRNORM; } else { @@ -1703,7 +1732,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_NO_COOKIE: 
/* option not supported by SMC */ if (sk->sk_state == SMC_INIT) { - smc->use_fallback = true; + smc_switch_to_fallback(smc); smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { if (!smc->use_fallback) diff --git a/net/smc/smc.h b/net/smc/smc.h index adbdf195eb08..878313f8d6c1 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -190,18 +190,11 @@ struct smc_connection { u64 peer_token; /* SMC-D token of peer */ }; -struct smc_connect_info { - int flags; - int alen; - struct sockaddr addr; -}; - struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ - struct smc_connect_info *connect_info; /* connect address & flags */ struct work_struct connect_work; /* handle non-blocking connect*/ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ @@ -219,6 +212,10 @@ struct smc_sock { /* smc sock container */ * started, waiting for unsent * data to be sent */ + u8 connect_nonblock : 1; + /* non-blocking connect in + * flight + */ struct mutex clcsock_release_lock; /* protects clcsock of a listen * socket diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index d53fd588d1f5..745afd82f281 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -385,8 +385,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) /* send CLC PROPOSAL message across internal TCP socket */ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, - struct smc_ib_device *ibdev, u8 ibport, u8 gid[], - struct smcd_dev *ismdev) + struct smc_init_info *ini) { struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX]; struct smc_clc_msg_proposal_prefix pclc_prfx; @@ -416,8 +415,9 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, /* add SMC-R specifics */ memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - 
memcpy(&pclc.lcl.gid, gid, SMC_GID_SIZE); - memcpy(&pclc.lcl.mac, &ibdev->mac[ibport - 1], ETH_ALEN); + memcpy(&pclc.lcl.gid, ini->ib_gid, SMC_GID_SIZE); + memcpy(&pclc.lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1], + ETH_ALEN); pclc.iparea_offset = htons(0); } if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) { @@ -425,7 +425,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, memset(&pclc_smcd, 0, sizeof(pclc_smcd)); plen += sizeof(pclc_smcd); pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET); - pclc_smcd.gid = ismdev->local_gid; + pclc_smcd.gid = ini->ism_dev->local_gid; } pclc.hdr.length = htons(plen); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 24658e8c0de4..ca209272e5fa 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -34,16 +34,22 @@ #define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */ #define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */ #define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */ -#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found */ +#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */ +#define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */ +#define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */ +#define SMC_CLC_DECL_SMCDNOTALK 0x03030003 /* SMC-D dev can't talk to peer */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ +#define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */ +#define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ +#define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ -#define 
SMC_CLC_DECL_INTERR 0x99990000 /* internal error */ -#define SMC_CLC_DECL_ERR_RTOK 0x99990001 /* rtoken handling failed */ -#define SMC_CLC_DECL_ERR_RDYLNK 0x99990002 /* ib ready link failed */ -#define SMC_CLC_DECL_ERR_REGRMB 0x99990003 /* reg rmb failed */ +#define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ +#define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ +#define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ +#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ struct smc_clc_msg_hdr { /* header1 of clc messages */ u8 eyecatcher[4]; /* eye catcher */ @@ -179,6 +185,7 @@ smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop) } struct smcd_dev; +struct smc_init_info; int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop); @@ -186,8 +193,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type, unsigned long timeout); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, - struct smc_ib_device *smcibdev, u8 ibport, u8 gid[], - struct smcd_dev *ismdev); + struct smc_init_info *ini); int smc_clc_send_confirm(struct smc_sock *smc); int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact); diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 2ad37e998509..fc06720b53c1 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -21,6 +21,22 @@ #define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME (5 * HZ) +/* release the clcsock that is assigned to the smc_sock */ +void smc_clcsock_release(struct smc_sock *smc) +{ + struct socket *tcp; + + if (smc->listen_smc && current_work() != &smc->smc_listen_work) + cancel_work_sync(&smc->smc_listen_work); + mutex_lock(&smc->clcsock_release_lock); + if (smc->clcsock) { + tcp = smc->clcsock; + smc->clcsock = NULL; + sock_release(tcp); + } + mutex_unlock(&smc->clcsock_release_lock); +} + static void 
smc_close_cleanup_listen(struct sock *parent) { struct sock *sk; @@ -321,6 +337,7 @@ static void smc_close_passive_work(struct work_struct *work) close_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); struct smc_cdc_conn_state_flags *rxflags; + bool release_clcsock = false; struct sock *sk = &smc->sk; int old_state; @@ -400,13 +417,13 @@ wakeup: if ((sk->sk_state == SMC_CLOSED) && (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) { smc_conn_free(conn); - if (smc->clcsock) { - sock_release(smc->clcsock); - smc->clcsock = NULL; - } + if (smc->clcsock) + release_clcsock = true; } } release_sock(sk); + if (release_clcsock) + smc_clcsock_release(smc); sock_put(sk); /* sock_hold done by schedulers of close_work */ } diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h index 19eb6a211c23..e0e3b5df25d2 100644 --- a/net/smc/smc_close.h +++ b/net/smc/smc_close.h @@ -23,5 +23,6 @@ void smc_close_wake_tx_prepared(struct smc_sock *smc); int smc_close_active(struct smc_sock *smc); int smc_close_shutdown_write(struct smc_sock *smc); void smc_close_init(struct smc_sock *smc); +void smc_clcsock_release(struct smc_sock *smc); #endif /* SMC_CLOSE_H */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 53a17cfa61af..2d2850adc2a3 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -195,10 +195,7 @@ static void smc_lgr_free_work(struct work_struct *work) } /* create a new SMC link group */ -static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, - struct smc_ib_device *smcibdev, u8 ibport, - char *peer_systemid, unsigned short vlan_id, - struct smcd_dev *smcismdev, u64 peer_gid) +static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; struct smc_link *lnk; @@ -206,20 +203,21 @@ static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, int rc = 0; int i; - if (is_smcd && vlan_id) { - rc = smc_ism_get_vlan(smcismdev, vlan_id); - if (rc) + if (ini->is_smcd && ini->vlan_id) { + if 
(smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) { + rc = SMC_CLC_DECL_ISMVLANERR; goto out; + } } lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { - rc = -ENOMEM; + rc = SMC_CLC_DECL_MEM; goto out; } - lgr->is_smcd = is_smcd; + lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; - lgr->vlan_id = vlan_id; + lgr->vlan_id = ini->vlan_id; rwlock_init(&lgr->sndbufs_lock); rwlock_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); @@ -231,29 +229,32 @@ static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); lgr->conns_all = RB_ROOT; - if (is_smcd) { + if (ini->is_smcd) { /* SMC-D specific settings */ - lgr->peer_gid = peer_gid; - lgr->smcd = smcismdev; + lgr->peer_gid = ini->ism_gid; + lgr->smcd = ini->ism_dev; } else { /* SMC-R specific settings */ lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; - memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); + memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, + SMC_SYSTEMID_LEN); lnk = &lgr->lnk[SMC_SINGLE_LINK]; /* initialize link */ lnk->state = SMC_LNK_ACTIVATING; lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = smcibdev; - lnk->ibport = ibport; - lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; - if (!smcibdev->initialized) - smc_ib_setup_per_ibdev(smcibdev); + lnk->smcibdev = ini->ib_dev; + lnk->ibport = ini->ib_port; + lnk->path_mtu = + ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + if (!ini->ib_dev->initialized) + smc_ib_setup_per_ibdev(ini->ib_dev); get_random_bytes(rndvec, sizeof(rndvec)); lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, - vlan_id, lnk->gid, &lnk->sgid_index); + ini->vlan_id, lnk->gid, + &lnk->sgid_index); if (rc) goto free_lgr; rc = smc_llc_link_init(lnk); @@ -289,6 +290,12 @@ clear_llc_lnk: free_lgr: kfree(lgr); out: + if (rc < 0) { + if (rc == -ENOMEM) + rc = SMC_CLC_DECL_MEM; + 
else + rc = SMC_CLC_DECL_INTERR; + } return rc; } @@ -528,13 +535,13 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ -int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) +int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(clcsock->sk); struct net_device *ndev; int i, nest_lvl, rc = 0; - *vlan_id = 0; + ini->vlan_id = 0; if (!dst) { rc = -ENOTCONN; goto out; @@ -546,7 +553,7 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) ndev = dst->dev; if (is_vlan_dev(ndev)) { - *vlan_id = vlan_dev_vlan_id(ndev); + ini->vlan_id = vlan_dev_vlan_id(ndev); goto out_rel; } @@ -560,7 +567,7 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) lower = lower->next; ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower); if (is_vlan_dev(ndev)) { - *vlan_id = vlan_dev_vlan_id(ndev); + ini->vlan_id = vlan_dev_vlan_id(ndev); break; } } @@ -594,24 +601,16 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, } /* create a new SMC connection (and a new link group if necessary) */ -int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, - struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn, - struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, - u64 peer_gid) +int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; - int local_contact = SMC_FIRST_CONTACT; struct smc_link_group *lgr; - unsigned short vlan_id; enum smc_lgr_role role; int rc = 0; + ini->cln_first_contact = SMC_FIRST_CONTACT; role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; - rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id); - if (rc) - return rc; - - if ((role == SMC_CLNT) && srv_first_contact) + if (role == SMC_CLNT && ini->srv_first_contact) /* create new link group as well */ goto create; @@ -619,14 +618,15 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry(lgr, &smc_lgr_list.list, list) { write_lock_bh(&lgr->conns_lock); - if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) : - smcr_lgr_match(lgr, lcl, role, clcqpn)) && + if ((ini->is_smcd ? + smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) : + smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) && !lgr->sync_err && - lgr->vlan_id == vlan_id && + lgr->vlan_id == ini->vlan_id && (role == SMC_CLNT || lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { /* link group found */ - local_contact = SMC_REUSE_CONTACT; + ini->cln_first_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; smc_lgr_register_conn(conn); /* add smc conn to lgr */ if (delayed_work_pending(&lgr->free_work)) @@ -638,19 +638,18 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, } spin_unlock_bh(&smc_lgr_list.lock); - if (role == SMC_CLNT && !srv_first_contact && - (local_contact == SMC_FIRST_CONTACT)) { + if (role == SMC_CLNT && !ini->srv_first_contact && + ini->cln_first_contact == SMC_FIRST_CONTACT) { /* Server reuses a link group, but Client wants to start * a new one * send out_of_sync decline, reason synchr. 
error */ - return -ENOLINK; + return SMC_CLC_DECL_SYNCERR; } create: - if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport, - lcl->id_for_peer, vlan_id, smcd, peer_gid); + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + rc = smc_lgr_create(smc, ini); if (rc) goto out; smc_lgr_register_conn(conn); /* add smc conn to lgr */ @@ -658,7 +657,7 @@ create: conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; - if (is_smcd) { + if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); smcd_cdc_rx_init(conn); /* init tasklet for this conn */ } @@ -667,7 +666,7 @@ create: #endif out: - return rc ? rc : local_contact; + return rc; } /* convert the RMB size into the compressed notation - minimum 16K. diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 8806d2afa6ed..c00ac61dc129 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -229,6 +229,24 @@ struct smc_link_group { }; }; +struct smc_clc_msg_local; + +struct smc_init_info { + u8 is_smcd; + unsigned short vlan_id; + int srv_first_contact; + int cln_first_contact; + /* SMC-R */ + struct smc_clc_msg_local *ib_lcl; + struct smc_ib_device *ib_dev; + u8 ib_gid[SMC_GID_SIZE]; + u8 ib_port; + u32 ib_clcqpn; + /* SMC-D */ + u64 ism_gid; + struct smcd_dev *ism_dev; +}; + /* Find the connection associated with the given alert token in the link group. * To use rbtrees we have to implement our own search core. 
* Requires @conns_lock @@ -281,13 +299,10 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); void smc_rmb_sync_sg_for_device(struct smc_connection *conn); -int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id); +int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); -int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, - struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn, - struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, - u64 peer_gid); +int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); void smcd_conn_free(struct smc_connection *conn); void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); void smc_core_exit(void); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 2fff79db1a59..e89e918b88e0 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -289,6 +289,11 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, INIT_LIST_HEAD(&smcd->vlan); smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); + if (!smcd->event_wq) { + kfree(smcd->conn); + kfree(smcd); + return NULL; + } return smcd; } EXPORT_SYMBOL_GPL(smcd_alloc_dev); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 3cdf81cf97a3..9f5d8f36f2d7 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -26,6 +26,7 @@ #include "smc_pnet.h" #include "smc_ib.h" #include "smc_ism.h" +#include "smc_core.h" #define SMC_ASCII_BLANK 32 @@ -603,7 +604,8 @@ static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - return smc_pnet_remove_by_pnetid(net, NULL); + smc_pnet_remove_by_pnetid(net, NULL); + return 0; } /* SMC_PNETID generic netlink operation definition */ @@ -755,8 +757,7 @@ static int 
smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, * IB device and port */ static void smc_pnet_find_rdma_dev(struct net_device *netdev, - struct smc_ib_device **smcibdev, - u8 *ibport, unsigned short vlan_id, u8 gid[]) + struct smc_init_info *ini) { struct smc_ib_device *ibdev; @@ -776,10 +777,10 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && - !smc_ib_determine_gid(ibdev, i, vlan_id, gid, - NULL)) { - *smcibdev = ibdev; - *ibport = i; + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, + ini->ib_gid, NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; break; } } @@ -794,9 +795,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, * If nothing found, try to use handshake device */ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, - struct smc_ib_device **smcibdev, - u8 *ibport, unsigned short vlan_id, - u8 gid[]) + struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct smc_ib_device *ibdev; @@ -806,7 +805,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { - smc_pnet_find_rdma_dev(ndev, smcibdev, ibport, vlan_id, gid); + smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } @@ -817,10 +816,10 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, continue; if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && smc_ib_port_active(ibdev, i) && - !smc_ib_determine_gid(ibdev, i, vlan_id, gid, - NULL)) { - *smcibdev = ibdev; - *ibport = i; + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, + ini->ib_gid, NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; goto out; } } @@ -830,7 +829,7 @@ out: } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, - struct smcd_dev **smcismdev) + struct smc_init_info *ini) { u8 
ndev_pnetid[SMC_MAX_PNETID_LEN]; struct smcd_dev *ismdev; @@ -844,7 +843,7 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, spin_lock(&smcd_dev_list.lock); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { if (smc_pnet_match(ismdev->pnetid, ndev_pnetid)) { - *smcismdev = ismdev; + ini->ism_dev = ismdev; break; } } @@ -855,21 +854,18 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, * determine ib_device and port belonging to used internal TCP socket * ethernet interface. */ -void smc_pnet_find_roce_resource(struct sock *sk, - struct smc_ib_device **smcibdev, u8 *ibport, - unsigned short vlan_id, u8 gid[]) +void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); - *smcibdev = NULL; - *ibport = 0; - + ini->ib_dev = NULL; + ini->ib_port = 0; if (!dst) goto out; if (!dst->dev) goto out_rel; - smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport, vlan_id, gid); + smc_pnet_find_roce_by_pnetid(dst->dev, ini); out_rel: dst_release(dst); @@ -877,17 +873,17 @@ out: return; } -void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev) +void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); - *smcismdev = NULL; + ini->ism_dev = NULL; if (!dst) goto out; if (!dst->dev) goto out_rel; - smc_pnet_find_ism_by_pnetid(dst->dev, smcismdev); + smc_pnet_find_ism_by_pnetid(dst->dev, ini); out_rel: dst_release(dst); diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 5eac42fb45d0..4564e4d69c2e 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -18,6 +18,7 @@ struct smc_ib_device; struct smcd_dev; +struct smc_init_info; /** * struct smc_pnettable - SMC PNET table anchor @@ -43,9 +44,7 @@ int smc_pnet_init(void) __init; int smc_pnet_net_init(struct net *net); void smc_pnet_exit(void); void smc_pnet_net_exit(struct net *net); -void smc_pnet_find_roce_resource(struct sock *sk, - struct 
smc_ib_device **smcibdev, u8 *ibport, - unsigned short vlan_id, u8 gid[]); -void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev); +void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini); +void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini); #endif diff --git a/net/socket.c b/net/socket.c index 8255f5bda0aa..a180e1a9ff23 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1164,6 +1164,26 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = open_related_ns(&net->ns, get_net_ns); break; + case SIOCGSTAMP_OLD: + case SIOCGSTAMPNS_OLD: + if (!sock->ops->gettstamp) { + err = -ENOIOCTLCMD; + break; + } + err = sock->ops->gettstamp(sock, argp, + cmd == SIOCGSTAMP_OLD, + !IS_ENABLED(CONFIG_64BIT)); + break; + case SIOCGSTAMP_NEW: + case SIOCGSTAMPNS_NEW: + if (!sock->ops->gettstamp) { + err = -ENOIOCTLCMD; + break; + } + err = sock->ops->gettstamp(sock, argp, + cmd == SIOCGSTAMP_NEW, + false); + break; default: err = sock_do_ioctl(net, sock, cmd, arg); break; @@ -2916,38 +2936,6 @@ void socket_seq_show(struct seq_file *seq) #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_COMPAT -static int do_siocgstamp(struct net *net, struct socket *sock, - unsigned int cmd, void __user *up) -{ - mm_segment_t old_fs = get_fs(); - struct timeval ktv; - int err; - - set_fs(KERNEL_DS); - err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv); - set_fs(old_fs); - if (!err) - err = compat_put_timeval(&ktv, up); - - return err; -} - -static int do_siocgstampns(struct net *net, struct socket *sock, - unsigned int cmd, void __user *up) -{ - mm_segment_t old_fs = get_fs(); - struct timespec kts; - int err; - - set_fs(KERNEL_DS); - err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts); - set_fs(old_fs); - if (!err) - err = compat_put_timespec(&kts, up); - - return err; -} - static int compat_dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32) { struct compat_ifconf ifc32; @@ -3347,10 
+3335,13 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCADDRT: case SIOCDELRT: return routing_ioctl(net, sock, cmd, argp); - case SIOCGSTAMP: - return do_siocgstamp(net, sock, cmd, argp); - case SIOCGSTAMPNS: - return do_siocgstampns(net, sock, cmd, argp); + case SIOCGSTAMP_OLD: + case SIOCGSTAMPNS_OLD: + if (!sock->ops->gettstamp) + return -ENOIOCTLCMD; + return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, + !COMPAT_USE_64BIT_TIME); + case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: case SIOCSHWTSTAMP: @@ -3368,6 +3359,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCADDDLCI: case SIOCDELDLCI: case SIOCGSKNS: + case SIOCGSTAMP_NEW: + case SIOCGSTAMPNS_NEW: return sock_ioctl(file, cmd, arg); case SIOCGIFFLAGS: diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 860dcfb95ee4..e137698e8aef 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -14,7 +14,8 @@ #include <linux/file.h> #include <linux/in.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> +#include <linux/init.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/poll.h> @@ -140,13 +141,11 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, /* We are going to append to the frags_list of head. * Need to unshare the frag_list. 
*/ - if (skb_has_frag_list(head)) { - err = skb_unclone(head, GFP_ATOMIC); - if (err) { - STRP_STATS_INCR(strp->stats.mem_fail); - desc->error = err; - return 0; - } + err = skb_unclone(head, GFP_ATOMIC); + if (err) { + STRP_STATS_INCR(strp->stats.mem_fail); + desc->error = err; + return 0; } if (unlikely(skb_shinfo(head)->frag_list)) { @@ -299,7 +298,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, break; } - /* Positive extra indicates ore bytes than needed for the + /* Positive extra indicates more bytes than needed for the * message */ @@ -547,7 +546,7 @@ void strp_check_rcv(struct strparser *strp) } EXPORT_SYMBOL_GPL(strp_check_rcv); -static int __init strp_mod_init(void) +static int __init strp_dev_init(void) { strp_wq = create_singlethread_workqueue("kstrp"); if (unlikely(!strp_wq)) @@ -555,11 +554,4 @@ static int __init strp_mod_init(void) return 0; } - -static void __exit strp_mod_exit(void) -{ - destroy_workqueue(strp_wq); -} -module_init(strp_mod_init); -module_exit(strp_mod_exit); -MODULE_LICENSE("GPL"); +device_initcall(strp_dev_init); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 12bb23b8e0c5..261131dfa1f1 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -54,6 +54,7 @@ static void cache_init(struct cache_head *h, struct cache_detail *detail) h->last_refresh = now; } +static inline int cache_is_valid(struct cache_head *h); static void cache_fresh_locked(struct cache_head *head, time_t expiry, struct cache_detail *detail); static void cache_fresh_unlocked(struct cache_head *head, @@ -105,6 +106,8 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, if (cache_is_expired(detail, tmp)) { hlist_del_init_rcu(&tmp->cache_list); detail->entries --; + if (cache_is_valid(tmp) == -EAGAIN) + set_bit(CACHE_NEGATIVE, &tmp->flags); cache_fresh_locked(tmp, 0, detail); freeme = tmp; break; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 187d10443a15..8ff11dc98d7f 100644 --- 
a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1540,7 +1540,6 @@ call_start(struct rpc_task *task) clnt->cl_stats->rpccnt++; task->tk_action = call_reserve; rpc_task_set_transport(task, clnt); - call_reserve(task); } /* @@ -1554,9 +1553,6 @@ call_reserve(struct rpc_task *task) task->tk_status = 0; task->tk_action = call_reserveresult; xprt_reserve(task); - if (rpc_task_need_resched(task)) - return; - call_reserveresult(task); } static void call_retry_reserve(struct rpc_task *task); @@ -1579,7 +1575,6 @@ call_reserveresult(struct rpc_task *task) if (status >= 0) { if (task->tk_rqstp) { task->tk_action = call_refresh; - call_refresh(task); return; } @@ -1605,7 +1600,6 @@ call_reserveresult(struct rpc_task *task) /* fall through */ case -EAGAIN: /* woken up; retry */ task->tk_action = call_retry_reserve; - call_retry_reserve(task); return; case -EIO: /* probably a shutdown */ break; @@ -1628,9 +1622,6 @@ call_retry_reserve(struct rpc_task *task) task->tk_status = 0; task->tk_action = call_reserveresult; xprt_retry_reserve(task); - if (rpc_task_need_resched(task)) - return; - call_reserveresult(task); } /* @@ -1645,9 +1636,6 @@ call_refresh(struct rpc_task *task) task->tk_status = 0; task->tk_client->cl_stats->rpcauthrefresh++; rpcauth_refreshcred(task); - if (rpc_task_need_resched(task)) - return; - call_refreshresult(task); } /* @@ -1666,7 +1654,6 @@ call_refreshresult(struct rpc_task *task) case 0: if (rpcauth_uptodatecred(task)) { task->tk_action = call_allocate; - call_allocate(task); return; } /* Use rate-limiting and a max number of retries if refresh @@ -1685,7 +1672,6 @@ call_refreshresult(struct rpc_task *task) task->tk_cred_retry--; dprintk("RPC: %5u %s: retry refresh creds\n", task->tk_pid, __func__); - call_refresh(task); return; } dprintk("RPC: %5u %s: refresh creds failed with error %d\n", @@ -1711,10 +1697,8 @@ call_allocate(struct rpc_task *task) task->tk_status = 0; task->tk_action = call_encode; - if (req->rq_buffer) { - call_encode(task); + if 
(req->rq_buffer) return; - } if (proc->p_proc != 0) { BUG_ON(proc->p_arglen == 0); @@ -1740,12 +1724,8 @@ call_allocate(struct rpc_task *task) status = xprt->ops->buf_alloc(task); xprt_inject_disconnect(xprt); - if (status == 0) { - if (rpc_task_need_resched(task)) - return; - call_encode(task); + if (status == 0) return; - } if (status != -ENOMEM) { rpc_exit(task, status); return; @@ -1828,8 +1808,12 @@ call_encode(struct rpc_task *task) xprt_request_enqueue_receive(task); xprt_request_enqueue_transmit(task); out: - task->tk_action = call_bind; - call_bind(task); + task->tk_action = call_transmit; + /* Check that the connection is OK */ + if (!xprt_bound(task->tk_xprt)) + task->tk_action = call_bind; + else if (!xprt_connected(task->tk_xprt)) + task->tk_action = call_connect; } /* @@ -1847,7 +1831,6 @@ rpc_task_handle_transmitted(struct rpc_task *task) { xprt_end_transmit(task); task->tk_action = call_transmit_status; - call_transmit_status(task); } /* @@ -1865,7 +1848,6 @@ call_bind(struct rpc_task *task) if (xprt_bound(xprt)) { task->tk_action = call_connect; - call_connect(task); return; } @@ -1896,7 +1878,6 @@ call_bind_status(struct rpc_task *task) dprint_status(task); task->tk_status = 0; task->tk_action = call_connect; - call_connect(task); return; } @@ -1981,7 +1962,6 @@ call_connect(struct rpc_task *task) if (xprt_connected(xprt)) { task->tk_action = call_transmit; - call_transmit(task); return; } @@ -2051,7 +2031,6 @@ call_connect_status(struct rpc_task *task) case 0: clnt->cl_stats->netreconn++; task->tk_action = call_transmit; - call_transmit(task); return; } rpc_exit(task, status); @@ -2087,9 +2066,6 @@ call_transmit(struct rpc_task *task) xprt_transmit(task); } xprt_end_transmit(task); - if (rpc_task_need_resched(task)) - return; - call_transmit_status(task); } /* @@ -2105,11 +2081,8 @@ call_transmit_status(struct rpc_task *task) * test first. 
*/ if (rpc_task_transmitted(task)) { - if (task->tk_status == 0) - xprt_request_wait_receive(task); - if (rpc_task_need_resched(task)) - return; - call_status(task); + task->tk_status = 0; + xprt_request_wait_receive(task); return; } @@ -2170,7 +2143,6 @@ call_bc_encode(struct rpc_task *task) { xprt_request_enqueue_transmit(task); task->tk_action = call_bc_transmit; - call_bc_transmit(task); } /* @@ -2195,6 +2167,9 @@ call_bc_transmit_status(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; + if (rpc_task_transmitted(task)) + task->tk_status = 0; + dprint_status(task); switch (task->tk_status) { @@ -2261,7 +2236,6 @@ call_status(struct rpc_task *task) status = task->tk_status; if (status >= 0) { task->tk_action = call_decode; - call_decode(task); return; } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 89a63391d4d4..30cfc0efe699 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -90,7 +90,7 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) /* Flush Receives, then wait for deferred Reply work * to complete. 
*/ - ib_drain_qp(ia->ri_id->qp); + ib_drain_rq(ia->ri_id->qp); drain_workqueue(buf->rb_completion_wq); /* Deferred Reply processing might have scheduled diff --git a/net/tipc/link.c b/net/tipc/link.c index 3cb9f326ee6f..6053489c8063 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -876,6 +876,8 @@ void tipc_link_reset(struct tipc_link *l) __skb_queue_head_init(&list); l->in_session = false; + /* Force re-synch of peer session number before establishing */ + l->peer_session--; l->session++; l->mtu = l->advertised_mtu; diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index bff241f03525..89993afe0fbd 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -909,7 +909,8 @@ static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg, for (; i < TIPC_NAMETBL_SIZE; i++) { head = &tn->nametbl->services[i]; - if (*last_type) { + if (*last_type || + (!i && *last_key && (*last_lower == *last_key))) { service = tipc_service_find(net, *last_type); if (!service) return -EPIPE; diff --git a/net/tipc/node.c b/net/tipc/node.c index 3469b5d4ed32..7478e2d4ec02 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -375,14 +375,15 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ - write_lock_bh(&n->lock); + tipc_node_write_lock(n); n->capabilities = capabilities; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { l = n->links[bearer_id].link; if (l) tipc_link_update_caps(l, capabilities); } - write_unlock_bh(&n->lock); + tipc_node_write_unlock_fast(n); + /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 8ac8ddf1e324..1385207a301f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3070,6 +3070,9 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, case 
TIPC_SOCK_RECVQ_DEPTH: value = skb_queue_len(&sk->sk_receive_queue); break; + case TIPC_SOCK_RECVQ_USED: + value = sk_rmem_alloc_get(sk); + break; case TIPC_GROUP_JOIN: seq.type = 0; if (tsk->group) diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 3481e4906bd6..9df82a573aa7 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -38,6 +38,8 @@ #include <linux/sysctl.h> +static int zero; +static int one = 1; static struct ctl_table_header *tipc_ctl_hdr; static struct ctl_table tipc_table[] = { @@ -46,14 +48,16 @@ static struct ctl_table tipc_table[] = { .data = &sysctl_tipc_rmem, .maxlen = sizeof(sysctl_tipc_rmem), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "named_timeout", .data = &sysctl_tipc_named_timeout, .maxlen = sizeof(sysctl_tipc_named_timeout), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "sk_filter", diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 6f166fbbfff1..0884a1b8ad12 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -354,25 +354,21 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) skb_pull(skb, sizeof(struct udphdr)); hdr = buf_msg(skb); - rcu_read_lock(); - b = rcu_dereference_rtnl(ub->bearer); + b = rcu_dereference(ub->bearer); if (!b) - goto rcu_out; + goto out; if (b && test_bit(0, &b->up)) { tipc_rcv(sock_net(sk), skb, b); - rcu_read_unlock(); return 0; } if (unlikely(msg_user(hdr) == LINK_CONFIG)) { err = tipc_udp_rcast_disc(b, skb); if (err) - goto rcu_out; + goto out; } -rcu_out: - rcu_read_unlock(); out: kfree_skb(skb); return 0; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 135a7ee9db03..cc0256939eb6 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -52,8 +52,11 @@ static DEFINE_SPINLOCK(tls_device_lock); static void tls_device_free_ctx(struct tls_context *ctx) { - if (ctx->tx_conf == TLS_HW) + if 
(ctx->tx_conf == TLS_HW) { kfree(tls_offload_ctx_tx(ctx)); + kfree(ctx->tx.rec_seq); + kfree(ctx->tx.iv); + } if (ctx->rx_conf == TLS_HW) kfree(tls_offload_ctx_rx(ctx)); @@ -216,6 +219,13 @@ void tls_device_sk_destruct(struct sock *sk) } EXPORT_SYMBOL(tls_device_sk_destruct); +void tls_device_free_resources_tx(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + + tls_free_partial_record(sk, tls_ctx); +} + static void tls_append_frag(struct tls_record_info *record, struct page_frag *pfrag, int size) @@ -894,7 +904,9 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) goto release_netdev; free_sw_resources: + up_read(&device_offload_lock); tls_sw_free_resources_rx(sk); + down_read(&device_offload_lock); release_ctx: ctx->priv_ctx_rx = NULL; release_netdev: @@ -929,8 +941,6 @@ void tls_device_offload_cleanup_rx(struct sock *sk) } out: up_read(&device_offload_lock); - kfree(tls_ctx->rx.rec_seq); - kfree(tls_ctx->rx.iv); tls_sw_release_resources_rx(sk); } diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 54c3a758f2a7..a3ebd4b02714 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -194,6 +194,9 @@ static void update_chksum(struct sk_buff *skb, int headln) static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) { + struct sock *sk = skb->sk; + int delta; + skb_copy_header(nskb, skb); skb_put(nskb, skb->len); @@ -201,11 +204,15 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) update_chksum(nskb, headln); nskb->destructor = skb->destructor; - nskb->sk = skb->sk; + nskb->sk = sk; skb->destructor = NULL; skb->sk = NULL; - refcount_add(nskb->truesize - skb->truesize, - &nskb->sk->sk_wmem_alloc); + + delta = nskb->truesize - skb->truesize; + if (likely(delta < 0)) + WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc)); + else if (delta) + refcount_add(delta, &sk->sk_wmem_alloc); } /* This function may be 
called after the user socket is already diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 0e24edab2535..fc81ae18cc44 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -208,6 +208,26 @@ int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, return tls_push_sg(sk, ctx, sg, offset, flags); } +bool tls_free_partial_record(struct sock *sk, struct tls_context *ctx) +{ + struct scatterlist *sg; + + sg = ctx->partially_sent_record; + if (!sg) + return false; + + while (1) { + put_page(sg_page(sg)); + sk_mem_uncharge(sk, sg->length); + + if (sg_is_last(sg)) + break; + sg++; + } + ctx->partially_sent_record = NULL; + return true; +} + static void tls_write_space(struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); @@ -267,13 +287,14 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) kfree(ctx->tx.rec_seq); kfree(ctx->tx.iv); tls_sw_free_resources_tx(sk); +#ifdef CONFIG_TLS_DEVICE + } else if (ctx->tx_conf == TLS_HW) { + tls_device_free_resources_tx(sk); +#endif } - if (ctx->rx_conf == TLS_SW) { - kfree(ctx->rx.rec_seq); - kfree(ctx->rx.iv); + if (ctx->rx_conf == TLS_SW) tls_sw_free_resources_rx(sk); - } #ifdef CONFIG_TLS_DEVICE if (ctx->rx_conf == TLS_HW) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 4741edf4bb1e..c02293fb10e6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2065,20 +2065,7 @@ void tls_sw_free_resources_tx(struct sock *sk) /* Free up un-sent records in tx_list. First, free * the partially sent record if any at head of tx_list. 
*/ - if (tls_ctx->partially_sent_record) { - struct scatterlist *sg = tls_ctx->partially_sent_record; - - while (1) { - put_page(sg_page(sg)); - sk_mem_uncharge(sk, sg->length); - - if (sg_is_last(sg)) - break; - sg++; - } - - tls_ctx->partially_sent_record = NULL; - + if (tls_free_partial_record(sk, tls_ctx)) { rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); list_del(&rec->list); @@ -2104,6 +2091,9 @@ void tls_sw_release_resources_rx(struct sock *sk) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + kfree(tls_ctx->rx.rec_seq); + kfree(tls_ctx->rx.iv); + if (ctx->aead_recv) { kfree_skb(ctx->recv_pkt); ctx->recv_pkt = NULL; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ddb838a1b74c..e68d7454f2e3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2040,8 +2040,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, struct unix_sock *u = unix_sk(sk); struct sk_buff *skb, *last; long timeo; + int skip; int err; - int peeked, skip; err = -EOPNOTSUPP; if (flags&MSG_OOB) @@ -2053,8 +2053,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); - skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip, - &err, &last); + skb = __skb_try_recv_datagram(sk, flags, NULL, &skip, &err, + &last); if (skb) break; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3aecdd3d5b07..e74d21f4108a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13800,7 +13800,8 @@ static const struct genl_ops nl80211_ops[] = { .doit = nl80211_associate, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_DEAUTHENTICATE, @@ -13845,14 +13846,16 @@ static const struct genl_ops nl80211_ops[] = { .doit = nl80211_connect, .flags = GENL_UNS_ADMIN_PERM, 
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_UPDATE_CONNECT_PARAMS, .doit = nl80211_update_connect_params, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_DISCONNECT, @@ -13877,7 +13880,8 @@ static const struct genl_ops nl80211_ops[] = { .doit = nl80211_setdel_pmksa, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_DEL_PMKSA, @@ -14185,7 +14189,8 @@ static const struct genl_ops nl80211_ops[] = { .dumpit = nl80211_vendor_cmd_dump, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_SET_QOS_MAP, @@ -14233,7 +14238,8 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_PMK, .doit = nl80211_set_pmk, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_DEL_PMK, diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 0d5b11d7c6ed..816425ffe05a 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1303,6 +1303,16 @@ reg_intersect_dfs_region(const enum nl80211_dfs_regions dfs_region1, return dfs_region1; } +static void reg_wmm_rules_intersect(const struct ieee80211_wmm_ac *wmm_ac1, + const struct ieee80211_wmm_ac *wmm_ac2, + struct ieee80211_wmm_ac *intersect) +{ + intersect->cw_min = max_t(u16, wmm_ac1->cw_min, wmm_ac2->cw_min); + intersect->cw_max = max_t(u16, wmm_ac1->cw_max, wmm_ac2->cw_max); + intersect->cot = min_t(u16, wmm_ac1->cot, wmm_ac2->cot); + intersect->aifsn = max_t(u8, wmm_ac1->aifsn, wmm_ac2->aifsn); +} + /* * Helper for regdom_intersect(), this 
does the real * mathematical intersection fun @@ -1317,6 +1327,8 @@ static int reg_rules_intersect(const struct ieee80211_regdomain *rd1, struct ieee80211_freq_range *freq_range; const struct ieee80211_power_rule *power_rule1, *power_rule2; struct ieee80211_power_rule *power_rule; + const struct ieee80211_wmm_rule *wmm_rule1, *wmm_rule2; + struct ieee80211_wmm_rule *wmm_rule; u32 freq_diff, max_bandwidth1, max_bandwidth2; freq_range1 = &rule1->freq_range; @@ -1327,6 +1339,10 @@ static int reg_rules_intersect(const struct ieee80211_regdomain *rd1, power_rule2 = &rule2->power_rule; power_rule = &intersected_rule->power_rule; + wmm_rule1 = &rule1->wmm_rule; + wmm_rule2 = &rule2->wmm_rule; + wmm_rule = &intersected_rule->wmm_rule; + freq_range->start_freq_khz = max(freq_range1->start_freq_khz, freq_range2->start_freq_khz); freq_range->end_freq_khz = min(freq_range1->end_freq_khz, @@ -1370,6 +1386,29 @@ static int reg_rules_intersect(const struct ieee80211_regdomain *rd1, intersected_rule->dfs_cac_ms = max(rule1->dfs_cac_ms, rule2->dfs_cac_ms); + if (rule1->has_wmm && rule2->has_wmm) { + u8 ac; + + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { + reg_wmm_rules_intersect(&wmm_rule1->client[ac], + &wmm_rule2->client[ac], + &wmm_rule->client[ac]); + reg_wmm_rules_intersect(&wmm_rule1->ap[ac], + &wmm_rule2->ap[ac], + &wmm_rule->ap[ac]); + } + + intersected_rule->has_wmm = true; + } else if (rule1->has_wmm) { + *wmm_rule = *wmm_rule1; + intersected_rule->has_wmm = true; + } else if (rule2->has_wmm) { + *wmm_rule = *wmm_rule2; + intersected_rule->has_wmm = true; + } else { + intersected_rule->has_wmm = false; + } + if (!is_valid_reg_rule(intersected_rule)) return -EINVAL; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 85dd3342d2c4..c04f5451f89b 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -241,10 +241,9 @@ static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, /* copy subelement as we need to change its content to * mark an ie after it is 
processed. */ - sub_copy = kmalloc(subie_len, gfp); + sub_copy = kmemdup(subelement, subie_len, gfp); if (!sub_copy) return 0; - memcpy(sub_copy, subelement, subie_len); pos = &new_ie[0]; diff --git a/net/wireless/util.c b/net/wireless/util.c index 6c02c9cf7aa9..cf63b635afc0 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1229,9 +1229,11 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26) result = rates_26[rate->he_gi]; - else if (WARN(1, "invalid HE MCS: bw:%d, ru:%d\n", - rate->bw, rate->he_ru_alloc)) + else { + WARN(1, "invalid HE MCS: bw:%d, ru:%d\n", + rate->bw, rate->he_ru_alloc); return 0; + } /* now scale to the appropriate MCS */ tmp = result; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 20a511398389..0ea48a52ce79 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1398,18 +1398,6 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; } - case SIOCGSTAMP: - rc = -EINVAL; - if (sk) - rc = sock_get_timestamp(sk, - (struct timeval __user *)argp); - break; - case SIOCGSTAMPNS: - rc = -EINVAL; - if (sk) - rc = sock_get_timestampns(sk, - (struct timespec __user *)argp); - break; case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: @@ -1681,8 +1669,6 @@ static int compat_x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); - struct sock *sk = sock->sk; - int rc = -ENOIOCTLCMD; switch(cmd) { @@ -1690,18 +1676,6 @@ static int compat_x25_ioctl(struct socket *sock, unsigned int cmd, case TIOCINQ: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; - case SIOCGSTAMP: - rc = -EINVAL; - if (sk) - rc = compat_sock_get_timestamp(sk, - (struct timeval __user*)argp); - break; - case SIOCGSTAMPNS: - rc = -EINVAL; - if (sk) - rc = compat_sock_get_timestampns(sk, - (struct timespec __user*)argp); - break; case SIOCGIFADDR: case SIOCSIFADDR: 
case SIOCGIFDSTADDR: @@ -1765,6 +1739,7 @@ static const struct proto_ops x25_proto_ops = { #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, #endif + .gettstamp = sock_gettstamp, .listen = x25_listen, .shutdown = sock_no_shutdown, .setsockopt = x25_setsockopt, diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 610c0bdc0c2b..88b9ae24658d 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -43,6 +43,48 @@ struct xsk_queue { u64 invalid_descs; }; +/* The structure of the shared state of the rings are the same as the + * ring buffer in kernel/events/ring_buffer.c. For the Rx and completion + * ring, the kernel is the producer and user space is the consumer. For + * the Tx and fill rings, the kernel is the consumer and user space is + * the producer. + * + * producer consumer + * + * if (LOAD ->consumer) { LOAD ->producer + * (A) smp_rmb() (C) + * STORE $data LOAD $data + * smp_wmb() (B) smp_mb() (D) + * STORE ->producer STORE ->consumer + * } + * + * (A) pairs with (D), and (B) pairs with (C). + * + * Starting with (B), it protects the data from being written after + * the producer pointer. If this barrier was missing, the consumer + * could observe the producer pointer being set and thus load the data + * before the producer has written the new data. The consumer would in + * this case load the old data. + * + * (C) protects the consumer from speculatively loading the data before + * the producer pointer actually has been read. If we do not have this + * barrier, some architectures could load old data as speculative loads + * are not discarded as the CPU does not know there is a dependency + * between ->producer and data. + * + * (A) is a control dependency that separates the load of ->consumer + * from the stores of $data. In case ->consumer indicates there is no + * room in the buffer to store $data we do not. So no barrier is needed. 
+ * + * (D) prevents the load of the data from being observed to happen after + * the store of the consumer pointer. If we did not have this memory + * barrier, the producer could observe the consumer pointer being set + * and overwrite the data with a new value before the consumer got the + * chance to read the old value. The consumer would thus miss reading + * the old entry and very likely read the new entry twice, once right + * now and again after circling through the ring. + */ + /* Common functions operating for both RXTX and umem queues */ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) @@ -106,6 +148,7 @@ static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr) static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr) { if (q->cons_tail == q->cons_head) { + smp_mb(); /* D, matches A */ WRITE_ONCE(q->ring->consumer, q->cons_tail); q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); @@ -128,10 +171,11 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) if (xskq_nb_free(q, q->prod_tail, 1) == 0) return -ENOSPC; + /* A, matches D */ ring->desc[q->prod_tail++ & q->ring_mask] = addr; /* Order producer and data */ - smp_wmb(); + smp_wmb(); /* B, matches C */ WRITE_ONCE(q->ring->producer, q->prod_tail); return 0; @@ -144,6 +188,7 @@ static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0) return -ENOSPC; + /* A, matches D */ ring->desc[q->prod_head++ & q->ring_mask] = addr; return 0; } @@ -152,7 +197,7 @@ static inline void xskq_produce_flush_addr_n(struct xsk_queue *q, u32 nb_entries) { /* Order producer and data */ - smp_wmb(); + smp_wmb(); /* B, matches C */ q->prod_tail += nb_entries; WRITE_ONCE(q->ring->producer, q->prod_tail); @@ -163,6 +208,7 @@ static inline int xskq_reserve_addr(struct xsk_queue *q) if (xskq_nb_free(q, q->prod_head, 1) == 0) return -ENOSPC; + /* A, matches D */ q->prod_head++; return 0; } @@ -204,11 
+250,12 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, struct xdp_desc *desc) { if (q->cons_tail == q->cons_head) { + smp_mb(); /* D, matches A */ WRITE_ONCE(q->ring->consumer, q->cons_tail); q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); /* Order consumer and data */ - smp_rmb(); + smp_rmb(); /* C, matches B */ } return xskq_validate_desc(q, desc); @@ -228,6 +275,7 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q, if (xskq_nb_free(q, q->prod_head, 1) == 0) return -ENOSPC; + /* A, matches D */ idx = (q->prod_head++) & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; @@ -238,7 +286,7 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q, static inline void xskq_produce_flush_desc(struct xsk_queue *q) { /* Order producer and data */ - smp_wmb(); + smp_wmb(); /* B, matches C */ q->prod_tail = q->prod_head, WRITE_ONCE(q->ring->producer, q->prod_tail); |