From aea7427f70cce5fa8f99ce447b213e9e3b49f24c Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Thu, 19 Jun 2008 16:29:39 -0700 Subject: ipv6: Remove options header when setsockopt's optlen is 0 Remove the sticky Hop-by-Hop options header by calling setsockopt() for IPV6_HOPOPTS with a zero option length, per RFC3542. Routing header and Destination options header does the same as Hop-by-Hop options header. Signed-off-by: Shan Wei Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index c042ce19bd14..86e28a75267f 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -345,18 +345,21 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, case IPV6_DSTOPTS: { struct ipv6_txoptions *opt; + + /* remove any sticky options header with a zero option + * length, per RFC3542. + */ if (optlen == 0) optval = NULL; + else if (optlen < sizeof(struct ipv6_opt_hdr) || + optlen & 0x7 || optlen > 8 * 255) + goto e_inval; /* hop-by-hop / destination options are privileged option */ retv = -EPERM; if (optname != IPV6_RTHDR && !capable(CAP_NET_RAW)) break; - if (optlen < sizeof(struct ipv6_opt_hdr) || - optlen & 0x7 || optlen > 8 * 255) - goto e_inval; - opt = ipv6_renew_options(sk, np->opt, optname, (struct ipv6_opt_hdr __user *)optval, optlen); -- cgit v1.2.3 From f630e43a215a3129d0c1173cae0bce6ea4855cf7 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Thu, 19 Jun 2008 16:33:57 -0700 Subject: ipv6: Drop packets for loopback address from outside of the box. [ Based upon original report and patch by Karsten Keil. Karsten has verified that this fixes the TAHI test case "ICMPv6 test v6LC.5.1.2 Part F". -DaveM ] Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 6 ++++++ net/ipv6/ip6_input.c | 9 +++++++++ 2 files changed, 15 insertions(+) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index e0a612bc9c4e..f422f7218e1c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -367,6 +367,12 @@ static inline int ipv6_addr_any(const struct in6_addr *a) a->s6_addr32[2] | a->s6_addr32[3] ) == 0); } +static inline int ipv6_addr_loopback(const struct in6_addr *a) +{ + return ((a->s6_addr32[0] | a->s6_addr32[1] | + a->s6_addr32[2] | (a->s6_addr32[3] ^ htonl(1))) == 0); +} + static inline int ipv6_addr_v4mapped(const struct in6_addr *a) { return ((a->s6_addr32[0] | a->s6_addr32[1] | diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 4e5c8615832c..17eb48b8e329 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -102,6 +102,15 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->version != 6) goto err; + /* + * RFC4291 2.5.3 + * A packet received on an interface with a destination address + * of loopback must be dropped. + */ + if (!(dev->flags & IFF_LOOPBACK) && + ipv6_addr_loopback(&hdr->daddr)) + goto err; + skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); -- cgit v1.2.3 From 735ce972fbc8a65fb17788debd7bbe7b4383cc62 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 20 Jun 2008 22:04:34 -0700 Subject: sctp: Make sure N * sizeof(union sctp_addr) does not overflow. As noticed by Gabriel Campana, the kmalloc() length arg passed in by sctp_getsockopt_local_addrs_old() can overflow if ->addr_num is large enough. Therefore, enforce an appropriate limit. Signed-off-by: David S. Miller --- net/sctp/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e7e3baf7009e..0dbcde6758ea 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4401,7 +4401,9 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len, if (copy_from_user(&getaddrs, optval, len)) return -EFAULT; - if (getaddrs.addr_num <= 0) return -EINVAL; + if (getaddrs.addr_num <= 0 || + getaddrs.addr_num >= (INT_MAX / sizeof(union sctp_addr))) + return -EINVAL; /* * For UDP-style sockets, id specifies the association to query. * If the id field is set to the value '0' then the locally bound -- cgit v1.2.3 From b9f75f45a6b46a0ab4eb0857d437a0845871f314 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Jun 2008 22:16:51 -0700 Subject: netns: Don't receive new packets in a dead network namespace. Alexey Dobriyan writes: > Subject: ICMP sockets destruction vs ICMP packets oops > After icmp_sk_exit() nuked ICMP sockets, we get an interrupt. > icmp_reply() wants ICMP socket. > > Steps to reproduce: > > launch shell in new netns > move real NIC to netns > setup routing > ping -i 0 > exit from shell > > BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 > IP: [] icmp_sk+0x17/0x30 > PGD 17f3cd067 PUD 17f3ce067 PMD 0 > Oops: 0000 [1] PREEMPT SMP DEBUG_PAGEALLOC > CPU 0 > Modules linked in: usblp usbcore > Pid: 0, comm: swapper Not tainted 2.6.26-rc6-netns-ct #4 > RIP: 0010:[] [] icmp_sk+0x17/0x30 > RSP: 0018:ffffffff8057fc30 EFLAGS: 00010286 > RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff81017c7db900 > RDX: 0000000000000034 RSI: ffff81017c7db900 RDI: ffff81017dc41800 > RBP: ffffffff8057fc40 R08: 0000000000000001 R09: 000000000000a815 > R10: 0000000000000000 R11: 0000000000000001 R12: ffffffff8057fd28 > R13: ffffffff8057fd00 R14: ffff81017c7db938 R15: ffff81017dc41800 > FS: 0000000000000000(0000) GS:ffffffff80525000(0000) knlGS:0000000000000000 > CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b > CR2: 0000000000000000 CR3: 000000017fcda000 CR4: 00000000000006e0 > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > Process swapper (pid: 0, threadinfo ffffffff8053a000, task ffffffff804fa4a0) > Stack: 0000000000000000 ffff81017c7db900 ffffffff8057fcf0 ffffffff803fcfe4 > ffffffff804faa38 0000000000000246 0000000000005a40 0000000000000246 > 000000000001ffff ffff81017dd68dc0 0000000000005a40 0000000055342436 > Call Trace: > [] icmp_reply+0x44/0x1e0 > [] ? ip_route_input+0x23a/0x1360 > [] icmp_echo+0x65/0x70 > [] icmp_rcv+0x180/0x1b0 > [] ip_local_deliver+0xf4/0x1f0 > [] ip_rcv+0x33b/0x650 > [] netif_receive_skb+0x27a/0x340 > [] process_backlog+0x9d/0x100 > [] net_rx_action+0x18d/0x250 > [] __do_softirq+0x75/0x100 > [] call_softirq+0x1c/0x30 > [] do_softirq+0x65/0xa0 > [] irq_exit+0x97/0xa0 > [] do_IRQ+0xa8/0x130 > [] ? mwait_idle+0x0/0x60 > [] ret_from_intr+0x0/0xf > [] ? mwait_idle+0x4c/0x60 > [] ? mwait_idle+0x43/0x60 > [] ? cpu_idle+0x57/0xa0 > [] ? rest_init+0x70/0x80 > Code: 10 5b 41 5c 41 5d 41 5e c9 c3 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 53 > 48 83 ec 08 48 8b 9f 78 01 00 00 e8 2b c7 f1 ff 89 c0 <48> 8b 04 c3 48 83 c4 08 > 5b c9 c3 66 66 66 66 66 2e 0f 1f 84 00 > RIP [] icmp_sk+0x17/0x30 > RSP > CR2: 0000000000000000 > ---[ end trace ea161157b76b33e8 ]--- > Kernel panic - not syncing: Aiee, killing interrupt handler! Receiving packets while we are cleaning up a network namespace is a racy proposition. It is possible when the packet arrives that we have removed some but not all of the state we need to fully process it. We have the choice of either playing wack-a-mole with the cleanup routines or simply dropping packets when we don't have a network namespace to handle them. Since the check looks inexpensive in netif_receive_skb let's just drop the incoming packets. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/net/net_namespace.h | 11 +++++++++++ net/core/dev.c | 4 ++++ net/core/net_namespace.c | 3 +++ 3 files changed, 18 insertions(+) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index aa540e6be502..d9dd0f707296 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -95,6 +95,11 @@ extern struct list_head net_namespace_list; #ifdef CONFIG_NET_NS extern void __put_net(struct net *net); +static inline int net_alive(struct net *net) +{ + return net && atomic_read(&net->count); +} + static inline struct net *get_net(struct net *net) { atomic_inc(&net->count); @@ -125,6 +130,12 @@ int net_eq(const struct net *net1, const struct net *net2) return net1 == net2; } #else + +static inline int net_alive(struct net *net) +{ + return 1; +} + static inline struct net *get_net(struct net *net) { return net; diff --git a/net/core/dev.c b/net/core/dev.c index 68d8df0992ab..c421a1f8f0b9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2077,6 +2077,10 @@ int netif_receive_skb(struct sk_buff *skb) rcu_read_lock(); + /* Don't receive packets in an exiting network namespace */ + if (!net_alive(dev_net(skb->dev))) + goto out; + #ifdef CONFIG_NET_CLS_ACT if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 72b4c184dd84..7c52fe277b62 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -140,6 +140,9 @@ static void cleanup_net(struct work_struct *work) struct pernet_operations *ops; struct net *net; + /* Be very certain incoming network packets will not find us */ + rcu_barrier(); + net = container_of(work, struct net, work); mutex_lock(&net_mutex); -- cgit v1.2.3 From 88a6f4ad76be425f47df7f892baf913bcd466fb3 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 24 Jun 2008 13:30:45 -0700 Subject: netfilter: ip6table_mangle: don't reroute in LOCAL_IN Rerouting should only happen in LOCAL_OUT, in INPUT its useless since the packet has already chosen its final destination. Noticed by Alexey Dobriyan . Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv6/netfilter/ip6table_mangle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 27a5e8b48d93..f405cea21a8b 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -129,7 +129,7 @@ static struct nf_hook_ops ip6t_ops[] __read_mostly = { .priority = NF_IP6_PRI_MANGLE, }, { - .hook = ip6t_local_hook, + .hook = ip6t_route_hook, .owner = THIS_MODULE, .pf = PF_INET6, .hooknum = NF_INET_LOCAL_IN, -- cgit v1.2.3 From 59d393ad92f719d9ef36b96eae56d4817a7eeb10 Mon Sep 17 00:00:00 2001 From: Tony Vroon Date: Wed, 11 Jun 2008 16:23:56 -0400 Subject: mac80211: implement EU regulatory domain Implement missing EU regulatory domain for mac80211. Based on the information in IEEE 802.11-2007 (specifically pages 1142, 1143 & 1148) and ETSI 301 893 (V1.4.1). With thanks to Johannes Berg. Signed-off-by: Tony Vroon Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/reg.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 185488da2466..855bff4b3250 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -80,6 +80,23 @@ static const struct ieee80211_channel_range ieee80211_JP_channels[] = { IEEE80211_CHAN_RADAR), }; +static const struct ieee80211_channel_range ieee80211_EU_channels[] = { + /* IEEE 802.11b/g, channels 1..13 */ + RANGE_PWR(2412, 2472, 20, 6, 0), + /* IEEE 802.11a, channel 36*/ + RANGE_PWR(5180, 5180, 23, 6, IEEE80211_CHAN_PASSIVE_SCAN), + /* IEEE 802.11a, channel 40*/ + RANGE_PWR(5200, 5200, 23, 6, IEEE80211_CHAN_PASSIVE_SCAN), + /* IEEE 802.11a, channel 44*/ + RANGE_PWR(5220, 5220, 23, 6, IEEE80211_CHAN_PASSIVE_SCAN), + /* IEEE 802.11a, channels 48..64 */ + RANGE_PWR(5240, 5320, 23, 6, IEEE80211_CHAN_NO_IBSS | + IEEE80211_CHAN_RADAR), + /* IEEE 802.11a, channels 100..140 */ + RANGE_PWR(5500, 5700, 30, 6, IEEE80211_CHAN_NO_IBSS | + IEEE80211_CHAN_RADAR), +}; + #define REGDOM(_code) \ { \ .code = __stringify(_code), \ @@ -90,6 +107,7 @@ static const struct ieee80211_channel_range ieee80211_JP_channels[] = { static const struct ieee80211_regdomain ieee80211_regdoms[] = { REGDOM(US), REGDOM(JP), + REGDOM(EU), }; -- cgit v1.2.3 From 00eb7fe77eb455f807c396f9917f0f623d4c84bb Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Thu, 26 Jun 2008 12:13:46 +0300 Subject: mac80211: fix an oops in several failure paths in key allocation This patch fixes an oops in several failure paths in key allocation. This Oops occurs when freeing a key that has not been linked yet, so the key->sdata is not set. Signed-off-by: Emmanuel Grumbach Signed-off-by: Tomas Winkler Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/key.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 150d66dbda9d..220e83be3ef4 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -380,6 +380,15 @@ void ieee80211_key_free(struct ieee80211_key *key) if (!key) return; + if (!key->sdata) { + /* The key has not been linked yet, simply free it + * and don't Oops */ + if (key->conf.alg == ALG_CCMP) + ieee80211_aes_key_free(key->u.ccmp.tfm); + kfree(key); + return; + } + spin_lock_irqsave(&key->sdata->local->key_lock, flags); __ieee80211_key_free(key); spin_unlock_irqrestore(&key->sdata->local->key_lock, flags); -- cgit v1.2.3 From 57413ebc4e0f1e471a3b4db4aff9a85c083d090e Mon Sep 17 00:00:00 2001 From: Miquel van Smoorenburg Date: Fri, 27 Jun 2008 17:23:57 -0700 Subject: tcp: calculate tcp_mem based on low memory instead of all memory The tcp_mem array which contains limits on the total amount of memory used by TCP sockets is calculated based on nr_all_pages. On a 32 bits x86 system, we should base this on the number of lowmem pages. Signed-off-by: Miquel van Smoorenburg Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fc54a48fde1e..850825dc86e6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -260,6 +260,8 @@ #include #include #include +#include +#include #include #include #include @@ -2620,7 +2622,7 @@ __setup("thash_entries=", set_thash_entries); void __init tcp_init(void) { struct sk_buff *skb = NULL; - unsigned long limit; + unsigned long nr_pages, limit; int order, i, max_share; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -2689,8 +2691,9 @@ void __init tcp_init(void) * is up to 1/2 at 256 MB, decreasing toward zero with the amount of * memory, with a floor of 128 pages. */ - limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); - limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); + nr_pages = totalram_pages - totalhigh_pages; + limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); + limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); limit = max(limit, 128UL); sysctl_tcp_mem[0] = limit / 4 * 3; sysctl_tcp_mem[1] = limit; -- cgit v1.2.3 From db43a282d3ec92ea45109c5551fff3dcc5afef02 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Fri, 27 Jun 2008 17:27:21 -0700 Subject: tcp: fix for splice receive when used with software LRO If an skb has nr_frags set to zero but its frag_list is not empty (as it can happen if software LRO is enabled), and a previous tcp_read_sock has consumed the linear part of the skb, then __skb_splice_bits: (a) incorrectly reports an error and (b) forgets to update the offset to account for the linear part Any of the two problems will cause the subsequent __skb_splice_bits call (the one that handles the frag_list skbs) to either skip data, or, if the unadjusted offset is greater then the size of the next skb in the frag_list, make tcp_splice_read loop forever. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/core/skbuff.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1e556d312117..366621610e76 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1292,12 +1292,14 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, { unsigned int nr_pages = spd->nr_pages; unsigned int poff, plen, len, toff, tlen; - int headlen, seg; + int headlen, seg, error = 0; toff = *offset; tlen = *total_len; - if (!tlen) + if (!tlen) { + error = 1; goto err; + } /* * if the offset is greater than the linear part, go directly to @@ -1339,7 +1341,8 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, * just jump directly to update and return, no point * in going over fragments when the output is full. */ - if (spd_fill_page(spd, virt_to_page(p), plen, poff, skb)) + error = spd_fill_page(spd, virt_to_page(p), plen, poff, skb); + if (error) goto done; tlen -= plen; @@ -1369,7 +1372,8 @@ map_frag: if (!plen) break; - if (spd_fill_page(spd, f->page, plen, poff, skb)) + error = spd_fill_page(spd, f->page, plen, poff, skb); + if (error) break; tlen -= plen; @@ -1382,7 +1386,10 @@ done: return 0; } err: - return 1; + /* update the offset to reflect the linear part skip, if any */ + if (!error) + *offset = toff; + return error; } /* -- cgit v1.2.3 From ec0d215f9420564fc8286dcf93d2d068bb53a07e Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Fri, 27 Jun 2008 19:34:18 -0700 Subject: af_unix: fix 'poll for write'/connected DGRAM sockets For n:1 'datagram connections' (eg /dev/log), the unix_dgram_sendmsg routine implements a form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for PF_UNIX non-stream sockets connected to server sockets dealing with (potentially) multiple clients if the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_recvmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat Signed-off-by: David S. Miller --- net/unix/af_unix.c | 52 ++++++++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 657835f227d3..783317dacd30 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -487,8 +487,8 @@ static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, int); static int unix_getname(struct socket *, struct sockaddr *, int *, int); static unsigned int unix_poll(struct file *, struct socket *, poll_table *); -static unsigned int unix_datagram_poll(struct file *, struct socket *, - poll_table *); +static unsigned int unix_dgram_poll(struct file *, struct socket *, + poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct kiocb *, struct socket *, @@ -534,7 +534,7 @@ static const struct proto_ops unix_dgram_ops = { .socketpair = unix_socketpair, .accept = sock_no_accept, .getname = unix_getname, - .poll = unix_datagram_poll, + .poll = unix_dgram_poll, .ioctl = unix_ioctl, .listen = sock_no_listen, .shutdown = unix_shutdown, @@ -555,7 +555,7 @@ static const struct proto_ops unix_seqpacket_ops = { .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, - .poll = unix_datagram_poll, + .poll = unix_dgram_poll, .ioctl = unix_ioctl, .listen = unix_listen, .shutdown = unix_shutdown, @@ -1994,29 +1994,13 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl return mask; } -static unsigned int unix_datagram_poll(struct file *file, struct socket *sock, - poll_table *wait) +static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, + poll_table *wait) { - struct sock *sk = sock->sk, *peer; - unsigned int mask; + struct sock *sk = sock->sk, *other; + unsigned int mask, writable; poll_wait(file, sk->sk_sleep, wait); - - peer = unix_peer_get(sk); - if (peer) { - if (peer != sk) { - /* - * Writability of a connected socket additionally - * depends on the state of the receive queue of the - * peer. - */ - poll_wait(file, &unix_sk(peer)->peer_wait, wait); - } else { - sock_put(peer); - peer = NULL; - } - } - mask = 0; /* exceptional events? */ @@ -2042,14 +2026,26 @@ static unsigned int unix_datagram_poll(struct file *file, struct socket *sock, } /* writable? */ - if (unix_writable(sk) && !(peer && unix_recvq_full(peer))) + writable = unix_writable(sk); + if (writable) { + other = unix_peer_get(sk); + if (other) { + if (unix_peer(other) != sk) { + poll_wait(file, &unix_sk(other)->peer_wait, + wait); + if (unix_recvq_full(other)) + writable = 0; + } + + sock_put(other); + } + } + + if (writable) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - if (peer) - sock_put(peer); - return mask; } -- cgit v1.2.3 From 5dbaec5dc6a4895db8bf9765a867418481ed7311 Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Fri, 27 Jun 2008 19:35:16 -0700 Subject: netdevice: Fix typo of dev_unicast_add() comment Signed-off-by: Wang Chen Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c421a1f8f0b9..56b46579ff4e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2973,7 +2973,7 @@ EXPORT_SYMBOL(dev_unicast_delete); /** * dev_unicast_add - add a secondary unicast address * @dev: device - * @addr: address to delete + * @addr: address to add * @alen: length of @addr * * Add a secondary unicast address to the device or increase -- cgit v1.2.3 From 01e123d79a23000f85c4cfb12a957908c0b2c3d8 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 27 Jun 2008 19:51:35 -0700 Subject: pkt_sched: ERR_PTR() ususally encodes an negative errno, not positive. Note, in the following patch, 'err' is initialized as: int err = -ENOBUFS; Signed-off-by: WANG Cong Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index d355e5e47fe3..13afa7214392 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -468,7 +468,7 @@ struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops) return sch; errout: - return ERR_PTR(-err); + return ERR_PTR(err); } struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops, -- cgit v1.2.3 From ede16af4cdbd21fa15d4178beb7c6fcbcccd07e9 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 27 Jun 2008 19:54:05 -0700 Subject: pkt_sched: Remove CONFIG_NET_SCH_RR Commit d62733c8e437fdb58325617c4b3331769ba82d70 ([SCHED]: Qdisc changes and sch_rr added for multiqueue) added a NET_SCH_RR option that was unused since the code went unconditionally into sch_prio. Reported-by: Robert P. J. Day Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- net/sched/Kconfig | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 82adfe6447d7..9437b27ff84d 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -106,17 +106,6 @@ config NET_SCH_PRIO To compile this code as a module, choose M here: the module will be called sch_prio. -config NET_SCH_RR - tristate "Multi Band Round Robin Queuing (RR)" - select NET_SCH_PRIO - ---help--- - Say Y here if you want to use an n-band round robin packet - scheduler. - - The module uses sch_prio for its framework and is aliased as - sch_rr, so it will load sch_prio, although it is referred - to using sch_rr. - config NET_SCH_RED tristate "Random Early Detection (RED)" ---help--- -- cgit v1.2.3 From 7be87351a1f6430426e88b4fcde353ab3330caff Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 27 Jun 2008 20:00:19 -0700 Subject: tcp: /proc/net/tcp rto,ato values not scaled properly (v2) I found another case where we are sending information to userspace in the wrong HZ scale. This should have been fixed back in 2.5 :-( This means an ABI change but as it stands there is no way for an application like ss to get the right value. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 6 +++--- net/ipv6/tcp_ipv6.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 12695be2c255..ffe869ac1bcf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2291,7 +2291,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) } seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " - "%08X %5d %8d %lu %d %p %u %u %u %u %d%n", + "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", i, src, srcp, dest, destp, sk->sk_state, tp->write_seq - tp->snd_una, sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog : @@ -2303,8 +2303,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) icsk->icsk_probes_out, sock_i_ino(sk), atomic_read(&sk->sk_refcnt), sk, - icsk->icsk_rto, - icsk->icsk_ack.ato, + jiffies_to_clock_t(icsk->icsk_rto), + jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index cb46749d4c32..40ea9c36d24b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2036,7 +2036,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d\n", + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %lu %lu %u %u %d\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, @@ -2052,8 +2052,8 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) icsk->icsk_probes_out, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - icsk->icsk_rto, - icsk->icsk_ack.ato, + jiffies_to_clock_t(icsk->icsk_rto), + jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong, tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh ); -- cgit v1.2.3 From 10b595aff138961b520bfed51d664fd99980f6e9 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Fri, 27 Jun 2008 20:02:14 -0700 Subject: netlink: Fix some doc comments in net/netlink/attr.c Fix some doc comments to match function and attribute names in net/netlink/attr.c. Signed-off-by: Julius Volz Signed-off-by: David S. Miller --- net/netlink/attr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netlink/attr.c b/net/netlink/attr.c index 47bbf45ae5d7..2d106cfe1d27 100644 --- a/net/netlink/attr.c +++ b/net/netlink/attr.c @@ -132,6 +132,7 @@ errout: * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream + * @policy: validation policy * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessable via the attribute type. Attributes with a type @@ -194,7 +195,7 @@ struct nlattr *nla_find(struct nlattr *head, int len, int attrtype) /** * nla_strlcpy - Copy string attribute payload into a sized buffer * @dst: where to copy the string to - * @src: attribute to copy the string from + * @nla: attribute to copy the string from * @dstsize: size of destination buffer * * Copies at most dstsize - 1 bytes into the destination buffer. @@ -340,9 +341,9 @@ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) } /** - * nla_reserve - reserve room for attribute without header + * nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on - * @len: length of attribute payload + * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * -- cgit v1.2.3 From 9a375803feaadb6c34e0807bd9325885dcca5c00 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 27 Jun 2008 20:06:08 -0700 Subject: inet fragments: fix race between inet_frag_find and inet_frag_secret_rebuild The problem is that while we work w/o the inet_frags.lock even read-locked the secret rebuild timer may occur (on another CPU, since BHs are still disabled in the inet_frag_find) and change the rnd seed for ipv4/6 fragments. It was caused by my patch fd9e63544cac30a34c951f0ec958038f0529e244 ([INET]: Omit double hash calculations in xxx_frag_intern) late in the 2.6.24 kernel, so this should probably be queued to -stable. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 16 +++++++++++----- net/ipv4/ip_fragment.c | 2 ++ net/ipv6/netfilter/nf_conntrack_reasm.c | 3 ++- net/ipv6/reassembly.c | 2 ++ 4 files changed, 17 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 4ed429bd5951..0546a0bc97ea 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -192,14 +192,21 @@ EXPORT_SYMBOL(inet_frag_evictor); static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, struct inet_frag_queue *qp_in, struct inet_frags *f, - unsigned int hash, void *arg) + void *arg) { struct inet_frag_queue *qp; #ifdef CONFIG_SMP struct hlist_node *n; #endif + unsigned int hash; write_lock(&f->lock); + /* + * While we stayed w/o the lock other CPU could update + * the rnd seed, so we need to re-calculate the hash + * chain. Fortunatelly the qp_in can be used to get one. + */ + hash = f->hashfn(qp_in); #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because * such entry could be created on other cpu, while we @@ -247,7 +254,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, } static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, - struct inet_frags *f, void *arg, unsigned int hash) + struct inet_frags *f, void *arg) { struct inet_frag_queue *q; @@ -255,7 +262,7 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, if (q == NULL) return NULL; - return inet_frag_intern(nf, q, f, hash, arg); + return inet_frag_intern(nf, q, f, arg); } struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, @@ -264,7 +271,6 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frag_queue *q; struct hlist_node *n; - read_lock(&f->lock); hlist_for_each_entry(q, n, &f->hash[hash], list) { if (q->net == nf && f->match(q, key)) { atomic_inc(&q->refcnt); @@ -274,6 +280,6 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, } read_unlock(&f->lock); - return inet_frag_create(nf, f, key, hash); + return inet_frag_create(nf, f, key); } EXPORT_SYMBOL(inet_frag_find); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index cd6ce6ac6358..37221f659159 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -229,6 +229,8 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) arg.iph = iph; arg.user = user; + + read_lock(&ip4_frags.lock); hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index e65e26e210ee..cf20bc4fd60d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -207,9 +207,10 @@ fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst) arg.id = id; arg.src = src; arg.dst = dst; + + read_lock_bh(&nf_frags.lock); hash = ip6qhashfn(id, src, dst); - local_bh_disable(); q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash); local_bh_enable(); if (q == NULL) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 798cabc7535b..a60d7d129713 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -247,6 +247,8 @@ fq_find(struct net *net, __be32 id, struct in6_addr *src, struct in6_addr *dst, arg.id = id; arg.src = src; arg.dst = dst; + + read_lock(&ip6_frags.lock); hash = ip6qhashfn(id, src, dst); q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); -- cgit v1.2.3 From 251a4b320f2352598f84e4452ab538aa8064af52 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Fri, 27 Jun 2008 20:09:00 -0700 Subject: net/inet_lro: remove setting skb->ip_summed when not LRO-able When an SKB cannot be chained to a session, the current code attempts to "restore" its ip_summed field from lro_mgr->ip_summed. However, lro_mgr->ip_summed does not hold the original value; in fact, we'd better not touch skb->ip_summed since it is not modified by the code in the path leading to a failure to chain it. Also use a cleaer comment to the describe the ip_summed field of struct net_lro_mgr. Issue raised by Or Gerlitz Signed-off-by: Eli Cohen Signed-off-by: David S. Miller --- include/linux/inet_lro.h | 6 +++++- net/ipv4/inet_lro.c | 3 +-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/inet_lro.h b/include/linux/inet_lro.h index 80335b7d77c5..c4335faebb63 100644 --- a/include/linux/inet_lro.h +++ b/include/linux/inet_lro.h @@ -84,7 +84,11 @@ struct net_lro_mgr { from received packets and eth protocol is still ETH_P_8021Q */ - u32 ip_summed; /* Set in non generated SKBs in page mode */ + /* + * Set for generated SKBs that are not added to + * the frag list in fragmented mode + */ + u32 ip_summed; u32 ip_summed_aggr; /* Set in aggregated SKBs: CHECKSUM_UNNECESSARY * or CHECKSUM_NONE */ diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index 4a4d49fca1f2..cfd034a2b96e 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -383,8 +383,7 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, out2: /* send aggregated SKBs to stack */ lro_flush(lro_mgr, lro_desc); -out: /* Original SKB has to be posted to stack */ - skb->ip_summed = lro_mgr->ip_summed; +out: return 1; } -- cgit v1.2.3 From 59d88c00cafe5192b058abf4f3ce17c2e27d1c09 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 27 Jun 2008 20:12:32 -0700 Subject: netlabel: Fix a problem when dumping the default IPv6 static labels There is a missing "!" in a conditional statement which is causing entries to be skipped when dumping the default IPv6 static label entries. This can be demonstrated by running the following: # netlabelctl unlbl add default address:::1 \ label:system_u:object_r:unlabeled_t:s0 # netlabelctl -p unlbl list ... you will notice that the entry for the IPv6 localhost address is not displayed but does exist (works correctly, causes collisions when attempting to add duplicate entries, etc.). Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- net/netlabel/netlabel_unlabeled.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 0099da5b2591..52b2611a6eb6 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1534,7 +1534,7 @@ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, } } list_for_each_entry_rcu(addr6, &iface->addr6_list, list) { - if (addr6->valid || iter_addr6++ < skip_addr6) + if (!addr6->valid || iter_addr6++ < skip_addr6) continue; if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, iface, -- cgit v1.2.3 From d420895efb259a78dda50f95289571faa6e10e41 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Fri, 27 Jun 2008 20:14:54 -0700 Subject: ipv6 route: Convert rt6_device_match() to use RT6_LOOKUP_F_xxx flags. The commit 77d16f450ae0452d7d4b009f78debb1294fb435c ("[IPV6] ROUTE: Unify RT6_F_xxx and RT6_SELECT_F_xxx flags") intended to pass various routing lookup hints around RT6_LOOKUP_F_xxx flags, but conversion was missing for rt6_device_match(). Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d1f3e19b06c7..7ff687020fa9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -240,7 +240,7 @@ static inline int rt6_need_strict(struct in6_addr *daddr) static inline struct rt6_info *rt6_device_match(struct net *net, struct rt6_info *rt, int oif, - int strict) + int flags) { struct rt6_info *local = NULL; struct rt6_info *sprt; @@ -253,7 +253,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (dev->flags & IFF_LOOPBACK) { if (sprt->rt6i_idev == NULL || sprt->rt6i_idev->dev->ifindex != oif) { - if (strict && oif) + if (flags & RT6_LOOKUP_F_IFACE && oif) continue; if (local && (!oif || local->rt6i_idev->dev->ifindex == oif)) @@ -266,7 +266,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (local) return local; - if (strict) + if (flags & RT6_LOOKUP_F_IFACE) return net->ipv6.ip6_null_entry; } return rt; -- cgit v1.2.3