From 07dc8bc9a6b15f54d3ad962af74a096c7d7b42b4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 7 Nov 2017 10:08:01 +0000 Subject: netfilter: remove redundant assignment to e The assignment to variable e is redundant since the same assignment occurs just a few lines later, hence it can be removed. Cleans up clang warning for arp_tables, ip_tables and ip6_tables: warning: Value stored to 'e' is never read Signed-off-by: Colin Ian King Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 1 - net/ipv4/netfilter/ip_tables.c | 1 - net/ipv6/netfilter/ip6_tables.c | 1 - 3 files changed, 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index f88221aebc9d..0c3c944a7b72 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -373,7 +373,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4cbe5e80f3bf..2e0d339028bb 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -439,7 +439,6 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index f06e25065a34..1d7ae9366335 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -458,7 +458,6 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; -- cgit v1.2.3 From 613d0776d3fe7eb28c695a63a5533a1ec8258c86 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 12 Nov 2017 14:32:37 +0300 Subject: netfilter: exit_net cleanup check added Be sure that lists initialized in net_init hook was return to initial state. Signed-off-by: Vasily Averin Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 1 + net/netfilter/nf_tables_api.c | 7 +++++++ net/netfilter/nfnetlink_log.c | 5 +++++ net/netfilter/nfnetlink_queue.c | 5 +++++ net/netfilter/x_tables.c | 9 +++++++++ 5 files changed, 27 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 17b4ca562944..e35b8d074f06 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -819,6 +819,7 @@ static void clusterip_net_exit(struct net *net) cn->procdir = NULL; #endif nf_unregister_net_hook(net, &cip_arp_ops); + WARN_ON_ONCE(!list_empty(&cn->configs)); } static struct pernet_operations clusterip_net_ops = { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d8327b43e4dc..10798b357481 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5847,6 +5847,12 @@ static int __net_init nf_tables_init_net(struct net *net) return 0; } +static void __net_exit nf_tables_exit_net(struct net *net) +{ + WARN_ON_ONCE(!list_empty(&net->nft.af_info)); + WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); +} + int __nft_release_basechain(struct nft_ctx *ctx) { struct nft_rule *rule, *nr; @@ -5917,6 +5923,7 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, + .exit = nf_tables_exit_net, }; static int __init nf_tables_module_init(void) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index cad6498f10b0..1f511ed0fea3 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -1093,10 +1093,15 @@ static int __net_init nfnl_log_net_init(struct net *net) static void __net_exit nfnl_log_net_exit(struct net *net) { + struct nfnl_log_net *log = nfnl_log_pernet(net); + unsigned int i; + #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); #endif nf_log_unset(net, &nfulnl_logger); + for (i = 0; i < INSTANCE_BUCKETS; i++) + WARN_ON_ONCE(!hlist_empty(&log->instance_table[i])); } static struct pernet_operations nfnl_log_net_ops = { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index a16356cacec3..c09b36755ed7 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1512,10 +1512,15 @@ static int __net_init nfnl_queue_net_init(struct net *net) static void __net_exit nfnl_queue_net_exit(struct net *net) { + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + unsigned int i; + nf_unregister_queue_handler(net); #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); #endif + for (i = 0; i < INSTANCE_BUCKETS; i++) + WARN_ON_ONCE(!hlist_empty(&q->instance_table[i])); } static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index a77dd514297c..55802e97f906 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1729,8 +1729,17 @@ static int __net_init xt_net_init(struct net *net) return 0; } +static void __net_exit xt_net_exit(struct net *net) +{ + int i; + + for (i = 0; i < NFPROTO_NUMPROTO; i++) + WARN_ON_ONCE(!list_empty(&net->xt.tables[i])); +} + static struct pernet_operations xt_net_ops = { .init = xt_net_init, + .exit = xt_net_exit, }; static int __init xt_init(void) -- cgit v1.2.3 From bc7d811ace4ad39a3941089ca871633366878719 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Mon, 13 Nov 2017 09:09:40 +0100 Subject: netfilter: nf_ct_h323: Convert CHECK_BOUND macro to function It is bad practive to return in a macro, this patch moves the check into a function. Signed-off-by: Eric Sesterhenn Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_h323_asn1.c | 94 +++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index cf1bf2605c10..3d9a009ac147 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -103,7 +103,6 @@ struct bitstr { #define INC_BIT(bs) if((++(bs)->bit)>7){(bs)->cur++;(bs)->bit=0;} #define INC_BITS(bs,b) if(((bs)->bit+=(b))>7){(bs)->cur+=(bs)->bit>>3;(bs)->bit&=7;} #define BYTE_ALIGN(bs) if((bs)->bit){(bs)->cur++;(bs)->bit=0;} -#define CHECK_BOUND(bs,n) if((bs)->cur+(n)>(bs)->end)return(H323_ERROR_BOUND) static unsigned int get_len(struct bitstr *bs); static unsigned int get_bit(struct bitstr *bs); static unsigned int get_bits(struct bitstr *bs, unsigned int b); @@ -165,6 +164,14 @@ static unsigned int get_len(struct bitstr *bs) return v; } +static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes) +{ + if (*bs->cur + bytes > *bs->end) + return 1; + + return 0; +} + /****************************************************************************/ static unsigned int get_bit(struct bitstr *bs) { @@ -280,7 +287,8 @@ static int decode_bool(struct bitstr *bs, const struct field_t *f, INC_BIT(bs); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -293,11 +301,14 @@ static int decode_oid(struct bitstr *bs, const struct field_t *f, PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1)) + return H323_ERROR_BOUND; + len = *bs->cur++; bs->cur += len; + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; - CHECK_BOUND(bs, 0); return H323_ERROR_NONE; } @@ -330,7 +341,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, break; case UNCO: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); bs->cur += len; break; @@ -341,7 +353,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, PRINT("\n"); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -357,7 +370,8 @@ static int decode_enum(struct bitstr *bs, const struct field_t *f, INC_BITS(bs, f->sz); } - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -375,12 +389,14 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, len = f->lb; break; case WORD: /* 2-byte length */ - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = (*bs->cur++) << 8; len += (*bs->cur++) + f->lb; break; case SEMI: - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); break; default: @@ -391,7 +407,8 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, bs->cur += len >> 3; bs->bit = len & 7; - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -409,7 +426,8 @@ static int decode_numstr(struct bitstr *bs, const struct field_t *f, BYTE_ALIGN(bs); INC_BITS(bs, (len << 2)); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -440,12 +458,14 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f, break; case BYTE: /* Range == 256 */ BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1)) + return H323_ERROR_BOUND; len = (*bs->cur++) + f->lb; break; case SEMI: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs) + f->lb; break; default: /* 2 <= Range <= 255 */ @@ -458,7 +478,8 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f, PRINT("\n"); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -473,7 +494,8 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, switch (f->sz) { case BYTE: /* Range == 256 */ BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1)) + return H323_ERROR_BOUND; len = (*bs->cur++) + f->lb; break; default: /* 2 <= Range <= 255 */ @@ -484,7 +506,8 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, bs->cur += len << 1; - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -525,9 +548,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, /* Decode */ if (son->attr & OPEN) { /* Open field */ - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); @@ -556,7 +581,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, /* Get the extension bitmap */ bmp2_len = get_bits(bs, 7) + 1; - CHECK_BOUND(bs, (bmp2_len + 7) >> 3); + if (nf_h323_error_boundary(bs, (bmp2_len + 7) >> 3)) + return H323_ERROR_BOUND; bmp2 = get_bitmap(bs, bmp2_len); bmp |= bmp2 >> f->sz; if (base) @@ -567,9 +593,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, for (opt = 0; opt < bmp2_len; opt++, i++, son++) { /* Check Range */ if (i >= f->ub) { /* Newer Version? */ - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; bs->cur += len; continue; } @@ -583,9 +611,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, if (!((0x80000000 >> opt) & bmp2)) /* Not present */ continue; - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); @@ -623,19 +653,22 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, switch (f->sz) { case BYTE: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1)) + return H323_ERROR_BOUND; count = *bs->cur++; break; case WORD: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; count = *bs->cur++; count <<= 8; count += *bs->cur++; break; case SEMI: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; count = get_len(bs); break; default: @@ -659,7 +692,8 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, if (son->attr & OPEN) { BYTE_ALIGN(bs); len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); @@ -728,7 +762,8 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, if (type >= f->ub) { /* Newer version? */ BYTE_ALIGN(bs); len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; bs->cur += len; return H323_ERROR_NONE; } @@ -743,7 +778,8 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, if (ext || (son->attr & OPEN)) { BYTE_ALIGN(bs); len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); -- cgit v1.2.3 From ec8a8f3c31ddef0a7d9626c4b8a4baa30f3b80aa Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Mon, 13 Nov 2017 09:09:41 +0100 Subject: netfilter: nf_ct_h323: Extend nf_h323_error_boundary to work on bits as well This patch fixes several out of bounds memory reads by extending the nf_h323_error_boundary() function to work on bits as well an check the affected parts. Signed-off-by: Eric Sesterhenn Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_h323_asn1.c | 92 +++++++++++++++++++++++----------- 1 file changed, 62 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 3d9a009ac147..dc6347342e34 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -164,8 +164,13 @@ static unsigned int get_len(struct bitstr *bs) return v; } -static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes) +static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes, size_t bits) { + bits += bs->bit; + bytes += bits / BITS_PER_BYTE; + if (bits % BITS_PER_BYTE > 0) + bytes++; + if (*bs->cur + bytes > *bs->end) return 1; @@ -286,8 +291,7 @@ static int decode_bool(struct bitstr *bs, const struct field_t *f, PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); INC_BIT(bs); - - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -301,12 +305,12 @@ static int decode_oid(struct bitstr *bs, const struct field_t *f, PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 1)) + if (nf_h323_error_boundary(bs, 1, 0)) return H323_ERROR_BOUND; len = *bs->cur++; bs->cur += len; - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; @@ -330,6 +334,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, bs->cur += 2; break; case CONS: /* 64K < Range < 4G */ + if (nf_h323_error_boundary(bs, 0, 2)) + return H323_ERROR_BOUND; len = get_bits(bs, 2) + 1; BYTE_ALIGN(bs); if (base && (f->attr & DECODE)) { /* timeToLive */ @@ -341,7 +347,7 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, break; case UNCO: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); bs->cur += len; @@ -353,7 +359,7 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, PRINT("\n"); - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -370,7 +376,7 @@ static int decode_enum(struct bitstr *bs, const struct field_t *f, INC_BITS(bs, f->sz); } - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -389,13 +395,13 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, len = f->lb; break; case WORD: /* 2-byte length */ - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = (*bs->cur++) << 8; len += (*bs->cur++) + f->lb; break; case SEMI: - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); break; @@ -407,7 +413,7 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, bs->cur += len >> 3; bs->bit = len & 7; - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -421,12 +427,14 @@ static int decode_numstr(struct bitstr *bs, const struct field_t *f, PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); /* 2 <= Range <= 255 */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; len = get_bits(bs, f->sz) + f->lb; BYTE_ALIGN(bs); INC_BITS(bs, (len << 2)); - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -458,17 +466,19 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f, break; case BYTE: /* Range == 256 */ BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 1)) + if (nf_h323_error_boundary(bs, 1, 0)) return H323_ERROR_BOUND; len = (*bs->cur++) + f->lb; break; case SEMI: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs) + f->lb; break; default: /* 2 <= Range <= 255 */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; len = get_bits(bs, f->sz) + f->lb; BYTE_ALIGN(bs); break; @@ -478,7 +488,7 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f, PRINT("\n"); - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -494,11 +504,13 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, switch (f->sz) { case BYTE: /* Range == 256 */ BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 1)) + if (nf_h323_error_boundary(bs, 1, 0)) return H323_ERROR_BOUND; len = (*bs->cur++) + f->lb; break; default: /* 2 <= Range <= 255 */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; len = get_bits(bs, f->sz) + f->lb; BYTE_ALIGN(bs); break; @@ -506,7 +518,7 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, bs->cur += len << 1; - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -526,9 +538,13 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, base = (base && (f->attr & DECODE)) ? base + f->offset : NULL; /* Extensible? */ + if (nf_h323_error_boundary(bs, 0, 1)) + return H323_ERROR_BOUND; ext = (f->attr & EXT) ? get_bit(bs) : 0; /* Get fields bitmap */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; bmp = get_bitmap(bs, f->sz); if (base) *(unsigned int *)base = bmp; @@ -548,10 +564,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, /* Decode */ if (son->attr & OPEN) { /* Open field */ - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, @@ -580,8 +596,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, return H323_ERROR_NONE; /* Get the extension bitmap */ + if (nf_h323_error_boundary(bs, 0, 7)) + return H323_ERROR_BOUND; bmp2_len = get_bits(bs, 7) + 1; - if (nf_h323_error_boundary(bs, (bmp2_len + 7) >> 3)) + if (nf_h323_error_boundary(bs, 0, bmp2_len)) return H323_ERROR_BOUND; bmp2 = get_bitmap(bs, bmp2_len); bmp |= bmp2 >> f->sz; @@ -593,10 +611,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, for (opt = 0; opt < bmp2_len; opt++, i++, son++) { /* Check Range */ if (i >= f->ub) { /* Newer Version? */ - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; bs->cur += len; continue; @@ -611,10 +629,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, if (!((0x80000000 >> opt) & bmp2)) /* Not present */ continue; - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", @@ -653,13 +671,13 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, switch (f->sz) { case BYTE: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 1)) + if (nf_h323_error_boundary(bs, 1, 0)) return H323_ERROR_BOUND; count = *bs->cur++; break; case WORD: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; count = *bs->cur++; count <<= 8; @@ -667,11 +685,13 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, break; case SEMI: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; count = get_len(bs); break; default: + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; count = get_bits(bs, f->sz); break; } @@ -691,8 +711,10 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, for (i = 0; i < count; i++) { if (son->attr & OPEN) { BYTE_ALIGN(bs); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, @@ -744,11 +766,17 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, base = (base && (f->attr & DECODE)) ? base + f->offset : NULL; /* Decode the choice index number */ + if (nf_h323_error_boundary(bs, 0, 1)) + return H323_ERROR_BOUND; if ((f->attr & EXT) && get_bit(bs)) { ext = 1; + if (nf_h323_error_boundary(bs, 0, 7)) + return H323_ERROR_BOUND; type = get_bits(bs, 7) + f->lb; } else { ext = 0; + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; type = get_bits(bs, f->sz); if (type >= f->lb) return H323_ERROR_RANGE; @@ -761,8 +789,10 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, /* Check Range */ if (type >= f->ub) { /* Newer version? */ BYTE_ALIGN(bs); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; bs->cur += len; return H323_ERROR_NONE; @@ -777,8 +807,10 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, if (ext || (son->attr & OPEN)) { BYTE_ALIGN(bs); + if (nf_h323_error_boundary(bs, len, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", -- cgit v1.2.3 From fbcd253d2448b8f168241e38f629a36c4c8c1e94 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 19 Nov 2017 21:27:28 +0100 Subject: netfilter: conntrack: lower timeout to RETRANS seconds if window is 0 When zero window is announced we can get into a situation where connection stays around forever: 1. One side announces zero window. 2. Other side closes. In this case, no FIN is sent (stuck in send queue). Unless other side opens the window up again conntrack stays in ESTABLISHED state for a very long time. Lets alleviate this by lowering the timeout to RETRANS (5 minutes), the other end should be sending zero window probes to keep the connection established as long as a socket still exists. Cc: Jozsef Kadlecsik Signed-off-by: Florian Westphal Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b12fc07111d0..37ef35b861f2 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1039,6 +1039,9 @@ static int tcp_packet(struct nf_conn *ct, IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK]) timeout = timeouts[TCP_CONNTRACK_UNACK]; + else if (ct->proto.tcp.last_win == 0 && + timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) + timeout = timeouts[TCP_CONNTRACK_RETRANS]; else timeout = timeouts[new_state]; spin_unlock_bh(&ct->lock); -- cgit v1.2.3 From 8b1836c4b64386e9bc580438cae386ed31a43ab9 Mon Sep 17 00:00:00 2001 From: Jay Elliott Date: Wed, 15 Nov 2017 15:01:13 -0800 Subject: netfilter: conntrack: clamp timeouts to INT_MAX When the conntracking code multiplies a timeout by HZ, it can overflow from positive to negative; this causes it to instantly expire. To protect against this the multiplication is done in 64-bit so we can prevent it from exceeding INT_MAX. Signed-off-by: Jay Elliott Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 59c08997bfdf..66d72a8fa87f 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1566,9 +1566,11 @@ static int ctnetlink_change_helper(struct nf_conn *ct, static int ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[]) { - u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); + u64 timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; - ct->timeout = nfct_time_stamp + timeout * HZ; + if (timeout > INT_MAX) + timeout = INT_MAX; + ct->timeout = nfct_time_stamp + (u32)timeout; if (test_bit(IPS_DYING_BIT, &ct->status)) return -ETIME; @@ -1768,6 +1770,7 @@ ctnetlink_create_conntrack(struct net *net, int err = -EINVAL; struct nf_conntrack_helper *helper; struct nf_conn_tstamp *tstamp; + u64 timeout; ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); if (IS_ERR(ct)) @@ -1776,7 +1779,10 @@ ctnetlink_create_conntrack(struct net *net, if (!cda[CTA_TIMEOUT]) goto err1; - ct->timeout = nfct_time_stamp + ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; + timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; + if (timeout > INT_MAX) + timeout = INT_MAX; + ct->timeout = (u32)timeout + nfct_time_stamp; rcu_read_lock(); if (cda[CTA_HELP]) { -- cgit v1.2.3 From 6a53b7593233ab9e4f96873ebacc0f653a55c3e1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 27 Nov 2017 11:15:16 -0800 Subject: xfrm: check id proto in validate_tmpl() syzbot reported a kernel warning in xfrm_state_fini(), which indicates that we have entries left in the list net->xfrm.state_all whose proto is zero. And xfrm_id_proto_match() doesn't consider them as a match with IPSEC_PROTO_ANY in this case. Proto with value 0 is probably not a valid value, at least verify_newsa_info() doesn't consider it valid either. This patch fixes it by checking the proto value in validate_tmpl() and rejecting invalid ones, like what iproute2 does in xfrm_xfrmproto_getbyname(). Reported-by: syzbot Cc: Steffen Klassert Cc: Herbert Xu Signed-off-by: Cong Wang Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 983b0233767b..c2cfcc6fdb34 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1445,6 +1445,21 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) default: return -EINVAL; } + + switch (ut[i].id.proto) { + case IPPROTO_AH: + case IPPROTO_ESP: + case IPPROTO_COMP: +#if IS_ENABLED(CONFIG_IPV6) + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: +#endif + case IPSEC_PROTO_ANY: + break; + default: + return -EINVAL; + } + } return 0; -- cgit v1.2.3 From 3d7682af228fd78dc46bc6bf40e0268ad04521ec Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 Nov 2017 14:25:50 +0000 Subject: rxrpc: Clean up whitespace Clean up some whitespace from rxrpc. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- net/rxrpc/conn_object.c | 2 +- net/rxrpc/input.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index bda952ffe6a6..555274ddc514 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -426,7 +426,7 @@ recheck_state: next = call->expect_rx_by; #define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; } - + set(call->expect_req_by); set(call->expect_term_by); set(call->ack_at); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 1aad04a32d5e..c628351eb900 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -424,7 +424,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) if (earliest != now + MAX_JIFFY_OFFSET) { _debug("reschedule reaper %ld", (long)earliest - (long)now); ASSERT(time_after(earliest, now)); - rxrpc_set_service_reap_timer(rxnet, earliest); + rxrpc_set_service_reap_timer(rxnet, earliest); } while (!list_empty(&graveyard)) { diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 23a5e61d8f79..6fc61400337f 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -976,7 +976,7 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, rxrpc_reduce_call_timer(call, expect_rx_by, now, rxrpc_timer_set_for_normal); } - + switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_DATA: rxrpc_input_data(call, skb, skew); @@ -1213,7 +1213,7 @@ void rxrpc_data_ready(struct sock *udp_sk) goto reupgrade; conn->service_id = sp->hdr.serviceId; } - + if (sp->hdr.callNumber == 0) { /* Connection-level packet */ _debug("CONN %p {%d}", conn, conn->debug_id); -- cgit v1.2.3 From 5fc62f6a139a7b06b027bf442cd4205619506f59 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 Nov 2017 14:40:41 +0000 Subject: rxrpc: Fix ACK generation from the connection event processor Repeat terminal ACKs and now terminal ACKs are now generated from the connection event processor rather from call handling as this allows us to discard client call structures as soon as possible and free up the channel for a follow on call. However, in ACKs so generated, the additional information trailer is malformed because the padding that's meant to be in the middle isn't included in what's transmitted. Fix it so that the 3 bytes of padding are included in the transmission. Further, the trailer is misaligned because of the padding, so assigment to the u16 and u32 fields inside it might cause problems on some arches, so fix this by breaking the padding and the trailer out of the packed struct. (This also deals with potential compiler weirdies where some of the nested structs are packed and some aren't). The symptoms can be seen in wireshark as terminal DUPLICATE or IDLE ACK packets in which the Max MTU, Interface MTU and rwind fields have weird values and the Max Packets field is apparently missing. Reported-by: Jeffrey Altman Signed-off-by: David Howells --- net/rxrpc/conn_event.c | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 9e9a8db1bc9c..4ca11be6be3c 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -30,22 +30,18 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct rxrpc_skb_priv *sp = skb ? rxrpc_skb(skb) : NULL; struct rxrpc_channel *chan; struct msghdr msg; - struct kvec iov; + struct kvec iov[3]; struct { struct rxrpc_wire_header whdr; union { - struct { - __be32 code; - } abort; - struct { - struct rxrpc_ackpacket ack; - u8 padding[3]; - struct rxrpc_ackinfo info; - }; + __be32 abort_code; + struct rxrpc_ackpacket ack; }; } __attribute__((packed)) pkt; + struct rxrpc_ackinfo ack_info; size_t len; - u32 serial, mtu, call_id; + int ioc; + u32 serial, mtu, call_id, padding; _enter("%d", conn->debug_id); @@ -66,6 +62,13 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, msg.msg_controllen = 0; msg.msg_flags = 0; + iov[0].iov_base = &pkt; + iov[0].iov_len = sizeof(pkt.whdr); + iov[1].iov_base = &padding; + iov[1].iov_len = 3; + iov[2].iov_base = &ack_info; + iov[2].iov_len = sizeof(ack_info); + pkt.whdr.epoch = htonl(conn->proto.epoch); pkt.whdr.cid = htonl(conn->proto.cid); pkt.whdr.callNumber = htonl(call_id); @@ -80,8 +83,10 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, len = sizeof(pkt.whdr); switch (chan->last_type) { case RXRPC_PACKET_TYPE_ABORT: - pkt.abort.code = htonl(chan->last_abort); - len += sizeof(pkt.abort); + pkt.abort_code = htonl(chan->last_abort); + iov[0].iov_len += sizeof(pkt.abort_code); + len += sizeof(pkt.abort_code); + ioc = 1; break; case RXRPC_PACKET_TYPE_ACK: @@ -94,13 +99,19 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0); pkt.ack.reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE; pkt.ack.nAcks = 0; - pkt.info.rxMTU = htonl(rxrpc_rx_mtu); - pkt.info.maxMTU = htonl(mtu); - pkt.info.rwind = htonl(rxrpc_rx_window_size); - pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); + ack_info.rxMTU = htonl(rxrpc_rx_mtu); + ack_info.maxMTU = htonl(mtu); + ack_info.rwind = htonl(rxrpc_rx_window_size); + ack_info.jumbo_max = htonl(rxrpc_rx_jumbo_max); pkt.whdr.flags |= RXRPC_SLOW_START_OK; - len += sizeof(pkt.ack) + sizeof(pkt.info); + padding = 0; + iov[0].iov_len += sizeof(pkt.ack); + len += sizeof(pkt.ack) + 3 + sizeof(ack_info); + ioc = 3; break; + + default: + return; } /* Resync with __rxrpc_disconnect_call() and check that the last call @@ -110,9 +121,6 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, if (READ_ONCE(chan->last_call) != call_id) return; - iov.iov_base = &pkt; - iov.iov_len = len; - serial = atomic_inc_return(&conn->serial); pkt.whdr.serial = htonl(serial); @@ -127,7 +135,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, break; } - kernel_sendmsg(conn->params.local->socket, &msg, &iov, 1, len); + kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len); _leave(""); return; } -- cgit v1.2.3 From 282ef4729195c8503f7101d574acfb5e7c8a8209 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 28 Nov 2017 11:28:52 -0600 Subject: rxrpc: Fix variable overwrite Values assigned to both variable resend_at and ack_at are overwritten before they can be used. The correct fix here is to add 'now' to the previously computed value in resend_at and ack_at. Addresses-Coverity-ID: 1462262 Addresses-Coverity-ID: 1462263 Addresses-Coverity-ID: 1462264 Fixes: beb8e5e4f38c ("rxrpc: Express protocol timeouts in terms of RTT") Link: https://marc.info/?i=17004.1511808959%40warthog.procyon.org.uk Signed-off-by: Gustavo A. R. Silva Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- net/rxrpc/sendmsg.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 555274ddc514..ad2ab1103189 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -123,7 +123,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, else ack_at = expiry; - ack_at = jiffies + expiry; + ack_at += now; if (time_before(ack_at, call->ack_at)) { WRITE_ONCE(call->ack_at, ack_at); rxrpc_reduce_call_timer(call, ack_at, now, diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index a1c53ac066a1..09f2a3e05221 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -233,7 +233,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, if (resend_at < 1) resend_at = 1; - resend_at = now + rxrpc_resend_timeout; + resend_at += now; WRITE_ONCE(call->resend_at, resend_at); rxrpc_reduce_call_timer(call, resend_at, now, rxrpc_timer_set_for_send); -- cgit v1.2.3 From 90a6ec85351b31449c2c6b5406b5396ac96f191d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 29 Nov 2017 16:07:51 -0800 Subject: act_sample: get rid of tcf_sample_cleanup_rcu() Similar to commit d7fb60b9cafb ("net_sched: get rid of tcfa_rcu"), TC actions don't need to respect RCU grace period, because it is either just detached from tc filter (standalone case) or it is removed together with tc filter (bound case) in which case RCU grace period is already respected at filter layer. Fixes: 5c5670fae430 ("net/sched: Introduce sample tc action") Reported-by: Eric Dumazet Cc: Jamal Hadi Salim Cc: Jiri Pirko Cc: Yotam Gigi Signed-off-by: Cong Wang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tc_act/tc_sample.h | 1 - net/sched/act_sample.c | 14 +++----------- 2 files changed, 3 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h index 524cee4f4c81..01dbfea32672 100644 --- a/include/net/tc_act/tc_sample.h +++ b/include/net/tc_act/tc_sample.h @@ -14,7 +14,6 @@ struct tcf_sample { struct psample_group __rcu *psample_group; u32 psample_group_num; struct list_head tcfm_list; - struct rcu_head rcu; }; #define to_sample(a) ((struct tcf_sample *)a) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 8b5abcd2f32f..9438969290a6 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -96,23 +96,16 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, return ret; } -static void tcf_sample_cleanup_rcu(struct rcu_head *rcu) +static void tcf_sample_cleanup(struct tc_action *a, int bind) { - struct tcf_sample *s = container_of(rcu, struct tcf_sample, rcu); + struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; - psample_group = rcu_dereference_protected(s->psample_group, 1); + psample_group = rtnl_dereference(s->psample_group); RCU_INIT_POINTER(s->psample_group, NULL); psample_group_put(psample_group); } -static void tcf_sample_cleanup(struct tc_action *a, int bind) -{ - struct tcf_sample *s = to_sample(a); - - call_rcu(&s->rcu, tcf_sample_cleanup_rcu); -} - static bool tcf_sample_dev_ok_push(struct net_device *dev) { switch (dev->type) { @@ -264,7 +257,6 @@ static int __init sample_init_module(void) static void __exit sample_cleanup_module(void) { - rcu_barrier(); tcf_unregister_action(&act_sample_ops, &sample_net_ops); } -- cgit v1.2.3 From 3016dad75b48279e579117ee3ed566ba90a3b023 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Nov 2017 17:43:57 -0800 Subject: tcp: remove buggy call to tcp_v6_restore_cb() tcp_v6_send_reset() expects to receive an skb with skb->cb[] layout as used in TCP stack. MD5 lookup uses tcp_v6_iif() and tcp_v6_sdif() and thus TCP_SKB_CB(skb)->header.h6 This patch probably fixes RST packets sent on behalf of a timewait md5 ipv6 socket. Before Florian patch, tcp_v6_restore_cb() was needed before jumping to no_tcp_socket label. Fixes: 271c3b9b7bda ("tcp: honour SO_BINDTODEVICE for TW_RST case too") Signed-off-by: Eric Dumazet Cc: Florian Westphal Acked-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6bb98c93edfe..be11dc13aa70 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1590,7 +1590,6 @@ do_time_wait: tcp_v6_timewait_ack(sk, skb); break; case TCP_TW_RST: - tcp_v6_restore_cb(skb); tcp_v6_send_reset(sk, skb); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; -- cgit v1.2.3 From f859b4af1c52493ec21173ccc73d0b60029b5b88 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 30 Nov 2017 10:41:14 +0800 Subject: sit: update frag_off info After parsing the sit netlink change info, we forget to update frag_off in ipip6_tunnel_update(). Fix it by assigning frag_off with new value. Reported-by: Jianlin Shi Signed-off-by: Hangbin Liu Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/sit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index d60ddcb0bfe2..d7dc23c1b2ca 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1098,6 +1098,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p, ipip6_tunnel_link(sitn, t); t->parms.iph.ttl = p->iph.ttl; t->parms.iph.tos = p->iph.tos; + t->parms.iph.frag_off = p->iph.frag_off; if (t->parms.link != p->link || t->fwmark != fwmark) { t->parms.link = p->link; t->fwmark = fwmark; -- cgit v1.2.3 From e719135881f00c01ca400abb8a5dadaf297a24f9 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Wed, 29 Nov 2017 18:23:56 +0100 Subject: xfrm: fix XFRMA_OUTPUT_MARK policy entry This seems to be an obvious typo, NLA_U32 is type of the attribute, not its (minimal) length. Fixes: 077fbac405bf ("net: xfrm: support setting an output mark.") Signed-off-by: Michal Kubecek Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c2cfcc6fdb34..ff58c37469d6 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2485,7 +2485,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, - [XFRMA_OUTPUT_MARK] = { .len = NLA_U32 }, + [XFRMA_OUTPUT_MARK] = { .type = NLA_U32 }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { -- cgit v1.2.3 From 4ce3dbe397d7b6b15f272ae757c78c35e9e4b61d Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Tue, 28 Nov 2017 19:55:40 +0200 Subject: xfrm: Fix xfrm_input() to verify state is valid when (encap_type < 0) Code path when (encap_type < 0) does not verify the state is valid before progressing. This will result in a crash if, for instance, x->km.state == XFRM_STATE_ACQ. Fixes: 7785bba299a8 ("esp: Add a software GRO codepath") Signed-off-by: Aviv Heller Signed-off-by: Yevgeny Kliteynik Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 347ab31574d5..da6447389ffb 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -207,7 +207,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) xfrm_address_t *daddr; struct xfrm_mode *inner_mode; u32 mark = skb->mark; - unsigned int family; + unsigned int family = AF_UNSPEC; int decaps = 0; int async = 0; bool xfrm_gro = false; @@ -216,6 +216,16 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) if (encap_type < 0) { x = xfrm_input_state(skb); + + if (unlikely(x->km.state != XFRM_STATE_VALID)) { + if (x->km.state == XFRM_STATE_ACQ) + XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); + else + XFRM_INC_STATS(net, + LINUX_MIB_XFRMINSTATEINVALID); + goto drop; + } + family = x->outer_mode->afinfo->family; /* An encap_type of -1 indicates async resumption. */ -- cgit v1.2.3 From ddc47e4404b58f03e98345398fb12d38fe291512 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 29 Nov 2017 06:53:55 +0100 Subject: xfrm: Fix stack-out-of-bounds read on socket policy lookup. When we do tunnel or beet mode, we pass saddr and daddr from the template to xfrm_state_find(), this is ok. On transport mode, we pass the addresses from the flowi, assuming that the IP addresses (and address family) don't change during transformation. This assumption is wrong in the IPv4 mapped IPv6 case, packet is IPv4 and template is IPv6. Fix this by catching address family missmatches of the policy and the flow already before we do the lookup. Reported-by: syzbot Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9542975eb2f9..038ec68f6901 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1168,9 +1168,15 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, again: pol = rcu_dereference(sk->sk_policy[dir]); if (pol != NULL) { - bool match = xfrm_selector_match(&pol->selector, fl, family); + bool match; int err = 0; + if (pol->family != family) { + pol = NULL; + goto out; + } + + match = xfrm_selector_match(&pol->selector, fl, family); if (match) { if ((sk->sk_mark & pol->mark.m) != pol->mark.v) { pol = NULL; -- cgit v1.2.3 From fe77d8257c4d838c5976557ddb87bd789f312412 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 29 Nov 2017 10:25:02 +0100 Subject: batman-adv: Always initialize fragment header priority The batman-adv unuicast fragment header contains 3 bits for the priority of the packet. These bits will be initialized when the skb->priority contains a value between 256 and 263. But otherwise, the uninitialized bits from the stack will be used. Fixes: c0f25c802b33 ("batman-adv: Include frame priority in fragment header") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index a98cf1104a30..ebe6e38934e4 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -499,6 +499,8 @@ int batadv_frag_send_packet(struct sk_buff *skb, */ if (skb->priority >= 256 && skb->priority <= 263) frag_header.priority = skb->priority - 256; + else + frag_header.priority = 0; ether_addr_copy(frag_header.orig, primary_if->net_dev->dev_addr); ether_addr_copy(frag_header.dest, orig_node->orig); -- cgit v1.2.3 From 198a62ddffa4a4ffaeb741f642b7b52f2d91ae9b Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 29 Nov 2017 10:50:42 +0100 Subject: batman-adv: Fix check of retrieved orig_gw in batadv_v_gw_is_eligible The batadv_v_gw_is_eligible function already assumes that orig_node is not NULL. But batadv_gw_node_get may have failed to find the originator. It must therefore be checked whether the batadv_gw_node_get failed and not whether orig_node is NULL to detect this error. Fixes: 50164d8f500f ("batman-adv: B.A.T.M.A.N. V - implement GW selection logic") Signed-off-by: Sven Eckelmann Acked-by: Antonio Quartulli Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 341ceab8338d..e0e2bfcd6b3e 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -814,7 +814,7 @@ static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv, } orig_gw = batadv_gw_node_get(bat_priv, orig_node); - if (!orig_node) + if (!orig_gw) goto out; if (batadv_v_gw_throughput_get(orig_gw, &orig_throughput) < 0) -- cgit v1.2.3 From d30fc5126efb0c33b7adf5966d3051db2c3d7721 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 25 Nov 2017 21:18:34 +0800 Subject: sctp: only update outstanding_bytes for transmitted queue when doing prsctp_prune Now outstanding_bytes is only increased when appending chunks into one packet and sending it at 1st time, while decreased when it is about to move into retransmit queue. It means outstanding_bytes value is already decreased for all chunks in retransmit queue. However sctp_prsctp_prune_sent is a common function to check the chunks in both transmitted and retransmit queue, it decrease outstanding_bytes when moving a chunk into abandoned queue from either of them. It could cause outstanding_bytes underflow, as it also decreases it's value for the chunks in retransmit queue. This patch fixes it by only updating outstanding_bytes for transmitted queue when pruning queues for prsctp prio policy, the same fix is also needed in sctp_check_transmitted. Fixes: 8dbdf1f5b09c ("sctp: implement prsctp PRIO policy") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 4db012aa25f7..7029f8b99063 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -377,7 +377,8 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; - if (!chk->tsn_gap_acked) { + if (queue != &asoc->outqueue.retransmit && + !chk->tsn_gap_acked) { if (chk->transport) chk->transport->flight_size -= sctp_data_size(chk); @@ -1434,7 +1435,8 @@ static void sctp_check_transmitted(struct sctp_outq *q, /* If this chunk has not been acked, stop * considering it as 'outstanding'. */ - if (!tchunk->tsn_gap_acked) { + if (transmitted_queue != &q->retransmit && + !tchunk->tsn_gap_acked) { if (tchunk->transport) tchunk->transport->flight_size -= sctp_data_size(tchunk); -- cgit v1.2.3 From e5f612969c6f965e3bd1158598e0a3b1c4f389b9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 25 Nov 2017 21:18:35 +0800 Subject: sctp: abandon the whole msg if one part of a fragmented message is abandoned As rfc3758#section-3.1 demands: A3) When a TSN is "abandoned", if it is part of a fragmented message, all other TSN's within that fragmented message MUST be abandoned at the same time. Besides, if it couldn't handle this, the rest frags would never get assembled in peer side. This patch supports it by adding abandoned flag in sctp_datamsg, when one chunk is being abandoned, set chunk->msg->abandoned as well. Next time when checking for abandoned, go checking chunk->msg->abandoned first. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 3 ++- net/sctp/chunk.c | 7 +++++++ net/sctp/outqueue.c | 12 ++++++++---- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 16f949eef52f..2f8f93da5dc2 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -503,7 +503,8 @@ struct sctp_datamsg { /* Did the messenge fail to send? */ int send_error; u8 send_failed:1, - can_delay; /* should this message be Nagle delayed */ + can_delay:1, /* should this message be Nagle delayed */ + abandoned:1; /* should this message be abandoned */ }; struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *, diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 7b261afc47b9..9213805b558d 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -53,6 +53,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg) msg->send_failed = 0; msg->send_error = 0; msg->can_delay = 1; + msg->abandoned = 0; msg->expires_at = 0; INIT_LIST_HEAD(&msg->chunks); } @@ -304,6 +305,9 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (!chunk->asoc->peer.prsctp_capable) return 0; + if (chunk->msg->abandoned) + return 1; + if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = @@ -316,6 +320,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; } + chunk->msg->abandoned = 1; return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { @@ -324,10 +329,12 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; + chunk->msg->abandoned = 1; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && time_after(jiffies, chunk->msg->expires_at)) { + chunk->msg->abandoned = 1; return 1; } /* PRIO policy is processed by sendmsg, not here */ diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 7029f8b99063..4ab164b5aad0 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -364,10 +364,12 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, list_for_each_entry_safe(chk, temp, queue, transmitted_list) { struct sctp_stream_out *streamout; - if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || - chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) + if (!chk->msg->abandoned && + (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || + chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; + chk->msg->abandoned = 1; list_del_init(&chk->transmitted_list); sctp_insert_list(&asoc->outqueue.abandoned, &chk->transmitted_list); @@ -404,10 +406,12 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, q->sched->unsched_all(&asoc->stream); list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { - if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || - chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) + if (!chk->msg->abandoned && + (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || + chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; + chk->msg->abandoned = 1; sctp_sched_dequeue_common(q, chk); asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; -- cgit v1.2.3 From 779edd7348878a7376c0e3d0f96485c30b5f1b7d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 25 Nov 2017 21:18:36 +0800 Subject: sctp: do not abandon the other frags in unsent outq if one msg has outstanding frags Now for the abandoned chunks in unsent outq, it would just free the chunks. Because no tsn is assigned to them yet, there's no need to send fwd tsn to peer, unlike for the abandoned chunks in sent outq. The problem is when parts of the msg have been sent and the other frags are still in unsent outq, if they are abandoned/dropped, the peer would never get this msg reassembled. So these frags in unsent outq can't be dropped if this msg already has outstanding frags. This patch does the check in sctp_chunk_abandoned and sctp_prsctp_prune_unsent. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/chunk.c | 4 ++++ net/sctp/outqueue.c | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 9213805b558d..7f8baa48e7c2 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -308,6 +308,10 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (chunk->msg->abandoned) return 1; + if (!chunk->has_tsn && + !(chunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG)) + return 0; + if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 4ab164b5aad0..7d67feeeffc1 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -407,7 +407,8 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { if (!chk->msg->abandoned && - (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || + (!(chk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) || + !SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; -- cgit v1.2.3 From cfac7f836a715b91f08c851df915d401a4d52783 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Dec 2017 10:06:56 -0800 Subject: tcp/dccp: block bh before arming time_wait timer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maciej Żenczykowski reported some panics in tcp_twsk_destructor() that might be caused by the following bug. timewait timer is pinned to the cpu, because we want to transition timwewait refcount from 0 to 4 in one go, once everything has been initialized. At the time commit ed2e92394589 ("tcp/dccp: fix timewait races in timer handling") was merged, TCP was always running from BH habdler. After commit 5413d1babe8f ("net: do not block BH while processing socket backlog") we definitely can run tcp_time_wait() from process context. We need to block BH in the critical section so that the pinned timer has still its purpose. This bug is more likely to happen under stress and when very small RTO are used in datacenter flows. Fixes: 5413d1babe8f ("net: do not block BH while processing socket backlog") Signed-off-by: Eric Dumazet Reported-by: Maciej Żenczykowski Acked-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- net/dccp/minisocks.c | 6 ++++++ net/ipv4/tcp_minisocks.c | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'net') diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index abd07a443219..178bb9833311 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -57,10 +57,16 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) if (state == DCCP_TIME_WAIT) timeo = DCCP_TIMEWAIT_LEN; + /* tw_timer is pinned, so we need to make sure BH are disabled + * in following section, otherwise timer handler could run before + * we complete the initialization. + */ + local_bh_disable(); inet_twsk_schedule(tw, timeo); /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); inet_twsk_put(tw); + local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e36eff0403f4..b079b619b60c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -310,10 +310,16 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; + /* tw_timer is pinned, so we need to make sure BH are disabled + * in following section, otherwise timer handler could run before + * we complete the initialization. + */ + local_bh_disable(); inet_twsk_schedule(tw, timeo); /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); inet_twsk_put(tw); + local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than -- cgit v1.2.3 From c7799c067c2ae33e348508c8afec354f3257ff25 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Wed, 29 Nov 2017 12:48:42 +0200 Subject: tipc: call tipc_rcv() only if bearer is up in tipc_udp_recv() Remove the second tipc_rcv() call in tipc_udp_recv(). We have just checked that the bearer is not up, and calling tipc_rcv() with a bearer that is not up leads to a TIPC div-by-zero crash in tipc_node_calculate_timer(). The crash is rare in practice, but can happen like this: We're enabling a bearer, but it's not yet up and fully initialized. At the same time we receive a discovery packet, and in tipc_udp_recv() we end up calling tipc_rcv() with the not-yet-initialized bearer, causing later the div-by-zero crash in tipc_node_calculate_timer(). Jon Maloy explains the impact of removing the second tipc_rcv() call: "link setup in the worst case will be delayed until the next arriving discovery messages, 1 sec later, and this is an acceptable delay." As the tipc_rcv() call is removed, just leave the function via the rcu_out label, so that we will kfree_skb(). [ 12.590450] Own node address <1.1.1>, network identity 1 [ 12.668088] divide error: 0000 [#1] SMP [ 12.676952] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.14.2-dirty #1 [ 12.679225] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014 [ 12.682095] task: ffff8c2a761edb80 task.stack: ffffa41cc0cac000 [ 12.684087] RIP: 0010:tipc_node_calculate_timer.isra.12+0x45/0x60 [tipc] [ 12.686486] RSP: 0018:ffff8c2a7fc838a0 EFLAGS: 00010246 [ 12.688451] RAX: 0000000000000000 RBX: ffff8c2a5b382600 RCX: 0000000000000000 [ 12.691197] RDX: 0000000000000000 RSI: ffff8c2a5b382600 RDI: ffff8c2a5b382600 [ 12.693945] RBP: ffff8c2a7fc838b0 R08: 0000000000000001 R09: 0000000000000001 [ 12.696632] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8c2a5d8949d8 [ 12.699491] R13: ffffffff95ede400 R14: 0000000000000000 R15: ffff8c2a5d894800 [ 12.702338] FS: 0000000000000000(0000) GS:ffff8c2a7fc80000(0000) knlGS:0000000000000000 [ 12.705099] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 12.706776] CR2: 0000000001bb9440 CR3: 00000000bd009001 CR4: 00000000003606e0 [ 12.708847] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 12.711016] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 12.712627] Call Trace: [ 12.713390] [ 12.714011] tipc_node_check_dest+0x2e8/0x350 [tipc] [ 12.715286] tipc_disc_rcv+0x14d/0x1d0 [tipc] [ 12.716370] tipc_rcv+0x8b0/0xd40 [tipc] [ 12.717396] ? minmax_running_min+0x2f/0x60 [ 12.718248] ? dst_alloc+0x4c/0xa0 [ 12.718964] ? tcp_ack+0xaf1/0x10b0 [ 12.719658] ? tipc_udp_is_known_peer+0xa0/0xa0 [tipc] [ 12.720634] tipc_udp_recv+0x71/0x1d0 [tipc] [ 12.721459] ? dst_alloc+0x4c/0xa0 [ 12.722130] udp_queue_rcv_skb+0x264/0x490 [ 12.722924] __udp4_lib_rcv+0x21e/0x990 [ 12.723670] ? ip_route_input_rcu+0x2dd/0xbf0 [ 12.724442] ? tcp_v4_rcv+0x958/0xa40 [ 12.725039] udp_rcv+0x1a/0x20 [ 12.725587] ip_local_deliver_finish+0x97/0x1d0 [ 12.726323] ip_local_deliver+0xaf/0xc0 [ 12.726959] ? ip_route_input_noref+0x19/0x20 [ 12.727689] ip_rcv_finish+0xdd/0x3b0 [ 12.728307] ip_rcv+0x2ac/0x360 [ 12.728839] __netif_receive_skb_core+0x6fb/0xa90 [ 12.729580] ? udp4_gro_receive+0x1a7/0x2c0 [ 12.730274] __netif_receive_skb+0x1d/0x60 [ 12.730953] ? __netif_receive_skb+0x1d/0x60 [ 12.731637] netif_receive_skb_internal+0x37/0xd0 [ 12.732371] napi_gro_receive+0xc7/0xf0 [ 12.732920] receive_buf+0x3c3/0xd40 [ 12.733441] virtnet_poll+0xb1/0x250 [ 12.733944] net_rx_action+0x23e/0x370 [ 12.734476] __do_softirq+0xc5/0x2f8 [ 12.734922] irq_exit+0xfa/0x100 [ 12.735315] do_IRQ+0x4f/0xd0 [ 12.735680] common_interrupt+0xa2/0xa2 [ 12.736126] [ 12.736416] RIP: 0010:native_safe_halt+0x6/0x10 [ 12.736925] RSP: 0018:ffffa41cc0cafe90 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff4d [ 12.737756] RAX: 0000000000000000 RBX: ffff8c2a761edb80 RCX: 0000000000000000 [ 12.738504] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 12.739258] RBP: ffffa41cc0cafe90 R08: 0000014b5b9795e5 R09: ffffa41cc12c7e88 [ 12.740118] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000002 [ 12.740964] R13: ffff8c2a761edb80 R14: 0000000000000000 R15: 0000000000000000 [ 12.741831] default_idle+0x2a/0x100 [ 12.742323] arch_cpu_idle+0xf/0x20 [ 12.742796] default_idle_call+0x28/0x40 [ 12.743312] do_idle+0x179/0x1f0 [ 12.743761] cpu_startup_entry+0x1d/0x20 [ 12.744291] start_secondary+0x112/0x120 [ 12.744816] secondary_startup_64+0xa5/0xa5 [ 12.745367] Code: b9 f4 01 00 00 48 89 c2 48 c1 ea 02 48 3d d3 07 00 00 48 0f 47 d1 49 8b 0c 24 48 39 d1 76 07 49 89 14 24 48 89 d1 31 d2 48 89 df <48> f7 f1 89 c6 e8 81 6e ff ff 5b 41 5c 5d c3 66 90 66 2e 0f 1f [ 12.747527] RIP: tipc_node_calculate_timer.isra.12+0x45/0x60 [tipc] RSP: ffff8c2a7fc838a0 [ 12.748555] ---[ end trace 1399ab83390650fd ]--- [ 12.749296] Kernel panic - not syncing: Fatal exception in interrupt [ 12.750123] Kernel Offset: 0x13200000 from 0xffffffff82000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 12.751215] Rebooting in 60 seconds.. Fixes: c9b64d492b1f ("tipc: add replicast peer discovery") Signed-off-by: Tommi Rantala Cc: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/udp_media.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index ecca64fc6a6f..3deabcab4882 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -371,10 +371,6 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) goto rcu_out; } - tipc_rcv(sock_net(sk), skb, b); - rcu_read_unlock(); - return 0; - rcu_out: rcu_read_unlock(); out: -- cgit v1.2.3 From 974a6b20518602310637bd8ac9ad348bf8a864d6 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 1 Dec 2017 11:47:56 +0100 Subject: batman-adv: Fix kernel-doc for timer functions The commit e99e88a9d2b0 ("treewide: setup_timer() -> timer_setup()") changed the argument name and type of the timer function but didn't adjust the kernel-doc of these functions. Signed-off-by: Sven Eckelmann Acked-by: Kees Cook Signed-off-by: Simon Wunderlich --- net/batman-adv/tp_meter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 15cd2139381e..ebc4e2241c77 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -482,7 +482,7 @@ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars) /** * batadv_tp_sender_timeout - timer that fires in case of packet loss - * @arg: address of the related tp_vars + * @t: address to timer_list inside tp_vars * * If fired it means that there was packet loss. * Switch to Slow Start, set the ss_threshold to half of the current cwnd and @@ -1106,7 +1106,7 @@ static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars) /** * batadv_tp_receiver_shutdown - stop a tp meter receiver when timeout is * reached without received ack - * @arg: address of the related tp_vars + * @t: address to timer_list inside tp_vars */ static void batadv_tp_receiver_shutdown(struct timer_list *t) { -- cgit v1.2.3 From c501256406fb19c306504ee1fe41a4ea208d4245 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 1 Dec 2017 11:09:53 +0000 Subject: rxrpc: Use correct netns source in rxrpc_release_sock() In rxrpc_release_sock() there may be no rx->local value to access, so we can't unconditionally follow it to the rxrpc network namespace information to poke the connection reapers. Instead, use the socket's namespace pointer to find the namespace. This unfixed code causes the following static checker warning: net/rxrpc/af_rxrpc.c:898 rxrpc_release_sock() error: we previously assumed 'rx->local' could be null (see line 887) Fixes: 3d18cbb7fd0c ("rxrpc: Fix conn expiry timers") Reported-by: Dan Carpenter Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/af_rxrpc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 8f7cf4c042be..dcd818fa837e 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -860,6 +860,7 @@ static void rxrpc_sock_destructor(struct sock *sk) static int rxrpc_release_sock(struct sock *sk) { struct rxrpc_sock *rx = rxrpc_sk(sk); + struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); _enter("%p{%d,%d}", sk, sk->sk_state, refcount_read(&sk->sk_refcnt)); @@ -895,8 +896,8 @@ static int rxrpc_release_sock(struct sock *sk) rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); rxrpc_purge_queue(&sk->sk_receive_queue); - rxrpc_queue_work(&rx->local->rxnet->service_conn_reaper); - rxrpc_queue_work(&rx->local->rxnet->client_conn_reaper); + rxrpc_queue_work(&rxnet->service_conn_reaper); + rxrpc_queue_work(&rxnet->client_conn_reaper); rxrpc_put_local(rx->local); rx->local = NULL; -- cgit v1.2.3 From eeea10b83a139451130df1594f26710c8fa390c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 3 Dec 2017 09:32:59 -0800 Subject: tcp: add tcp_v4_fill_cb()/tcp_v4_restore_cb() James Morris reported kernel stack corruption bug [1] while running the SELinux testsuite, and bisected to a recent commit bffa72cf7f9d ("net: sk_buff rbnode reorg") We believe this commit is fine, but exposes an older bug. SELinux code runs from tcp_filter() and might send an ICMP, expecting IP options to be found in skb->cb[] using regular IPCB placement. We need to defer TCP mangling of skb->cb[] after tcp_filter() calls. This patch adds tcp_v4_fill_cb()/tcp_v4_restore_cb() in a very similar way we added them for IPv6. [1] [ 339.806024] SELinux: failure in selinux_parse_skb(), unable to parse packet [ 339.822505] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffffff81745af5 [ 339.822505] [ 339.852250] CPU: 4 PID: 3642 Comm: client Not tainted 4.15.0-rc1-test #15 [ 339.868498] Hardware name: LENOVO 10FGS0VA1L/30BC, BIOS FWKT68A 01/19/2017 [ 339.885060] Call Trace: [ 339.896875] [ 339.908103] dump_stack+0x63/0x87 [ 339.920645] panic+0xe8/0x248 [ 339.932668] ? ip_push_pending_frames+0x33/0x40 [ 339.946328] ? icmp_send+0x525/0x530 [ 339.958861] ? kfree_skbmem+0x60/0x70 [ 339.971431] __stack_chk_fail+0x1b/0x20 [ 339.984049] icmp_send+0x525/0x530 [ 339.996205] ? netlbl_skbuff_err+0x36/0x40 [ 340.008997] ? selinux_netlbl_err+0x11/0x20 [ 340.021816] ? selinux_socket_sock_rcv_skb+0x211/0x230 [ 340.035529] ? security_sock_rcv_skb+0x3b/0x50 [ 340.048471] ? sk_filter_trim_cap+0x44/0x1c0 [ 340.061246] ? tcp_v4_inbound_md5_hash+0x69/0x1b0 [ 340.074562] ? tcp_filter+0x2c/0x40 [ 340.086400] ? tcp_v4_rcv+0x820/0xa20 [ 340.098329] ? ip_local_deliver_finish+0x71/0x1a0 [ 340.111279] ? ip_local_deliver+0x6f/0xe0 [ 340.123535] ? ip_rcv_finish+0x3a0/0x3a0 [ 340.135523] ? ip_rcv_finish+0xdb/0x3a0 [ 340.147442] ? ip_rcv+0x27c/0x3c0 [ 340.158668] ? inet_del_offload+0x40/0x40 [ 340.170580] ? __netif_receive_skb_core+0x4ac/0x900 [ 340.183285] ? rcu_accelerate_cbs+0x5b/0x80 [ 340.195282] ? __netif_receive_skb+0x18/0x60 [ 340.207288] ? process_backlog+0x95/0x140 [ 340.218948] ? net_rx_action+0x26c/0x3b0 [ 340.230416] ? __do_softirq+0xc9/0x26a [ 340.241625] ? do_softirq_own_stack+0x2a/0x40 [ 340.253368] [ 340.262673] ? do_softirq+0x50/0x60 [ 340.273450] ? __local_bh_enable_ip+0x57/0x60 [ 340.285045] ? ip_finish_output2+0x175/0x350 [ 340.296403] ? ip_finish_output+0x127/0x1d0 [ 340.307665] ? nf_hook_slow+0x3c/0xb0 [ 340.318230] ? ip_output+0x72/0xe0 [ 340.328524] ? ip_fragment.constprop.54+0x80/0x80 [ 340.340070] ? ip_local_out+0x35/0x40 [ 340.350497] ? ip_queue_xmit+0x15c/0x3f0 [ 340.361060] ? __kmalloc_reserve.isra.40+0x31/0x90 [ 340.372484] ? __skb_clone+0x2e/0x130 [ 340.382633] ? tcp_transmit_skb+0x558/0xa10 [ 340.393262] ? tcp_connect+0x938/0xad0 [ 340.403370] ? ktime_get_with_offset+0x4c/0xb0 [ 340.414206] ? tcp_v4_connect+0x457/0x4e0 [ 340.424471] ? __inet_stream_connect+0xb3/0x300 [ 340.435195] ? inet_stream_connect+0x3b/0x60 [ 340.445607] ? SYSC_connect+0xd9/0x110 [ 340.455455] ? __audit_syscall_entry+0xaf/0x100 [ 340.466112] ? syscall_trace_enter+0x1d0/0x2b0 [ 340.476636] ? __audit_syscall_exit+0x209/0x290 [ 340.487151] ? SyS_connect+0xe/0x10 [ 340.496453] ? do_syscall_64+0x67/0x1b0 [ 340.506078] ? entry_SYSCALL64_slow_path+0x25/0x25 Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses") Signed-off-by: Eric Dumazet Reported-by: James Morris Tested-by: James Morris Tested-by: Casey Schaufler Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 59 ++++++++++++++++++++++++++++++++++++----------------- net/ipv6/tcp_ipv6.c | 10 +++++---- 2 files changed, 46 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c6bc0c4d19c6..77ea45da0fe9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1591,6 +1591,34 @@ int tcp_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(tcp_filter); +static void tcp_v4_restore_cb(struct sk_buff *skb) +{ + memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, + sizeof(struct inet_skb_parm)); +} + +static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, + const struct tcphdr *th) +{ + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), + sizeof(struct inet_skb_parm)); + barrier(); + + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff * 4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_tw_isn = 0; + TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->has_rxtstamp = + skb->tstamp || skb_hwtstamps(skb)->hwtstamp; +} + /* * From tcp_input.c */ @@ -1631,24 +1659,6 @@ int tcp_v4_rcv(struct sk_buff *skb) th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); - /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() - * barrier() makes sure compiler wont play fool^Waliasing games. - */ - memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), - sizeof(struct inet_skb_parm)); - barrier(); - - TCP_SKB_CB(skb)->seq = ntohl(th->seq); - TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + - skb->len - th->doff * 4); - TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; - TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); - TCP_SKB_CB(skb)->sacked = 0; - TCP_SKB_CB(skb)->has_rxtstamp = - skb->tstamp || skb_hwtstamps(skb)->hwtstamp; - lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, th->dest, sdif, &refcounted); @@ -1679,14 +1689,19 @@ process: sock_hold(sk); refcounted = true; nsk = NULL; - if (!tcp_filter(sk, skb)) + if (!tcp_filter(sk, skb)) { + th = (const struct tcphdr *)skb->data; + iph = ip_hdr(skb); + tcp_v4_fill_cb(skb, iph, th); nsk = tcp_check_req(sk, skb, req, false); + } if (!nsk) { reqsk_put(req); goto discard_and_relse; } if (nsk == sk) { reqsk_put(req); + tcp_v4_restore_cb(skb); } else if (tcp_child_process(sk, nsk, skb)) { tcp_v4_send_reset(nsk, skb); goto discard_and_relse; @@ -1712,6 +1727,7 @@ process: goto discard_and_relse; th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); + tcp_v4_fill_cb(skb, iph, th); skb->dev = NULL; @@ -1742,6 +1758,8 @@ no_tcp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; + tcp_v4_fill_cb(skb, iph, th); + if (tcp_checksum_complete(skb)) { csum_error: __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); @@ -1768,6 +1786,8 @@ do_time_wait: goto discard_it; } + tcp_v4_fill_cb(skb, iph, th); + if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; @@ -1784,6 +1804,7 @@ do_time_wait: if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; + tcp_v4_restore_cb(skb); refcounted = false; goto process; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index be11dc13aa70..1f04ec0e4a7a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1454,7 +1454,6 @@ process: struct sock *nsk; sk = req->rsk_listener; - tcp_v6_fill_cb(skb, hdr, th); if (tcp_v6_inbound_md5_hash(sk, skb)) { sk_drops_add(sk, skb); reqsk_put(req); @@ -1467,8 +1466,12 @@ process: sock_hold(sk); refcounted = true; nsk = NULL; - if (!tcp_filter(sk, skb)) + if (!tcp_filter(sk, skb)) { + th = (const struct tcphdr *)skb->data; + hdr = ipv6_hdr(skb); + tcp_v6_fill_cb(skb, hdr, th); nsk = tcp_check_req(sk, skb, req, false); + } if (!nsk) { reqsk_put(req); goto discard_and_relse; @@ -1492,8 +1495,6 @@ process: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - tcp_v6_fill_cb(skb, hdr, th); - if (tcp_v6_inbound_md5_hash(sk, skb)) goto discard_and_relse; @@ -1501,6 +1502,7 @@ process: goto discard_and_relse; th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); + tcp_v6_fill_cb(skb, hdr, th); skb->dev = NULL; -- cgit v1.2.3 From 4b380c42f7d00a395feede754f0bc2292eebe6e5 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sun, 3 Dec 2017 12:12:45 -0800 Subject: netfilter: nfnetlink_cthelper: Add missing permission checks The capability check in nfnetlink_rcv() verifies that the caller has CAP_NET_ADMIN in the namespace that "owns" the netlink socket. However, nfnl_cthelper_list is shared by all net namespaces on the system. An unprivileged user can create user and net namespaces in which he holds CAP_NET_ADMIN to bypass the netlink_net_capable() check: $ nfct helper list nfct v1.4.4: netlink error: Operation not permitted $ vpnns -- nfct helper list { .name = ftp, .queuenum = 0, .l3protonum = 2, .l4protonum = 6, .priv_data_len = 24, .status = enabled, }; Add capable() checks in nfnetlink_cthelper, as this is cleaner than trying to generalize the solution. Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cthelper.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 41628b393673..d33ce6d5ebce 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -407,6 +408,9 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth; int ret = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) return -EINVAL; @@ -611,6 +615,9 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth; bool tuple_set = false; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_cthelper_dump_table, @@ -678,6 +685,9 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth, *n; int j = 0, ret; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (tb[NFCTH_NAME]) helper_name = nla_data(tb[NFCTH_NAME]); -- cgit v1.2.3 From 6ab405114b0b229151ef06f4e31c7834dd09d0c0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 1 Dec 2017 01:46:07 +0100 Subject: netfilter: xt_bpf: add overflow checks Check whether inputs from userspace are too long (explicit length field too big or string not null-terminated) to avoid out-of-bounds reads. As far as I can tell, this can at worst lead to very limited kernel heap memory disclosure or oopses. This bug can be triggered by an unprivileged user even if the xt_bpf module is not loaded: iptables is available in network namespaces, and the xt_bpf module can be autoloaded. Triggering the bug with a classic BPF filter with fake length 0x1000 causes the following KASAN report: ================================================================== BUG: KASAN: slab-out-of-bounds in bpf_prog_create+0x84/0xf0 Read of size 32768 at addr ffff8801eff2c494 by task test/4627 CPU: 0 PID: 4627 Comm: test Not tainted 4.15.0-rc1+ #1 [...] Call Trace: dump_stack+0x5c/0x85 print_address_description+0x6a/0x260 kasan_report+0x254/0x370 ? bpf_prog_create+0x84/0xf0 memcpy+0x1f/0x50 bpf_prog_create+0x84/0xf0 bpf_mt_check+0x90/0xd6 [xt_bpf] [...] Allocated by task 4627: kasan_kmalloc+0xa0/0xd0 __kmalloc_node+0x47/0x60 xt_alloc_table_info+0x41/0x70 [x_tables] [...] The buggy address belongs to the object at ffff8801eff2c3c0 which belongs to the cache kmalloc-2048 of size 2048 The buggy address is located 212 bytes inside of 2048-byte region [ffff8801eff2c3c0, ffff8801eff2cbc0) [...] ================================================================== Fixes: e6f30c731718 ("netfilter: x_tables: add xt_bpf match") Signed-off-by: Jann Horn Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_bpf.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 041da0d9c06f..1f7fbd3c7e5a 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -27,6 +27,9 @@ static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len, { struct sock_fprog_kern program; + if (len > XT_BPF_MAX_NUM_INSTR) + return -EINVAL; + program.len = len; program.filter = insns; @@ -55,6 +58,9 @@ static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) mm_segment_t oldfs = get_fs(); int retval, fd; + if (strnlen(path, XT_BPF_PATH_MAX) == XT_BPF_PATH_MAX) + return -EINVAL; + set_fs(KERNEL_DS); fd = bpf_obj_get_user(path, 0); set_fs(oldfs); -- cgit v1.2.3 From 5ba7dcfe77037b67016263ea597a8b431692ecab Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 3 Dec 2017 11:26:45 +0100 Subject: batman-adv: Fix lock for ogm cnt access in batadv_iv_ogm_calc_tq The originator node object orig_neigh_node is used to when accessing the bcast_own(_sum) and real_packet_count information. The access to them has to be protected with the spinlock in orig_neigh_node. But the function uses the lock in orig_node instead. This is incorrect because they could be two different originator node objects. Fixes: 0ede9f41b217 ("batman-adv: protect bit operations to count OGMs with spinlock") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 1b659ab652fb..bbe8414b6ee7 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1214,7 +1214,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, orig_node->last_seen = jiffies; /* find packet count of corresponding one hop neighbor */ - spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); if_num = if_incoming->if_num; orig_eq_count = orig_neigh_node->bat_iv.bcast_own_sum[if_num]; neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); @@ -1224,7 +1224,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, } else { neigh_rq_count = 0; } - spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); + spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); /* pay attention to not get a value bigger than 100 % */ if (orig_eq_count > neigh_rq_count) -- cgit v1.2.3 From e599ea1410c3a2f55716f9c309587235cca32025 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Dec 2017 15:28:44 -0800 Subject: Revert "tcp: must block bh in __inet_twsk_hashdance()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We had to disable BH _before_ calling __inet_twsk_hashdance() in commit cfac7f836a71 ("tcp/dccp: block bh before arming time_wait timer"). This means we can revert 614bdd4d6e61 ("tcp: must block bh in __inet_twsk_hashdance()"). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_timewait_sock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c690cd0d9b3f..b563e0c46bac 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -93,7 +93,7 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, } /* - * Enter the time wait state. + * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. */ @@ -111,7 +111,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; - spin_lock_bh(&bhead->lock); + spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); @@ -137,7 +137,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - spin_unlock_bh(lock); + spin_unlock(lock); } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); -- cgit v1.2.3 From 029b6d1405504984b9d2661110ff1a17467d3426 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 2 Dec 2017 08:41:55 +0100 Subject: Revert "net: core: maybe return -EEXIST in __dev_alloc_name" This reverts commit d6f295e9def0; some userspace (in the case we noticed it's wpa_supplicant), is relying on the current error code to determine that a fixed name interface already exists. Reported-by: Jouni Malinen Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 07ed21d64f92..f47e96b62308 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1106,7 +1106,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) * when the name is long and there isn't enough space left * for the digits, or if all bits are used. */ - return p ? -ENFILE : -EEXIST; + return -ENFILE; } static int dev_alloc_name_ns(struct net *net, -- cgit v1.2.3 From 8afa10cbe281b10371fee5a87ab266e48d71a7f9 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 4 Dec 2017 13:31:11 +0200 Subject: net_sched: red: Avoid illegal values Check the qmin & qmax values doesn't overflow for the given Wlog value. Check that qmin <= qmax. Fixes: a783474591f2 ("[PKT_SCHED]: Generic RED layer") Signed-off-by: Nogah Frankel Signed-off-by: David S. Miller --- include/net/red.h | 11 +++++++++++ net/sched/sch_choke.c | 3 +++ net/sched/sch_gred.c | 3 +++ net/sched/sch_red.c | 2 ++ net/sched/sch_sfq.c | 3 +++ 5 files changed, 22 insertions(+) (limited to 'net') diff --git a/include/net/red.h b/include/net/red.h index 5918f78d36a0..9665582c4687 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -168,6 +168,17 @@ static inline void red_set_vars(struct red_vars *v) v->qcount = -1; } +static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog) +{ + if (fls(qth_min) + Wlog > 32) + return false; + if (fls(qth_max) + Wlog > 32) + return false; + if (qth_max < qth_min) + return false; + return true; +} + static inline void red_set_parms(struct red_parms *p, u32 qth_min, u32 qth_max, u8 Wlog, u8 Plog, u8 Scell_log, u8 *stab, u32 max_P) diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index b30a2c70bd48..531250fceb9e 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -369,6 +369,9 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) ctl = nla_data(tb[TCA_CHOKE_PARMS]); + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + return -EINVAL; + if (ctl->limit > CHOKE_MAX_QUEUE) return -EINVAL; diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 17c7130454bd..bc30f9186ac6 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -356,6 +356,9 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + return -EINVAL; + if (!q) { table->tab[dp] = q = *prealloc; *prealloc = NULL; diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 7f8ea9e297c3..9d874e60e032 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -212,6 +212,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; ctl = nla_data(tb[TCA_RED_PARMS]); + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + return -EINVAL; if (ctl->limit > 0) { child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 09c1203c1711..930e5bd26d3d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -639,6 +639,9 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) if (ctl->divisor && (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) return -EINVAL; + if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, + ctl_v1->Wlog)) + return -EINVAL; if (ctl_v1 && ctl_v1->qth_min) { p = kmalloc(sizeof(*p), GFP_KERNEL); if (!p) -- cgit v1.2.3 From 672ecbe1c977616aa720c9397589665b33e72610 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 4 Dec 2017 10:31:43 -0800 Subject: tipc: fix a null pointer deref on error path In tipc_topsrv_kern_subscr() when s->tipc_conn_new() fails we call tipc_close_conn() to clean up, but in this case calling conn_put() is just enough. This fixes the folllowing crash: kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 3085 Comm: syzkaller064164 Not tainted 4.15.0-rc1+ #137 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: 00000000c24413a5 task.stack: 000000005e8160b5 RIP: 0010:__lock_acquire+0xd55/0x47f0 kernel/locking/lockdep.c:3378 RSP: 0018:ffff8801cb5474a8 EFLAGS: 00010002 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000004 RSI: 0000000000000000 RDI: ffffffff85ecb400 RBP: ffff8801cb547830 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: ffffffff87489d60 R12: ffff8801cd2980c0 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000020 FS: 00000000014ee880(0000) GS:ffff8801db400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffee2426e40 CR3: 00000001cb85a000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:4004 __raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline] _raw_spin_lock_bh+0x31/0x40 kernel/locking/spinlock.c:175 spin_lock_bh include/linux/spinlock.h:320 [inline] tipc_subscrb_subscrp_delete+0x8f/0x470 net/tipc/subscr.c:201 tipc_subscrb_delete net/tipc/subscr.c:238 [inline] tipc_subscrb_release_cb+0x17/0x30 net/tipc/subscr.c:316 tipc_close_conn+0x171/0x270 net/tipc/server.c:204 tipc_topsrv_kern_subscr+0x724/0x810 net/tipc/server.c:514 tipc_group_create+0x702/0x9c0 net/tipc/group.c:184 tipc_sk_join net/tipc/socket.c:2747 [inline] tipc_setsockopt+0x249/0xc10 net/tipc/socket.c:2861 SYSC_setsockopt net/socket.c:1851 [inline] SyS_setsockopt+0x189/0x360 net/socket.c:1830 entry_SYSCALL_64_fastpath+0x1f/0x96 Fixes: 14c04493cb77 ("tipc: add ability to order and receive topology events in driver") Reported-by: syzbot Cc: Jon Maloy Cc: Ying Xue Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/tipc/server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index acaef80fb88c..2710101ba4c1 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -511,7 +511,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, s = con->server; scbr = s->tipc_conn_new(*conid); if (!scbr) { - tipc_close_conn(con); + conn_put(con); return false; } -- cgit v1.2.3 From a7d5f107b4978e08eeab599ee7449af34d034053 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 4 Dec 2017 22:00:20 +0100 Subject: tipc: fix memory leak in tipc_accept_from_sock() When the function tipc_accept_from_sock() fails to create an instance of struct tipc_subscriber it omits to free the already created instance of struct tipc_conn instance before it returns. We fix that with this commit. Reported-by: David S. Miller Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/server.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index 2710101ba4c1..d60c30342327 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -314,6 +314,7 @@ static int tipc_accept_from_sock(struct tipc_conn *con) newcon->usr_data = s->tipc_conn_new(newcon->conid); if (!newcon->usr_data) { sock_release(newsock); + conn_put(newcon); return -ENOMEM; } -- cgit v1.2.3 From c9d3fe9da094a9a7a3d3cd365b334b822e05f5e8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 5 Dec 2017 11:31:14 +0000 Subject: VSOCK: fix outdated sk_state value in hvs_release() Since commit 3b4477d2dcf2709d0be89e2a8dced3d0f4a017f2 ("VSOCK: use TCP state constants for sk_state") VSOCK has used TCP_* constants for sk_state. Commit b4562ca7925a3bedada87a3dd072dd5bad043288 ("hv_sock: add locking in the open/close/release code paths") reintroduced the SS_DISCONNECTING constant. This patch replaces the old SS_DISCONNECTING with the new TCP_CLOSING constant. CC: Dexuan Cui CC: Cathy Avery Signed-off-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: David S. Miller --- net/vmw_vsock/hyperv_transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 5583df708b8c..a827547aa102 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -487,7 +487,7 @@ static void hvs_release(struct vsock_sock *vsk) lock_sock(sk); - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; vsock_remove_sock(vsk); release_sock(sk); -- cgit v1.2.3 From 69c64866ce072dea1d1e59a0d61e0f66c0dffb76 Mon Sep 17 00:00:00 2001 From: Mohamed Ghannam Date: Tue, 5 Dec 2017 20:58:35 +0000 Subject: dccp: CVE-2017-8824: use-after-free in DCCP code Whenever the sock object is in DCCP_CLOSED state, dccp_disconnect() must free dccps_hc_tx_ccid and dccps_hc_rx_ccid and set to NULL. Signed-off-by: Mohamed Ghannam Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/dccp/proto.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/dccp/proto.c b/net/dccp/proto.c index b68168fcc06a..9d43c1f40274 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -259,6 +259,7 @@ int dccp_disconnect(struct sock *sk, int flags) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); int err = 0; const int old_state = sk->sk_state; @@ -278,6 +279,10 @@ int dccp_disconnect(struct sock *sk, int flags) sk->sk_err = ECONNRESET; dccp_clear_xmit_timers(sk); + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_rx_ccid = NULL; + dp->dccps_hc_tx_ccid = NULL; __skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_write_queue); -- cgit v1.2.3 From a5739435b5a3b8c449f8844ecd71a3b1e89f0a33 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Dec 2017 23:27:57 +0000 Subject: fix kcm_clone() 1) it's fput() or sock_release(), not both 2) don't do fd_install() until the last failure exit. 3) not a bug per se, but... don't attach socket to struct file until it's set up. Take reserving descriptor into the caller, move fd_install() to the caller, sanitize failure exits and calling conventions. Cc: stable@vger.kernel.org # v4.6+ Acked-by: Tom Herbert Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 71 +++++++++++++++++++++---------------------------------- 1 file changed, 27 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 0b750a22c4b9..c5fa634e63ca 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1625,60 +1625,35 @@ static struct proto kcm_proto = { }; /* Clone a kcm socket. */ -static int kcm_clone(struct socket *osock, struct kcm_clone *info, - struct socket **newsockp) +static struct file *kcm_clone(struct socket *osock) { struct socket *newsock; struct sock *newsk; - struct file *newfile; - int err, newfd; + struct file *file; - err = -ENFILE; newsock = sock_alloc(); if (!newsock) - goto out; + return ERR_PTR(-ENFILE); newsock->type = osock->type; newsock->ops = osock->ops; __module_get(newsock->ops->owner); - newfd = get_unused_fd_flags(0); - if (unlikely(newfd < 0)) { - err = newfd; - goto out_fd_fail; - } - - newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); - if (IS_ERR(newfile)) { - err = PTR_ERR(newfile); - goto out_sock_alloc_fail; - } - newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, &kcm_proto, true); if (!newsk) { - err = -ENOMEM; - goto out_sk_alloc_fail; + sock_release(newsock); + return ERR_PTR(-ENOMEM); } - sock_init_data(newsock, newsk); init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); - fd_install(newfd, newfile); - *newsockp = newsock; - info->fd = newfd; - - return 0; + file = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); + if (IS_ERR(file)) + sock_release(newsock); -out_sk_alloc_fail: - fput(newfile); -out_sock_alloc_fail: - put_unused_fd(newfd); -out_fd_fail: - sock_release(newsock); -out: - return err; + return file; } static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) @@ -1708,17 +1683,25 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } case SIOCKCMCLONE: { struct kcm_clone info; - struct socket *newsock = NULL; - - err = kcm_clone(sock, &info, &newsock); - if (!err) { - if (copy_to_user((void __user *)arg, &info, - sizeof(info))) { - err = -EFAULT; - sys_close(info.fd); - } - } + struct file *file; + + info.fd = get_unused_fd_flags(0); + if (unlikely(info.fd < 0)) + return info.fd; + file = kcm_clone(sock); + if (IS_ERR(file)) { + put_unused_fd(info.fd); + return PTR_ERR(file); + } + if (copy_to_user((void __user *)arg, &info, + sizeof(info))) { + put_unused_fd(info.fd); + fput(file); + return -EFAULT; + } + fd_install(info.fd, file); + err = 0; break; } default: -- cgit v1.2.3 From 016a266bdfeda268afb2228b6217fd4771334635 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Dec 2017 23:28:38 +0000 Subject: socketpair(): allocate descriptors first simplifies failure exits considerably... Reviewed-by: Eric Dumazet Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/socket.c | 89 ++++++++++++++++++++++++++---------------------------------- 1 file changed, 38 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 42d8e9c9ccd5..2df83c0bfde9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1365,88 +1365,75 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + /* + * reserve descriptors and make sure we won't fail + * to return them to userland. + */ + fd1 = get_unused_fd_flags(flags); + if (unlikely(fd1 < 0)) + return fd1; + + fd2 = get_unused_fd_flags(flags); + if (unlikely(fd2 < 0)) { + put_unused_fd(fd1); + return fd2; + } + + err = put_user(fd1, &usockvec[0]); + if (err) + goto out; + + err = put_user(fd2, &usockvec[1]); + if (err) + goto out; + /* * Obtain the first socket and check if the underlying protocol * supports the socketpair call. */ err = sock_create(family, type, protocol, &sock1); - if (err < 0) + if (unlikely(err < 0)) goto out; err = sock_create(family, type, protocol, &sock2); - if (err < 0) - goto out_release_1; - - err = sock1->ops->socketpair(sock1, sock2); - if (err < 0) - goto out_release_both; - - fd1 = get_unused_fd_flags(flags); - if (unlikely(fd1 < 0)) { - err = fd1; - goto out_release_both; + if (unlikely(err < 0)) { + sock_release(sock1); + goto out; } - fd2 = get_unused_fd_flags(flags); - if (unlikely(fd2 < 0)) { - err = fd2; - goto out_put_unused_1; + err = sock1->ops->socketpair(sock1, sock2); + if (unlikely(err < 0)) { + sock_release(sock2); + sock_release(sock1); + goto out; } newfile1 = sock_alloc_file(sock1, flags, NULL); if (IS_ERR(newfile1)) { err = PTR_ERR(newfile1); - goto out_put_unused_both; + sock_release(sock1); + sock_release(sock2); + goto out; } newfile2 = sock_alloc_file(sock2, flags, NULL); if (IS_ERR(newfile2)) { err = PTR_ERR(newfile2); - goto out_fput_1; + sock_release(sock2); + fput(newfile1); + goto out; } - err = put_user(fd1, &usockvec[0]); - if (err) - goto out_fput_both; - - err = put_user(fd2, &usockvec[1]); - if (err) - goto out_fput_both; - audit_fd_pair(fd1, fd2); fd_install(fd1, newfile1); fd_install(fd2, newfile2); - /* fd1 and fd2 may be already another descriptors. - * Not kernel problem. - */ - return 0; -out_fput_both: - fput(newfile2); - fput(newfile1); - put_unused_fd(fd2); - put_unused_fd(fd1); - goto out; - -out_fput_1: - fput(newfile1); - put_unused_fd(fd2); - put_unused_fd(fd1); - sock_release(sock2); - goto out; - -out_put_unused_both: +out: put_unused_fd(fd2); -out_put_unused_1: put_unused_fd(fd1); -out_release_both: - sock_release(sock2); -out_release_1: - sock_release(sock1); -out: return err; } -- cgit v1.2.3 From 8e1611e2357927b22892ecc062d65c99d0d89066 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Dec 2017 23:29:09 +0000 Subject: make sock_alloc_file() do sock_release() on failures This changes calling conventions (and simplifies the hell out the callers). New rules: once struct socket had been passed to sock_alloc_file(), it's been consumed either by struct file or by sock_release() done by sock_alloc_file(). Either way the caller should not do sock_release() after that point. Reviewed-by: Eric Dumazet Signed-off-by: Al Viro Signed-off-by: David S. Miller --- drivers/staging/lustre/lnet/lnet/lib-socket.c | 8 ++------ net/9p/trans_fd.c | 1 - net/kcm/kcmsock.c | 7 +------ net/sctp/socket.c | 1 - net/socket.c | 25 ++++++++----------------- 5 files changed, 11 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/drivers/staging/lustre/lnet/lnet/lib-socket.c b/drivers/staging/lustre/lnet/lnet/lib-socket.c index 539a26444f31..7d49d4865298 100644 --- a/drivers/staging/lustre/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustre/lnet/lnet/lib-socket.c @@ -71,16 +71,12 @@ lnet_sock_ioctl(int cmd, unsigned long arg) } sock_filp = sock_alloc_file(sock, 0, NULL); - if (IS_ERR(sock_filp)) { - sock_release(sock); - rc = PTR_ERR(sock_filp); - goto out; - } + if (IS_ERR(sock_filp)) + return PTR_ERR(sock_filp); rc = kernel_sock_unlocked_ioctl(sock_filp, cmd, arg); fput(sock_filp); -out: return rc; } diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 985046ae4231..80f5c79053a4 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -839,7 +839,6 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) if (IS_ERR(file)) { pr_err("%s (%d): failed to map fd\n", __func__, task_pid_nr(current)); - sock_release(csocket); kfree(p); return PTR_ERR(file); } diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index c5fa634e63ca..d4e98f20fc2a 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1629,7 +1629,6 @@ static struct file *kcm_clone(struct socket *osock) { struct socket *newsock; struct sock *newsk; - struct file *file; newsock = sock_alloc(); if (!newsock) @@ -1649,11 +1648,7 @@ static struct file *kcm_clone(struct socket *osock) sock_init_data(newsock, newsk); init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); - file = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); - if (IS_ERR(file)) - sock_release(newsock); - - return file; + return sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); } static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 014847e25648..eb17a911aa29 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5080,7 +5080,6 @@ static int sctp_getsockopt_peeloff_common(struct sock *sk, sctp_peeloff_arg_t *p *newfile = sock_alloc_file(newsock, 0, NULL); if (IS_ERR(*newfile)) { put_unused_fd(retval); - sock_release(newsock); retval = PTR_ERR(*newfile); *newfile = NULL; return retval; diff --git a/net/socket.c b/net/socket.c index 2df83c0bfde9..05f361faec45 100644 --- a/net/socket.c +++ b/net/socket.c @@ -406,8 +406,10 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) name.len = strlen(name.name); } path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name); - if (unlikely(!path.dentry)) + if (unlikely(!path.dentry)) { + sock_release(sock); return ERR_PTR(-ENOMEM); + } path.mnt = mntget(sock_mnt); d_instantiate(path.dentry, SOCK_INODE(sock)); @@ -415,9 +417,11 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &socket_file_ops); if (IS_ERR(file)) { - /* drop dentry, keep inode */ + /* drop dentry, keep inode for a bit */ ihold(d_inode(path.dentry)); path_put(&path); + /* ... and now kill it properly */ + sock_release(sock); return file; } @@ -1330,19 +1334,9 @@ SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) retval = sock_create(family, type, protocol, &sock); if (retval < 0) - goto out; - - retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); - if (retval < 0) - goto out_release; - -out: - /* It may be already another descriptor 8) Not kernel problem. */ - return retval; + return retval; -out_release: - sock_release(sock); - return retval; + return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); } /* @@ -1412,7 +1406,6 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, newfile1 = sock_alloc_file(sock1, flags, NULL); if (IS_ERR(newfile1)) { err = PTR_ERR(newfile1); - sock_release(sock1); sock_release(sock2); goto out; } @@ -1420,7 +1413,6 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, newfile2 = sock_alloc_file(sock2, flags, NULL); if (IS_ERR(newfile2)) { err = PTR_ERR(newfile2); - sock_release(sock2); fput(newfile1); goto out; } @@ -1549,7 +1541,6 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, if (IS_ERR(newfile)) { err = PTR_ERR(newfile); put_unused_fd(newfd); - sock_release(newsock); goto out_put; } -- cgit v1.2.3 From 71334963d01ed7ec61a958a5a6585172793f5a24 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 Dec 2017 11:27:59 +0100 Subject: wireless: replace usage of hexdump with od/sed Since od/sed are in posix, hopefully there's a better chance people will have them, over hexdump. Fixes: 90a53e4432b1 ("cfg80211: implement regdb signature checking") Signed-off-by: Johannes Berg --- net/wireless/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 278d979c211a..63cbb6432b2d 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -27,7 +27,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) @$(kecho) " GEN $@" @echo '#include "reg.h"' > $@ @echo 'const u8 shipped_regdb_certs[] = {' >> $@ - @for f in $^ ; do hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ ; done + @for f in $^ ; do od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done @echo '};' >> $@ @echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);' >> $@ @@ -36,6 +36,6 @@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ @$(kecho) " GEN $@" @echo '#include "reg.h"' > $@ @echo 'const u8 extra_regdb_certs[] = {' >> $@ - @for f in $^ ; do test -f $$f && hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ || true ; done + @for f in $^ ; do test -f $$f && od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done @echo '};' >> $@ @echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);' >> $@ -- cgit v1.2.3 From 715a12334764657bafb3ab964fb25f4e6115c770 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 Dec 2017 11:59:33 +0100 Subject: wireless: don't write C files on failures Change the scripting inside the shipped/extra certs C code generation to not write the file when there are any failures. That way, if the build aborts due to failures, we don't get into a situation where a dummy file has been created and the next build succeeds, but not with the desired output. Fixes: 90a53e4432b1 ("cfg80211: implement regdb signature checking") Signed-off-by: Johannes Berg --- net/wireless/Makefile | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 63cbb6432b2d..d7d6cb00c47b 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -25,17 +25,45 @@ endif $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) @$(kecho) " GEN $@" - @echo '#include "reg.h"' > $@ - @echo 'const u8 shipped_regdb_certs[] = {' >> $@ - @for f in $^ ; do od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done - @echo '};' >> $@ - @echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);' >> $@ + @(set -e; \ + allf=""; \ + for f in $^ ; do \ + # similar to hexdump -v -e '1/1 "0x%.2x," "\n"' \ + thisf=$$(od -An -v -tx1 < $$f | \ + sed -e 's/ /\n/g' | \ + sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | \ + sed -e 's/^/0x/;s/$$/,/'); \ + # file should not be empty - maybe command substitution failed? \ + test ! -z "$$thisf";\ + allf=$$allf$$thisf;\ + done; \ + ( \ + echo '#include "reg.h"'; \ + echo 'const u8 shipped_regdb_certs[] = {'; \ + echo "$$allf"; \ + echo '};'; \ + echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ + ) >> $@) $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509) @$(kecho) " GEN $@" - @echo '#include "reg.h"' > $@ - @echo 'const u8 extra_regdb_certs[] = {' >> $@ - @for f in $^ ; do test -f $$f && od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done - @echo '};' >> $@ - @echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);' >> $@ + @(set -e; \ + allf=""; \ + for f in $^ ; do \ + # similar to hexdump -v -e '1/1 "0x%.2x," "\n"' \ + thisf=$$(od -An -v -tx1 < $$f | \ + sed -e 's/ /\n/g' | \ + sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | \ + sed -e 's/^/0x/;s/$$/,/'); \ + # file should not be empty - maybe command substitution failed? \ + test ! -z "$$thisf";\ + allf=$$allf$$thisf;\ + done; \ + ( \ + echo '#include "reg.h"'; \ + echo 'const u8 extra_regdb_certs[] = {'; \ + echo "$$allf"; \ + echo '};'; \ + echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);'; \ + ) >> $@) -- cgit v1.2.3 From 916a27901de01446bcf57ecca4783f6cff493309 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Tue, 5 Dec 2017 15:42:41 -0800 Subject: netfilter: xt_osf: Add missing permission checks The capability check in nfnetlink_rcv() verifies that the caller has CAP_NET_ADMIN in the namespace that "owns" the netlink socket. However, xt_osf_fingers is shared by all net namespaces on the system. An unprivileged user can create user and net namespaces in which he holds CAP_NET_ADMIN to bypass the netlink_net_capable() check: vpnns -- nfnl_osf -f /tmp/pf.os vpnns -- nfnl_osf -f /tmp/pf.os -d These non-root operations successfully modify the systemwide OS fingerprint list. Add new capable() checks so that they can't. Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_osf.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 36e14b1f061d..a34f314a8c23 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -70,6 +71,9 @@ static int xt_osf_add_callback(struct net *net, struct sock *ctnl, struct xt_osf_finger *kf = NULL, *sf; int err = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; @@ -115,6 +119,9 @@ static int xt_osf_remove_callback(struct net *net, struct sock *ctnl, struct xt_osf_finger *sf; int err = -ENOENT; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; -- cgit v1.2.3 From 32d3e51a82d453762ef148b2c4fbc8a7ec374a88 Mon Sep 17 00:00:00 2001 From: Chris Dion Date: Wed, 6 Dec 2017 10:50:28 -0500 Subject: net_sched: use macvlan real dev trans_start in dev_trans_start() Macvlan devices are similar to vlans and do not update their own trans_start. In order for arp monitoring to work for a bond device when the slaves are macvlans, obtain its real device. Signed-off-by: Chris Dion Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 3839cbbdc32b..cd1b200acae7 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -277,6 +278,8 @@ unsigned long dev_trans_start(struct net_device *dev) if (is_vlan_dev(dev)) dev = vlan_dev_real_dev(dev); + else if (netif_is_macvlan(dev)) + dev = macvlan_dev_real_dev(dev); res = netdev_get_tx_queue(dev, 0)->trans_start; for (i = 1; i < dev->num_tx_queues; i++) { val = netdev_get_tx_queue(dev, i)->trans_start; -- cgit v1.2.3 From f3069c6d33f6ae63a1668737bc78aaaa51bff7ca Mon Sep 17 00:00:00 2001 From: Håkon Bugge Date: Wed, 6 Dec 2017 17:18:28 +0100 Subject: rds: Fix NULL pointer dereference in __rds_rdma_map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a fix for syzkaller719569, where memory registration was attempted without any underlying transport being loaded. Analysis of the case reveals that it is the setsockopt() RDS_GET_MR (2) and RDS_GET_MR_FOR_DEST (7) that are vulnerable. Here is an example stack trace when the bug is hit: BUG: unable to handle kernel NULL pointer dereference at 00000000000000c0 IP: __rds_rdma_map+0x36/0x440 [rds] PGD 2f93d03067 P4D 2f93d03067 PUD 2f93d02067 PMD 0 Oops: 0000 [#1] SMP Modules linked in: bridge stp llc tun rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache rds binfmt_misc sb_edac intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul c rc32_pclmul ghash_clmulni_intel pcbc aesni_intel crypto_simd glue_helper cryptd iTCO_wdt mei_me sg iTCO_vendor_support ipmi_si mei ipmi_devintf nfsd shpchp pcspkr i2c_i801 ioatd ma ipmi_msghandler wmi lpc_ich mfd_core auth_rpcgss nfs_acl lockd grace sunrpc ip_tables ext4 mbcache jbd2 mgag200 i2c_algo_bit drm_kms_helper ixgbe syscopyarea ahci sysfillrect sysimgblt libahci mdio fb_sys_fops ttm ptp libata sd_mod mlx4_core drm crc32c_intel pps_core megaraid_sas i2c_core dca dm_mirror dm_region_hash dm_log dm_mod CPU: 48 PID: 45787 Comm: repro_set2 Not tainted 4.14.2-3.el7uek.x86_64 #2 Hardware name: Oracle Corporation ORACLE SERVER X5-2L/ASM,MOBO TRAY,2U, BIOS 31110000 03/03/2017 task: ffff882f9190db00 task.stack: ffffc9002b994000 RIP: 0010:__rds_rdma_map+0x36/0x440 [rds] RSP: 0018:ffffc9002b997df0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff882fa2182580 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffc9002b997e40 RDI: ffff882fa2182580 RBP: ffffc9002b997e30 R08: 0000000000000000 R09: 0000000000000002 R10: ffff885fb29e3838 R11: 0000000000000000 R12: ffff882fa2182580 R13: ffff882fa2182580 R14: 0000000000000002 R15: 0000000020000ffc FS: 00007fbffa20b700(0000) GS:ffff882fbfb80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000c0 CR3: 0000002f98a66006 CR4: 00000000001606e0 Call Trace: rds_get_mr+0x56/0x80 [rds] rds_setsockopt+0x172/0x340 [rds] ? __fget_light+0x25/0x60 ? __fdget+0x13/0x20 SyS_setsockopt+0x80/0xe0 do_syscall_64+0x67/0x1b0 entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:0x7fbff9b117f9 RSP: 002b:00007fbffa20aed8 EFLAGS: 00000293 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00000000000c84a4 RCX: 00007fbff9b117f9 RDX: 0000000000000002 RSI: 0000400000000114 RDI: 000000000000109b RBP: 00007fbffa20af10 R08: 0000000000000020 R09: 00007fbff9dd7860 R10: 0000000020000ffc R11: 0000000000000293 R12: 0000000000000000 R13: 00007fbffa20b9c0 R14: 00007fbffa20b700 R15: 0000000000000021 Code: 41 56 41 55 49 89 fd 41 54 53 48 83 ec 18 8b 87 f0 02 00 00 48 89 55 d0 48 89 4d c8 85 c0 0f 84 2d 03 00 00 48 8b 87 00 03 00 00 <48> 83 b8 c0 00 00 00 00 0f 84 25 03 00 0 0 48 8b 06 48 8b 56 08 The fix is to check the existence of an underlying transport in __rds_rdma_map(). Signed-off-by: Håkon Bugge Reported-by: syzbot Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 8886f15abe90..bc2f1e0977d6 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -183,7 +183,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, long i; int ret; - if (rs->rs_bound_addr == 0) { + if (rs->rs_bound_addr == 0 || !rs->rs_transport) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } -- cgit v1.2.3 From 74c4b656c3d92ec4c824ea1a4afd726b7b6568c8 Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Wed, 6 Dec 2017 17:15:43 -0800 Subject: adding missing rcu_read_unlock in ipxip6_rcv commit 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels") introduced new exit point in ipxip6_rcv. however rcu_read_unlock is missing there. this diff is fixing this v1->v2: instead of doing rcu_read_unlock in place, we are going to "drop" section (to prevent skb leakage) Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels") Signed-off-by: Nikita V. Shirokov Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 3d3092adf1d2..db84f523656d 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -904,7 +904,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, if (t->parms.collect_md) { tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0); if (!tun_dst) - return 0; + goto drop; } ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); -- cgit v1.2.3 From 8632385022f2b05a6ca0b9e0f95575865de0e2ce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Dec 2017 11:08:19 -0800 Subject: tcp: use current time in tcp_rcv_space_adjust() When I switched rcv_rtt_est to high resolution timestamps, I forgot that tp->tcp_mstamp needed to be refreshed in tcp_rcv_space_adjust() Using an old timestamp leads to autotuning lags. Fixes: 645f4c6f2ebd ("tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps") Signed-off-by: Eric Dumazet Cc: Wei Wang Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 734cfc8ff76e..514c00732988 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -579,6 +579,7 @@ void tcp_rcv_space_adjust(struct sock *sk) int time; int copied; + tcp_mstamp_refresh(tp); time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) return; -- cgit v1.2.3 From 96307a0a75d8f1847debefd6a402339aac43e224 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 Dec 2017 14:26:09 +0100 Subject: netfilter: ipt_CLUSTERIP: fix clusterip_net_exit build regression The added check produces a build error when CONFIG_PROC_FS is disabled: net/ipv4/netfilter/ipt_CLUSTERIP.c: In function 'clusterip_net_exit': net/ipv4/netfilter/ipt_CLUSTERIP.c:822:28: error: 'cn' undeclared (first use in this function) This moves the variable declaration out of the #ifdef to make it available to the WARN_ON_ONCE(). Fixes: 613d0776d3fe ("netfilter: exit_net cleanup check added") Signed-off-by: Arnd Bergmann Reviewed-by: Vasily Averin Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index e35b8d074f06..69060e3abe85 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -813,8 +813,8 @@ static int clusterip_net_init(struct net *net) static void clusterip_net_exit(struct net *net) { -#ifdef CONFIG_PROC_FS struct clusterip_net *cn = net_generic(net, clusterip_net_id); +#ifdef CONFIG_PROC_FS proc_remove(cn->procdir); cn->procdir = NULL; #endif -- cgit v1.2.3 From 75bf50f4aaa1c78d769d854ab3d975884909e4fb Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Thu, 7 Dec 2017 21:54:27 +0100 Subject: xfrm: fix xfrm_do_migrate() with AEAD e.g(AES-GCM) copy geniv when cloning the xfrm state. x->geniv was not copied to the new state and migration would fail. xfrm_do_migrate .. xfrm_state_clone() .. .. esp_init_aead() crypto_alloc_aead() crypto_alloc_tfm() crypto_find_alg() return EAGAIN and failed Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1f5cee2269af..88d0a563e141 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1344,6 +1344,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, if (orig->aead) { x->aead = xfrm_algo_aead_clone(orig->aead); + x->geniv = orig->geniv; if (!x->aead) goto error; } -- cgit v1.2.3 From 732706afe1cc46ef48493b3d2b69c98f36314ae4 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 8 Dec 2017 08:07:25 +0100 Subject: xfrm: Fix stack-out-of-bounds with misconfigured transport mode policies. On policies with a transport mode template, we pass the addresses from the flowi to xfrm_state_find(), assuming that the IP addresses (and address family) don't change during transformation. Unfortunately our policy template validation is not strict enough. It is possible to configure policies with transport mode template where the address family of the template does not match the selectors address family. This lead to stack-out-of-bound reads because we compare arddesses of the wrong family. Fix this by refusing such a configuration, address family can not change on transport mode. We use the assumption that, on transport mode, the first templates address family must match the address family of the policy selector. Subsequent transport mode templates must mach the address family of the previous template. Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index ff58c37469d6..bdb48e5dba04 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1419,11 +1419,14 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) { + u16 prev_family; int i; if (nr > XFRM_MAX_DEPTH) return -EINVAL; + prev_family = family; + for (i = 0; i < nr; i++) { /* We never validated the ut->family value, so many * applications simply leave it at zero. The check was @@ -1435,6 +1438,12 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) if (!ut[i].family) ut[i].family = family; + if ((ut[i].mode == XFRM_MODE_TRANSPORT) && + (ut[i].family != prev_family)) + return -EINVAL; + + prev_family = ut[i].family; + switch (ut[i].family) { case AF_INET: break; -- cgit v1.2.3 From d4761754b4fb2ef8d9a1e9d121c4bec84e1fe292 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Thu, 7 Dec 2017 13:41:34 -0800 Subject: tcp: invalidate rate samples during SACK reneging Mark tcp_sock during a SACK reneging event and invalidate rate samples while marked. Such rate samples may overestimate bw by including packets that were SACKed before reneging. < ack 6001 win 10000 sack 7001:38001 < ack 7001 win 0 sack 8001:38001 // Reneg detected > seq 7001:8001 // RTO, SACK cleared. < ack 38001 win 10000 In above example the rate sample taken after the last ack will count 7001-38001 as delivered while the actual delivery rate likely could be much lower i.e. 7001-8001. This patch adds a new field tcp_sock.sack_reneg and marks it when we declare SACK reneging and entering TCP_CA_Loss, and unmarks it after the last rate sample was taken before moving back to TCP_CA_Open. This patch also invalidates rate samples taken while tcp_sock.is_sack_reneg is set. Fixes: b9f64820fb22 ("tcp: track data delivery rate for a TCP connection") Signed-off-by: Yousuk Seung Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Acked-by: Eric Dumazet Acked-by: Priyaranjan Jha Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 ++- include/net/tcp.h | 2 +- net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 10 ++++++++-- net/ipv4/tcp_rate.c | 10 +++++++--- 5 files changed, 19 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index df5d97a85e1a..ca4a6361389b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -224,7 +224,8 @@ struct tcp_sock { rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ - unused:3; + is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ + unused:2; u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ unused1 : 1, diff --git a/include/net/tcp.h b/include/net/tcp.h index 6998707e81f3..6da880d2f022 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1055,7 +1055,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct rate_sample *rs); + bool is_sack_reneg, struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); /* These functions determine how the current flow behaves in respect of SACK diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bf97317e6c97..f08eebe60446 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2412,6 +2412,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; tcp_clear_retrans(tp); inet_csk_delack_init(sk); /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 514c00732988..075c559570e6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1942,6 +1942,8 @@ void tcp_enter_loss(struct sock *sk) if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; + /* Mark SACK reneging until we recover from this loss event. */ + tp->is_sack_reneg = 1; } tcp_clear_all_retrans_hints(tp); @@ -2365,6 +2367,7 @@ static bool tcp_try_undo_recovery(struct sock *sk) return true; } tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; return false; } @@ -2398,8 +2401,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; - if (frto_undo || tcp_is_sack(tp)) + if (frto_undo || tcp_is_sack(tp)) { tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; + } return true; } return false; @@ -3496,6 +3501,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) struct tcp_sacktag_state sack_state; struct rate_sample rs = { .prior_delivered = 0 }; u32 prior_snd_una = tp->snd_una; + bool is_sack_reneg = tp->is_sack_reneg; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; @@ -3612,7 +3618,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ - tcp_rate_gen(sk, delivered, lost, sack_state.rate); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 3330a370d306..c61240e43923 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, /* Update the connection delivery information and generate a rate sample. */ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct rate_sample *rs) + bool is_sack_reneg, struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; @@ -124,8 +124,12 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ rs->losses = lost; /* freshly marked lost */ - /* Return an invalid sample if no timing information is available. */ - if (!rs->prior_mstamp) { + /* Return an invalid sample if no timing information is available or + * in recovery from loss with SACK reneging. Rate samples taken during + * a SACK reneging event may overestimate bw by including packets that + * were SACKed before the reneg. + */ + if (!rs->prior_mstamp || is_sack_reneg) { rs->delivered = -1; rs->interval_us = -1; return; -- cgit v1.2.3 From c589e69b508d29ed8e644dfecda453f71c02ec27 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 7 Dec 2017 12:43:30 -0500 Subject: tcp_bbr: record "full bw reached" decision in new full_bw_reached bit This commit records the "full bw reached" decision in a new full_bw_reached bit. This is a pure refactor that does not change the current behavior, but enables subsequent fixes and improvements. In particular, this enables simple and clean fixes because the full_bw and full_bw_cnt can be unconditionally zeroed without worrying about forgetting that we estimated we filled the pipe in Startup. And it enables future improvements because multiple code paths can be used for estimating that we filled the pipe in Startup; any new code paths only need to set this bit when they think the pipe is full. Note that this fix intentionally reduces the width of the full_bw_cnt counter, since we have never used the most significant bit. Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 69ee877574d0..3089c956b9f9 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -110,7 +110,8 @@ struct bbr { u32 lt_last_lost; /* LT intvl start: tp->lost */ u32 pacing_gain:10, /* current gain for setting pacing rate */ cwnd_gain:10, /* current gain for setting cwnd */ - full_bw_cnt:3, /* number of rounds without large bw gains */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ cycle_idx:3, /* current index in pacing_gain cycle array */ has_seen_rtt:1, /* have we seen an RTT sample yet? */ unused_b:5; @@ -180,7 +181,7 @@ static bool bbr_full_bw_reached(const struct sock *sk) { const struct bbr *bbr = inet_csk_ca(sk); - return bbr->full_bw_cnt >= bbr_full_bw_cnt; + return bbr->full_bw_reached; } /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ @@ -717,6 +718,7 @@ static void bbr_check_full_bw_reached(struct sock *sk, return; } ++bbr->full_bw_cnt; + bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; } /* If pipe is probably full, drain the queue and then enter steady-state. */ @@ -850,6 +852,7 @@ static void bbr_init(struct sock *sk) bbr->restore_cwnd = 0; bbr->round_start = 0; bbr->idle_restart = 0; + bbr->full_bw_reached = 0; bbr->full_bw = 0; bbr->full_bw_cnt = 0; bbr->cycle_mstamp = 0; -- cgit v1.2.3 From 2f6c498e4f15d27852c04ed46d804a39137ba364 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 7 Dec 2017 12:43:31 -0500 Subject: tcp_bbr: reset full pipe detection on loss recovery undo Fix BBR so that upon notification of a loss recovery undo BBR resets the full pipe detection (STARTUP exit) state machine. Under high reordering, reordering events can be interpreted as loss. If the reordering and spurious loss estimates are high enough, this could previously cause BBR to spuriously estimate that the pipe is full. Since spurious loss recovery means that our overall sending will have slowed down spuriously, this commit gives a flow more time to probe robustly for bandwidth and decide the pipe is really full. Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 3089c956b9f9..ab3ff14ea7f7 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -874,6 +874,10 @@ static u32 bbr_sndbuf_expand(struct sock *sk) */ static u32 bbr_undo_cwnd(struct sock *sk) { + struct bbr *bbr = inet_csk_ca(sk); + + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ + bbr->full_bw_cnt = 0; return tcp_sk(sk)->snd_cwnd; } -- cgit v1.2.3 From 600647d467c6d04b3954b41a6ee1795b5ae00550 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 7 Dec 2017 12:43:32 -0500 Subject: tcp_bbr: reset long-term bandwidth sampling on loss recovery undo Fix BBR so that upon notification of a loss recovery undo BBR resets long-term bandwidth sampling. Under high reordering, reordering events can be interpreted as loss. If the reordering and spurious loss estimates are high enough, this can cause BBR to spuriously estimate that we are seeing loss rates high enough to trigger long-term bandwidth estimation. To avoid that problem, this commit resets long-term bandwidth sampling on loss recovery undo events. Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index ab3ff14ea7f7..8322f26e770e 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -878,6 +878,7 @@ static u32 bbr_undo_cwnd(struct sock *sk) bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ bbr->full_bw_cnt = 0; + bbr_reset_lt_bw_sampling(sk); return tcp_sk(sk)->snd_cwnd; } -- cgit v1.2.3 From 0ce294d88457bccd7f9991f883fec80022a1ddbd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 7 Dec 2017 11:33:30 -0800 Subject: tcp: correctly test congestion state in RACK RACK does not test the loss recovery state correctly to compute the reordering window. It assumes if lost_out is zero then TCP is not in loss recovery. But it can be zero during recovery before calling tcp_rack_detect_loss(): when an ACK acknowledges all packets marked lost before receiving this ACK, but has not yet to discover new ones by tcp_rack_detect_loss(). The fix is to simply test the congestion state directly. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Priyaranjan Jha Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_recovery.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index d3ea89020c69..3143664902e9 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -55,7 +55,8 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) * to queuing or delayed ACKs. */ reo_wnd = 1000; - if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) { + if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) && + min_rtt != ~0U) { reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd); reo_wnd = min(reo_wnd, tp->srtt_us >> 3); } -- cgit v1.2.3 From cd1fc85b4399d47e3d6626301741ba8c38cd475a Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 7 Dec 2017 11:33:31 -0800 Subject: tcp: always evaluate losses in RACK upon undo When sender detects spurious retransmission, all packets marked lost are remarked to be in-flight. However some may be considered lost based on its timestamps in RACK. This patch forces RACK to re-evaluate, which may be skipped previously if the ACK does not advance RACK timestamp. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Priyaranjan Jha Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 075c559570e6..9550cc42de2d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2329,6 +2329,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) } tp->snd_cwnd_stamp = tcp_jiffies32; tp->undo_marker = 0; + tp->rack.advanced = 1; /* Force RACK to re-exam losses */ } static inline bool tcp_may_undo(const struct tcp_sock *tp) -- cgit v1.2.3 From 428aec5e69fa17d223e1495f395833c50770f7ae Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 7 Dec 2017 11:33:32 -0800 Subject: tcp: fix off-by-one bug in RACK RACK should mark a packet lost when remaining wait time is zero. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Priyaranjan Jha Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_recovery.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 3143664902e9..0c182303e62e 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -80,12 +80,12 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) */ remaining = tp->rack.rtt_us + reo_wnd - tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); - if (remaining < 0) { + if (remaining <= 0) { tcp_rack_mark_skb_lost(sk, skb); list_del_init(&skb->tcp_tsorted_anchor); } else { - /* Record maximum wait time (+1 to avoid 0) */ - *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); + /* Record maximum wait time */ + *reo_timeout = max_t(u32, *reo_timeout, remaining); } } } -- cgit v1.2.3 From 6065fd0d179b96ddc488c76542349bcb148a95fd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 7 Dec 2017 11:33:33 -0800 Subject: tcp: evaluate packet losses upon RTT change RACK skips an ACK unless it advances the most recently delivered TX timestamp (rack.mstamp). Since RACK also uses the most recent RTT to decide if a packet is lost, RACK should still run the loss detection whenever the most recent RTT changes. For example, an ACK that does not advance the timestamp but triggers the cwnd undo due to reordering, would then use the most recent (higher) RTT measurement to detect further losses. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Priyaranjan Jha Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_recovery.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 0c182303e62e..3a81720ac0c4 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -117,13 +117,8 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, { u32 rtt_us; - if (tp->rack.mstamp && - !tcp_rack_sent_after(xmit_time, tp->rack.mstamp, - end_seq, tp->rack.end_seq)) - return; - rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); - if (sacked & TCPCB_RETRANS) { + if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) { /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior * retransmission) was sacked. @@ -134,13 +129,15 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, * so it's at least one RTT (i.e., retransmission is at least * an RTT later). */ - if (rtt_us < tcp_min_rtt(tp)) - return; + return; } - tp->rack.rtt_us = rtt_us; - tp->rack.mstamp = xmit_time; - tp->rack.end_seq = end_seq; tp->rack.advanced = 1; + tp->rack.rtt_us = rtt_us; + if (tcp_rack_sent_after(xmit_time, tp->rack.mstamp, + end_seq, tp->rack.end_seq)) { + tp->rack.mstamp = xmit_time; + tp->rack.end_seq = end_seq; + } } /* We have waited long enough to accommodate reordering. Mark the expired -- cgit v1.2.3 From 0afe9d4ab9d40c281bdcdd118661fe8e4bdcef18 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 9 Dec 2017 21:10:10 +0100 Subject: mac80211: fix locking in ieee80211_sta_tear_down_BA_sessions Due to overlap between commit 1281103770e9 ("mac80211: Simplify locking in ieee80211_sta_tear_down_BA_sessions()") and the way that Luca modified commit 72e2c3438ba3 ("mac80211: tear down RX aggregations first") when sending it upstream from Intel's internal tree, we get the following warning: WARNING: CPU: 0 PID: 5472 at net/mac80211/agg-tx.c:315 ___ieee80211_stop_tx_ba_session+0x158/0x1f0 since there's no appropriate locking around the call to ___ieee80211_stop_tx_ba_session; Sara's original just had a call to the locked __ieee80211_stop_tx_ba_session (one less underscore) but it looks like Luca modified both of the calls when fixing it up for upstream, leading to the problem at hand. Move the locking appropriately to fix this problem. Reported-by: Kalle Valo Reported-by: Pavel Machek Tested-by: Pavel Machek Signed-off-by: Johannes Berg --- net/mac80211/ht.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 167f83b853e6..1621b6ab17ba 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -291,16 +291,15 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, int i; mutex_lock(&sta->ampdu_mlme.mtx); - for (i = 0; i < IEEE80211_NUM_TIDS; i++) { + for (i = 0; i < IEEE80211_NUM_TIDS; i++) ___ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, WLAN_REASON_QSTA_LEAVE_QBSS, reason != AGG_STOP_DESTROY_STA && reason != AGG_STOP_PEER_REQUEST); - } - mutex_unlock(&sta->ampdu_mlme.mtx); for (i = 0; i < IEEE80211_NUM_TIDS; i++) ___ieee80211_stop_tx_ba_session(sta, i, reason); + mutex_unlock(&sta->ampdu_mlme.mtx); /* stopping might queue the work again - so cancel only afterwards */ cancel_work_sync(&sta->ampdu_mlme.work); -- cgit v1.2.3 From 4564b187c16327045d87596e8980c65ba7b84c50 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 11 Dec 2017 12:33:47 +0100 Subject: nl80211: fix nl80211_send_iface() error paths Evidently I introduced a locking bug in my change here, the nla_put_failure sometimes needs to unlock. Fix it. Fixes: 44905265bc15 ("nl80211: don't expose wdev->ssid for most interfaces") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b1ac23ca20c8..213d0c498c97 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2610,7 +2610,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag case NL80211_IFTYPE_AP: if (wdev->ssid_len && nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid)) - goto nla_put_failure; + goto nla_put_failure_locked; break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: @@ -2623,7 +2623,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag if (!ssid_ie) break; if (nla_put(msg, NL80211_ATTR_SSID, ssid_ie[1], ssid_ie + 2)) - goto nla_put_failure; + goto nla_put_failure_locked; break; } default: @@ -2635,6 +2635,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag genlmsg_end(msg, hdr); return 0; + nla_put_failure_locked: + wdev_unlock(wdev); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; -- cgit v1.2.3 From f5b5702ac55b11113a94d6228d191c7f827b7a3b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Dec 2017 10:14:27 +0100 Subject: netfilter: exthdr: add missign attributes to policy Add missing netlink attribute policy. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_exthdr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a0a93d987a3b..47ec1046ad11 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -214,6 +214,8 @@ static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 }, + [NFTA_EXTHDR_OP] = { .type = NLA_U32 }, + [NFTA_EXTHDR_SREG] = { .type = NLA_U32 }, }; static int nft_exthdr_init(const struct nft_ctx *ctx, -- cgit v1.2.3 From 23715275e4fb6f64358a499d20928a9e93819f2f Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 11 Dec 2017 18:19:33 +0300 Subject: netfilter: ip6t_MASQUERADE: add dependency on conntrack module After commit 4d3a57f23dec ("netfilter: conntrack: do not enable connection tracking unless needed") conntrack is disabled by default unless some module explicitly declares dependency in particular network namespace. Fixes: a357b3f80bc8 ("netfilter: nat: add dependencies on conntrack module") Signed-off-by: Konstantin Khlebnikov Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_MASQUERADE.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 2b1a15846f9a..92c0047e7e33 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -33,13 +33,19 @@ static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; - return 0; + return nf_ct_netns_get(par->net, par->family); +} + +static void masquerade_tg6_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_netns_put(par->net, par->family); } static struct xt_target masquerade_tg6_reg __read_mostly = { .name = "MASQUERADE", .family = NFPROTO_IPV6, .checkentry = masquerade_tg6_checkentry, + .destroy = masquerade_tg6_destroy, .target = masquerade_tg6, .targetsize = sizeof(struct nf_nat_range), .table = "nat", -- cgit v1.2.3 From 93c647643b48f0131f02e45da3bd367d80443291 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Wed, 6 Dec 2017 12:12:27 -0800 Subject: netlink: Add netns check on taps Currently, a nlmon link inside a child namespace can observe systemwide netlink activity. Filter the traffic so that nlmon can only sniff netlink messages from its own netns. Test case: vpnns -- bash -c "ip link add nlmon0 type nlmon; \ ip link set nlmon0 up; \ tcpdump -i nlmon0 -q -w /tmp/nlmon.pcap -U" & sudo ip xfrm state add src 10.1.1.1 dst 10.1.1.2 proto esp \ spi 0x1 mode transport \ auth sha1 0x6162633132330000000000000000000000000000 \ enc aes 0x00000000000000000000000000000000 grep --binary abc123 /tmp/nlmon.pcap Signed-off-by: Kevin Cernekee Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b9e0ee4e22f5..79cc1bf36e4a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -253,6 +253,9 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, struct sock *sk = skb->sk; int ret = -ENOMEM; + if (!net_eq(dev_net(dev), sock_net(sk))) + return 0; + dev_hold(dev); if (is_vmalloc_addr(skb->head)) -- cgit v1.2.3 From 8f659a03a0ba9289b9aeb9b4470e6fb263d6f483 Mon Sep 17 00:00:00 2001 From: Mohamed Ghannam Date: Sun, 10 Dec 2017 03:50:58 +0000 Subject: net: ipv4: fix for a race condition in raw_sendmsg inet->hdrincl is racy, and could lead to uninitialized stack pointer usage, so its value should be read only once. Fixes: c008ba5bdc9f ("ipv4: Avoid reading user iov twice after raw_probe_proto_opt") Signed-off-by: Mohamed Ghannam Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/raw.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 33b70bfd1122..125c1eab3eaa 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -513,11 +513,16 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int err; struct ip_options_data opt_copy; struct raw_frag_vec rfv; + int hdrincl; err = -EMSGSIZE; if (len > 0xFFFF) goto out; + /* hdrincl should be READ_ONCE(inet->hdrincl) + * but READ_ONCE() doesn't work with bit fields + */ + hdrincl = inet->hdrincl; /* * Check the flags. */ @@ -593,7 +598,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* Linux does not mangle headers on raw sockets, * so that IP options + IP_HDRINCL is non-sense. */ - if (inet->hdrincl) + if (hdrincl) goto done; if (ipc.opt->opt.srr) { if (!daddr) @@ -615,12 +620,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, - inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | - (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), + (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); - if (!inet->hdrincl) { + if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; @@ -645,7 +650,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto do_confirm; back_from_confirm: - if (inet->hdrincl) + if (hdrincl) err = raw_send_hdrinc(sk, &fl4, msg, len, &rt, msg->msg_flags, &ipc.sockc); -- cgit v1.2.3 From 2342b8d95bcae5946e1b9b8d58645f37500ef2e7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 10 Dec 2017 15:40:51 +0800 Subject: sctp: make sure stream nums can match optlen in sctp_setsockopt_reset_streams Now in sctp_setsockopt_reset_streams, it only does the check optlen < sizeof(*params) for optlen. But it's not enough, as params->srs_number_streams should also match optlen. If the streams in params->srs_stream_list are less than stream nums in params->srs_number_streams, later when dereferencing the stream list, it could cause a slab-out-of-bounds crash, as reported by syzbot. This patch is to fix it by also checking the stream numbers in sctp_setsockopt_reset_streams to make sure at least it's not greater than the streams in the list. Fixes: 7f9d68ac944e ("sctp: implement sender-side procedures for SSN Reset Request Parameter") Reported-by: Dmitry Vyukov Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index eb17a911aa29..3253f724a995 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3891,13 +3891,17 @@ static int sctp_setsockopt_reset_streams(struct sock *sk, struct sctp_association *asoc; int retval = -EINVAL; - if (optlen < sizeof(struct sctp_reset_streams)) + if (optlen < sizeof(*params)) return -EINVAL; params = memdup_user(optval, optlen); if (IS_ERR(params)) return PTR_ERR(params); + if (params->srs_number_streams * sizeof(__u16) > + optlen - sizeof(*params)) + goto out; + asoc = sctp_id2assoc(sk, params->srs_assoc_id); if (!asoc) goto out; -- cgit v1.2.3 From d2950278d2d04ff5314abeb38d9c59c4e7c0ee53 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Dec 2017 18:23:09 +0100 Subject: xfrm: put policies when reusing pcpu xdst entry We need to put the policies when re-using the pcpu xdst entry, else this leaks the reference. Fixes: ec30d78c14a813db39a647b6a348b428 ("xfrm: add xdst pcpu cache") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 038ec68f6901..70aa5cb0c659 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1839,6 +1839,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, sizeof(struct xfrm_policy *) * num_pols) == 0 && xfrm_xdst_can_reuse(xdst, xfrm, err)) { dst_hold(&xdst->u.dst); + xfrm_pols_put(pols, num_pols); while (err > 0) xfrm_state_put(xfrm[--err]); return xdst; -- cgit v1.2.3 From 30791ac41927ebd3e75486f9504b6d2280463bf0 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Mon, 11 Dec 2017 00:05:46 -0800 Subject: tcp md5sig: Use skb's saddr when replying to an incoming segment The MD5-key that belongs to a connection is identified by the peer's IP-address. When we are in tcp_v4(6)_reqsk_send_ack(), we are replying to an incoming segment from tcp_check_req() that failed the seq-number checks. Thus, to find the correct key, we need to use the skb's saddr and not the daddr. This bug seems to have been there since quite a while, but probably got unnoticed because the consequences are not catastrophic. We will call tcp_v4_reqsk_send_ack only to send a challenge-ACK back to the peer, thus the connection doesn't really fail. Fixes: 9501f9722922 ("tcp md5sig: Let the caller pass appropriate key for tcp_v{4,6}_do_calc_md5_hash().") Signed-off-by: Christoph Paasch Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 77ea45da0fe9..94e28350f420 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -848,7 +848,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, - tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, + tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1f04ec0e4a7a..7178476b3d2f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -994,7 +994,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), 0, 0); } -- cgit v1.2.3 From b9b312a7a451e9c098921856e7cfbc201120e1a7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Dec 2017 07:03:38 -0800 Subject: ipv6: mcast: better catch silly mtu values syzkaller reported crashes in IPv6 stack [1] Xin Long found that lo MTU was set to silly values. IPv6 stack reacts to changes to small MTU, by disabling itself under RTNL. But there is a window where threads not using RTNL can see a wrong device mtu. This can lead to surprises, in mld code where it is assumed the mtu is suitable. Fix this by reading device mtu once and checking IPv6 minimal MTU. [1] skbuff: skb_over_panic: text:0000000010b86b8d len:196 put:20 head:000000003b477e60 data:000000000e85441e tail:0xd4 end:0xc0 dev:lo ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:104! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.15.0-rc2-mm1+ #39 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:skb_panic+0x15c/0x1f0 net/core/skbuff.c:100 RSP: 0018:ffff8801db307508 EFLAGS: 00010286 RAX: 0000000000000082 RBX: ffff8801c517e840 RCX: 0000000000000000 RDX: 0000000000000082 RSI: 1ffff1003b660e61 RDI: ffffed003b660e95 RBP: ffff8801db307570 R08: 1ffff1003b660e23 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff85bd4020 R13: ffffffff84754ed2 R14: 0000000000000014 R15: ffff8801c4e26540 FS: 0000000000000000(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000463610 CR3: 00000001c6698000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_over_panic net/core/skbuff.c:109 [inline] skb_put+0x181/0x1c0 net/core/skbuff.c:1694 add_grhead.isra.24+0x42/0x3b0 net/ipv6/mcast.c:1695 add_grec+0xa55/0x1060 net/ipv6/mcast.c:1817 mld_send_cr net/ipv6/mcast.c:1903 [inline] mld_ifc_timer_expire+0x4d2/0x770 net/ipv6/mcast.c:2448 call_timer_fn+0x23b/0x840 kernel/time/timer.c:1320 expire_timers kernel/time/timer.c:1357 [inline] __run_timers+0x7e1/0xb60 kernel/time/timer.c:1660 run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686 __do_softirq+0x29d/0xbb2 kernel/softirq.c:285 invoke_softirq kernel/softirq.c:365 [inline] irq_exit+0x1d3/0x210 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:540 [inline] smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052 apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:920 Signed-off-by: Eric Dumazet Reported-by: syzbot Tested-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index fc6d7d143f2c..844642682b83 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1682,16 +1682,16 @@ static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel) } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, - int type, struct mld2_grec **ppgr) + int type, struct mld2_grec **ppgr, unsigned int mtu) { - struct net_device *dev = pmc->idev->dev; struct mld2_report *pmr; struct mld2_grec *pgr; - if (!skb) - skb = mld_newpack(pmc->idev, dev->mtu); - if (!skb) - return NULL; + if (!skb) { + skb = mld_newpack(pmc->idev, mtu); + if (!skb) + return NULL; + } pgr = skb_put(skb, sizeof(struct mld2_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; @@ -1714,10 +1714,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, struct mld2_grec *pgr = NULL; struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; + unsigned int mtu; if (pmc->mca_flags & MAF_NOREPORT) return skb; + mtu = READ_ONCE(dev->mtu); + if (mtu < IPV6_MIN_MTU) + return skb; + isquery = type == MLD2_MODE_IS_INCLUDE || type == MLD2_MODE_IS_EXCLUDE; truncate = type == MLD2_MODE_IS_EXCLUDE || @@ -1738,7 +1743,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) mld_sendpack(skb); - skb = mld_newpack(idev, dev->mtu); + skb = mld_newpack(idev, mtu); } } first = 1; @@ -1774,12 +1779,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, pgr->grec_nsrcs = htons(scount); if (skb) mld_sendpack(skb); - skb = mld_newpack(idev, dev->mtu); + skb = mld_newpack(idev, mtu); first = 1; scount = 0; } if (first) { - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) @@ -1814,7 +1819,7 @@ empty_source: mld_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) -- cgit v1.2.3 From b5476022bbada3764609368f03329ca287528dc8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Dec 2017 07:17:39 -0800 Subject: ipv4: igmp: guard against silly MTU values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IPv4 stack reacts to changes to small MTU, by disabling itself under RTNL. But there is a window where threads not using RTNL can see a wrong device mtu. This can lead to surprises, in igmp code where it is assumed the mtu is suitable. Fix this by reading device mtu once and checking IPv4 minimal MTU. This patch adds missing IPV4_MIN_MTU define, to not abuse ETH_MIN_MTU anymore. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 1 + net/ipv4/devinet.c | 2 +- net/ipv4/igmp.c | 24 +++++++++++++++--------- net/ipv4/ip_tunnel.c | 4 ++-- 4 files changed, 19 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index 9896f46cbbf1..af8addbaa3c1 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -34,6 +34,7 @@ #include #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ +#define IPV4_MIN_MTU 68 /* RFC 791 */ struct sock; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a4573bccd6da..7a93359fbc72 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1428,7 +1428,7 @@ skip: static bool inetdev_valid_mtu(unsigned int mtu) { - return mtu >= 68; + return mtu >= IPV4_MIN_MTU; } static void inetdev_send_gratuitous_arp(struct net_device *dev, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d1f8f302dbf3..50448a220a1f 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -404,16 +404,17 @@ static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, - int type, struct igmpv3_grec **ppgr) + int type, struct igmpv3_grec **ppgr, unsigned int mtu) { struct net_device *dev = pmc->interface->dev; struct igmpv3_report *pih; struct igmpv3_grec *pgr; - if (!skb) - skb = igmpv3_newpack(dev, dev->mtu); - if (!skb) - return NULL; + if (!skb) { + skb = igmpv3_newpack(dev, mtu); + if (!skb) + return NULL; + } pgr = skb_put(skb, sizeof(struct igmpv3_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; @@ -436,12 +437,17 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; + unsigned int mtu; if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return skb; + mtu = READ_ONCE(dev->mtu); + if (mtu < IPV4_MIN_MTU) + return skb; + isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; truncate = type == IGMPV3_MODE_IS_EXCLUDE || @@ -462,7 +468,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) igmpv3_sendpack(skb); - skb = igmpv3_newpack(dev, dev->mtu); + skb = igmpv3_newpack(dev, mtu); } } first = 1; @@ -498,12 +504,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, pgr->grec_nsrcs = htons(scount); if (skb) igmpv3_sendpack(skb); - skb = igmpv3_newpack(dev, dev->mtu); + skb = igmpv3_newpack(dev, mtu); first = 1; scount = 0; } if (first) { - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) @@ -538,7 +544,7 @@ empty_source: igmpv3_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index fe6fee728ce4..5ddb1cb52bd4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -349,8 +349,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev) dev->needed_headroom = t_hlen + hlen; mtu -= (dev->hard_header_len + t_hlen); - if (mtu < 68) - mtu = 68; + if (mtu < IPV4_MIN_MTU) + mtu = IPV4_MIN_MTU; return mtu; } -- cgit v1.2.3 From 83593010d3b87601e775f240ce46c53ddf25828d Mon Sep 17 00:00:00 2001 From: Pravin Shedge Date: Mon, 11 Dec 2017 22:09:46 +0530 Subject: net: remove duplicate includes These duplicate includes have been found with scripts/checkincludes.pl but they have been removed manually to avoid removing false positives. Signed-off-by: Pravin Shedge Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 1 - net/dsa/slave.c | 1 - net/netfilter/nf_conntrack_netlink.c | 1 - net/sched/act_meta_mark.c | 1 - net/sched/act_meta_skbtcindex.c | 1 - net/sched/cls_api.c | 1 - net/sched/cls_u32.c | 1 - 7 files changed, 7 deletions(-) (limited to 'net') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 1c4810919a0a..b9057478d69c 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d6e7a642493b..a95a55f79137 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 59c08997bfdf..332b51870ed7 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #ifdef CONFIG_NF_NAT_NEEDED #include diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c index 1e3f10e5da99..6445184b2759 100644 --- a/net/sched/act_meta_mark.c +++ b/net/sched/act_meta_mark.c @@ -22,7 +22,6 @@ #include #include #include -#include static int skbmark_encode(struct sk_buff *skb, void *skbdata, struct tcf_meta_info *e) diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c index 2ea1f26c9e96..7221437ca3a6 100644 --- a/net/sched/act_meta_skbtcindex.c +++ b/net/sched/act_meta_skbtcindex.c @@ -22,7 +22,6 @@ #include #include #include -#include static int skbtcindex_encode(struct sk_buff *skb, void *skbdata, struct tcf_meta_info *e) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ddcf04b4ab43..f40256a3e7f0 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ac152b4f4247..507859cdd1cb 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -45,7 +45,6 @@ #include #include #include -#include #include struct tc_u_knode { -- cgit v1.2.3 From c545a945d0d9ea2ea2c7d23d43cf0d86e32cd7cf Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 11 Dec 2017 19:11:55 +0100 Subject: tipc: eliminate potential memory leak In the function tipc_sk_mcast_rcv() we call refcount_dec(&skb->users) on received sk_buffers. Since the reference counter might hit zero at this point, we have a potential memory leak. We fix this by replacing refcount_dec() with kfree_skb(). Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 5d18c0caa92b..41127d0b925e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1140,7 +1140,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, __skb_dequeue(arrvq); __skb_queue_tail(inputq, skb); } - refcount_dec(&skb->users); + kfree_skb(skb); spin_unlock_bh(&inputq->lock); continue; } -- cgit v1.2.3 From a46182b00290839fa3fa159d54fd3237bd8669f0 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Mon, 11 Dec 2017 11:13:45 -0800 Subject: net: igmp: Use correct source address on IGMPv3 reports Closing a multicast socket after the final IPv4 address is deleted from an interface can generate a membership report that uses the source IP from a different interface. The following test script, run from an isolated netns, reproduces the issue: #!/bin/bash ip link add dummy0 type dummy ip link add dummy1 type dummy ip link set dummy0 up ip link set dummy1 up ip addr add 10.1.1.1/24 dev dummy0 ip addr add 192.168.99.99/24 dev dummy1 tcpdump -U -i dummy0 & socat EXEC:"sleep 2" \ UDP4-DATAGRAM:239.101.1.68:8889,ip-add-membership=239.0.1.68:10.1.1.1 & sleep 1 ip addr del 10.1.1.1/24 dev dummy0 sleep 5 kill %tcpdump RFC 3376 specifies that the report must be sent with a valid IP source address from the destination subnet, or from address 0.0.0.0. Add an extra check to make sure this is the case. Signed-off-by: Kevin Cernekee Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 50448a220a1f..726f6b608274 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include @@ -321,6 +322,23 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) return scount; } +/* source address selection per RFC 3376 section 4.2.13 */ +static __be32 igmpv3_get_srcaddr(struct net_device *dev, + const struct flowi4 *fl4) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (!in_dev) + return htonl(INADDR_ANY); + + for_ifa(in_dev) { + if (inet_ifa_match(fl4->saddr, ifa)) + return fl4->saddr; + } endfor_ifa(in_dev); + + return htonl(INADDR_ANY); +} + static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; @@ -368,7 +386,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) pip->frag_off = htons(IP_DF); pip->ttl = 1; pip->daddr = fl4.daddr; - pip->saddr = fl4.saddr; + pip->saddr = igmpv3_get_srcaddr(dev, &fl4); pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(net, skb, NULL); -- cgit v1.2.3 From 9ee11bd03cb1a5c3ca33c2bb70e7ed325f68890f Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 12 Dec 2017 16:28:58 -0800 Subject: tcp: fix potential underestimation on rcv_rtt When ms timestamp is used, current logic uses 1us in tcp_rcv_rtt_update() when the real rcv_rtt is within 1 - 999us. This could cause rcv_rtt underestimation. Fix it by always using a min value of 1ms if ms timestamp is used. Fixes: 645f4c6f2ebd ("tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps") Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9550cc42de2d..45f750e85714 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -508,9 +508,6 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) u32 new_sample = tp->rcv_rtt_est.rtt_us; long m = sample; - if (m == 0) - m = 1; - if (new_sample != 0) { /* If we sample in larger samples in the non-timestamp * case, we could grossly overestimate the RTT especially @@ -547,6 +544,8 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time); + if (!delta_us) + delta_us = 1; tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: @@ -563,8 +562,11 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; - u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + u32 delta_us; + if (!delta) + delta = 1; + delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); tcp_rcv_rtt_update(tp, delta_us, 0); } } -- cgit v1.2.3 From 4688eb7cf3ae2c2721d1dacff5c1384cba47d176 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Dec 2017 18:22:52 -0800 Subject: tcp: refresh tcp_mstamp from timers callbacks Only the retransmit timer currently refreshes tcp_mstamp We should do the same for delayed acks and keepalives. Even if RFC 7323 does not request it, this is consistent to what linux did in the past, when TS values were based on jiffies. Fixes: 385e20706fac ("tcp: use tp->tcp_mstamp in output path") Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Mike Maloney Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Mike Maloney Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 16df6dd44b98..968fda198376 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -264,6 +264,7 @@ void tcp_delack_timer_handler(struct sock *sk) icsk->icsk_ack.pingpong = 0; icsk->icsk_ack.ato = TCP_ATO_MIN; } + tcp_mstamp_refresh(tcp_sk(sk)); tcp_send_ack(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } @@ -632,6 +633,7 @@ static void tcp_keepalive_timer (struct timer_list *t) goto out; } + tcp_mstamp_refresh(tp); if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (tp->linger2 >= 0) { const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; -- cgit v1.2.3 From bdcf0a423ea1c40bbb40e7ee483b50fc8aa3d758 Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Thu, 14 Dec 2017 15:33:12 -0800 Subject: kernel: make groups_sort calling a responsibility group_info allocators In testing, we found that nfsd threads may call set_groups in parallel for the same entry cached in auth.unix.gid, racing in the call of groups_sort, corrupting the groups for that entry and leading to permission denials for the client. This patch: - Make groups_sort globally visible. - Move the call to groups_sort to the modifiers of group_info - Remove the call to groups_sort from set_groups Link: http://lkml.kernel.org/r/20171211151420.18655-1-thiago.becker@gmail.com Signed-off-by: Thiago Rafael Becker Reviewed-by: Matthew Wilcox Reviewed-by: NeilBrown Acked-by: "J. Bruce Fields" Cc: Al Viro Cc: Martin Schwidefsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/compat_linux.c | 1 + fs/nfsd/auth.c | 3 +++ include/linux/cred.h | 1 + kernel/groups.c | 5 +++-- kernel/uid16.c | 1 + net/sunrpc/auth_gss/gss_rpc_xdr.c | 1 + net/sunrpc/auth_gss/svcauth_gss.c | 1 + net/sunrpc/svcauth_unix.c | 2 ++ 8 files changed, 13 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index f04db3779b34..59eea9c65d3e 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -263,6 +263,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplis return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 697f8ae7792d..f650e475d8f0 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -60,6 +60,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) gi->gid[i] = exp->ex_anon_gid; else gi->gid[i] = rqgi->gid[i]; + + /* Each thread allocates its own gi, no race */ + groups_sort(gi); } } else { gi = get_group_info(rqgi); diff --git a/include/linux/cred.h b/include/linux/cred.h index 099058e1178b..631286535d0f 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -83,6 +83,7 @@ extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); extern bool may_setgroups(void); +extern void groups_sort(struct group_info *); /* * The security context of a task diff --git a/kernel/groups.c b/kernel/groups.c index e357bc800111..daae2f2dc6d4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b) return gid_gt(a, b) - gid_lt(a, b); } -static void groups_sort(struct group_info *group_info) +void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } +EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) @@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp) void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); - groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; } @@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/uid16.c b/kernel/uid16.c index ce74a4901d2b..ef1da2a5f9bd 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index c4778cae58ef..444380f968f1 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -231,6 +231,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr, goto out_free_groups; creds->cr_group_info->gid[i] = kgid; } + groups_sort(creds->cr_group_info); return 0; out_free_groups: diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 5dd4e6c9fef2..26531193fce4 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -481,6 +481,7 @@ static int rsc_parse(struct cache_detail *cd, goto out; rsci.cred.cr_group_info->gid[i] = kgid; } + groups_sort(rsci.cred.cr_group_info); /* mech name */ len = qword_get(&mesg, buf, mlen); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 740b67d5a733..af7f28fb8102 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -520,6 +520,7 @@ static int unix_gid_parse(struct cache_detail *cd, ug.gi->gid[i] = kgid; } + groups_sort(ug.gi); ugp = unix_gid_lookup(cd, uid); if (ugp) { struct cache_head *ch; @@ -819,6 +820,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv)); cred->cr_group_info->gid[i] = kgid; } + groups_sort(cred->cr_group_info); if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { *authp = rpc_autherr_badverf; return SVC_DENIED; -- cgit v1.2.3 From 2d17d8d79e77ff3f1b35b87522fc72fa562260ff Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 14 Dec 2017 17:17:56 -0800 Subject: xdp: linearize skb in netif_receive_generic_xdp() In netif_receive_generic_xdp(), it is necessary to linearize all nonlinear skb. However, in current implementation, skb with troom <= 0 are not linearized. This patch fixes this by calling skb_linearize() for all nonlinear skb. Fixes: de8f3a83b0a0 ("bpf: add meta pointer for direct access") Signed-off-by: Song Liu Acked-by: Martin KaFai Lau Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index f47e96b62308..01ee854454a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3904,7 +3904,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) goto do_drop; - if (troom > 0 && __skb_linearize(skb)) + if (skb_linearize(skb)) goto do_drop; } -- cgit v1.2.3 From 35b99dffc3f710cafceee6c8c6ac6a98eb2cb4bf Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 13 Dec 2017 14:41:06 -0500 Subject: sock: free skb in skb_complete_tx_timestamp on error skb_complete_tx_timestamp must ingest the skb it is passed. Call kfree_skb if the skb cannot be enqueued. Fixes: b245be1f4db1 ("net-timestamp: no-payload only sysctl") Fixes: 9ac25fc06375 ("net: fix socket refcounting in skb_complete_tx_timestamp()") Reported-by: Richard Cochran Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6b0ff396fa9d..a592ca025fc4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4293,7 +4293,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk = skb->sk; if (!skb_may_tx_timestamp(sk, false)) - return; + goto err; /* Take a reference to prevent skb_orphan() from freeing the socket, * but only if the socket refcount is not zero. @@ -4302,7 +4302,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, *skb_hwtstamps(skb) = *hwtstamps; __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); sock_put(sk); + return; } + +err: + kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); -- cgit v1.2.3 From 7a4fa29106d9a38ef005f5ab15d493c259f269c0 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Thu, 14 Dec 2017 15:54:29 +0200 Subject: net: sched: Add TCA_HW_OFFLOAD Qdiscs can be offloaded to HW, but current implementation isn't uniform. Instead, qdiscs either pass information about offload status via their TCA_OPTIONS or omit it altogether. Introduce a new attribute - TCA_HW_OFFLOAD that would form a uniform uAPI for the offloading status of qdiscs. Signed-off-by: Yuval Mintz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + include/uapi/linux/rtnetlink.h | 1 + net/sched/sch_api.c | 2 ++ 3 files changed, 4 insertions(+) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 65d0d25f2648..83a3e47d5845 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -71,6 +71,7 @@ struct Qdisc { * qdisc_tree_decrease_qlen() should stop. */ #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ +#define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index d8b5f80c2ea6..843e29aa3cac 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -557,6 +557,7 @@ enum { TCA_PAD, TCA_DUMP_INVISIBLE, TCA_CHAIN, + TCA_HW_OFFLOAD, __TCA_MAX }; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b6c4f536876b..0f1eab99ff4e 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -795,6 +795,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, tcm->tcm_info = refcount_read(&q->refcnt); if (nla_put_string(skb, TCA_KIND, q->ops->id)) goto nla_put_failure; + if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED))) + goto nla_put_failure; if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; qlen = q->q.qlen; -- cgit v1.2.3 From 428a68af3a7c3a3380ff1f750a24d213f370f89f Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Thu, 14 Dec 2017 15:54:30 +0200 Subject: net: sched: Move to new offload indication in RED Let RED utilize the new internal flag, TCQ_F_OFFLOADED, to mark a given qdisc as offloaded instead of using a dedicated indication. Also, change internal logic into looking at said flag when possible. Fixes: 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc") Signed-off-by: Yuval Mintz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/sch_red.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 9d874e60e032..f0747eb87dc4 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -157,6 +157,7 @@ static int red_offload(struct Qdisc *sch, bool enable) .handle = sch->handle, .parent = sch->parent, }; + int err; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; @@ -171,7 +172,14 @@ static int red_offload(struct Qdisc *sch, bool enable) opt.command = TC_RED_DESTROY; } - return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt); + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt); + + if (!err && enable) + sch->flags |= TCQ_F_OFFLOADED; + else + sch->flags &= ~TCQ_F_OFFLOADED; + + return err; } static void red_destroy(struct Qdisc *sch) @@ -274,7 +282,7 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt) return red_change(sch, opt); } -static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt) +static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt) { struct net_device *dev = qdisc_dev(sch); struct tc_red_qopt_offload hw_stats = { @@ -286,21 +294,12 @@ static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt) .stats.qstats = &sch->qstats, }, }; - int err; - opt->flags &= ~TC_RED_OFFLOADED; - if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) - return 0; - - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, - &hw_stats); - if (err == -EOPNOTSUPP) + if (!(sch->flags & TCQ_F_OFFLOADED)) return 0; - if (!err) - opt->flags |= TC_RED_OFFLOADED; - - return err; + return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, + &hw_stats); } static int red_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -319,7 +318,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) int err; sch->qstats.backlog = q->qdisc->qstats.backlog; - err = red_dump_offload(sch, &opt); + err = red_dump_offload_stats(sch, &opt); if (err) goto nla_put_failure; @@ -347,7 +346,7 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .marked = q->stats.prob_mark + q->stats.forced_mark, }; - if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc) { + if (sch->flags & TCQ_F_OFFLOADED) { struct red_stats hw_stats = {0}; struct tc_red_qopt_offload hw_stats_request = { .command = TC_RED_XSTATS, -- cgit v1.2.3 From c05fad5713b81b049ec6ac4eb2d304030b1efdce Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Fri, 15 Dec 2017 10:46:16 +0800 Subject: ip_gre: fix wrong return value of erspan_rcv If pskb_may_pull return failed, return PACKET_REJECT instead of -ENOMEM. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Cc: William Tu Signed-off-by: Haishuang Yan Acked-by: William Tu Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index bb6239169b1a..9c1735632c8c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -266,7 +266,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, len = gre_hdr_len + sizeof(*ershdr); if (unlikely(!pskb_may_pull(skb, len))) - return -ENOMEM; + return PACKET_REJECT; iph = ip_hdr(skb); ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len); -- cgit v1.2.3 From ccede7598588ae344143f82fb763912535648d58 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2017 14:04:04 -0500 Subject: xprtrdma: Spread reply processing over more CPUs Commit d8f532d20ee4 ("xprtrdma: Invoke rpcrdma_reply_handler directly from RECV completion") introduced a performance regression for NFS I/O small enough to not need memory registration. In multi- threaded benchmarks that generate primarily small I/O requests, IOPS throughput is reduced by nearly a third. This patch restores the previous level of throughput. Because workqueues are typically BOUND (in particular ib_comp_wq, nfsiod_workqueue, and rpciod_workqueue), NFS/RDMA workloads tend to aggregate on the CPU that is handling Receive completions. The usual approach to addressing this problem is to create a QP and CQ for each CPU, and then schedule transactions on the QP for the CPU where you want the transaction to complete. The transaction then does not require an extra context switch during completion to end up on the same CPU where the transaction was started. This approach doesn't work for the Linux NFS/RDMA client because currently the Linux NFS client does not support multiple connections per client-server pair, and the RDMA core API does not make it straightforward for ULPs to determine which CPU is responsible for handling Receive completions for a CQ. So for the moment, record the CPU number in the rpcrdma_req before the transport sends each RPC Call. Then during Receive completion, queue the RPC completion on that same CPU. Additionally, move all RPC completion processing to the deferred handler so that even RPCs with simple small replies complete on the CPU that sent the corresponding RPC Call. Fixes: d8f532d20ee4 ("xprtrdma: Invoke rpcrdma_reply_handler ...") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 6 +----- net/sunrpc/xprtrdma/transport.c | 2 ++ net/sunrpc/xprtrdma/verbs.c | 2 +- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 4 files changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index ed34dc0f144c..a3f2ab283aeb 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1408,11 +1408,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", __func__, rep, req, be32_to_cpu(rep->rr_xid)); - if (list_empty(&req->rl_registered) && - !test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) - rpcrdma_complete_rqst(rep); - else - queue_work(rpcrdma_receive_wq, &rep->rr_work); + queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work); return; out_badstatus: diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 646c24494ea7..6ee1ad8978f3 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "xprt_rdma.h" @@ -656,6 +657,7 @@ xprt_rdma_allocate(struct rpc_task *task) task->tk_pid, __func__, rqst->rq_callsize, rqst->rq_rcvsize, req); + req->rl_cpu = smp_processor_id(); req->rl_connect_cookie = 0; /* our reserved value */ rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 710b3f77db82..8607c029c0dd 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -83,7 +83,7 @@ rpcrdma_alloc_wq(void) struct workqueue_struct *recv_wq; recv_wq = alloc_workqueue("xprtrdma_receive", - WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI, + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!recv_wq) return -ENOMEM; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 51686d9eac5f..1342f743f1c4 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -342,6 +342,7 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_list; + int rl_cpu; unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; -- cgit v1.2.3 From 90d91b0cd371193d9dbfa9beacab8ab9a4cb75e0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Dec 2017 21:24:08 -0500 Subject: SUNRPC: Fix a race in the receive code path We must ensure that the call to rpc_sleep_on() in xprt_transmit() cannot race with the call to xprt_complete_rqst(). Reported-by: Chuck Lever Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=317 Fixes: ce7c252a8c74 ("SUNRPC: Add a separate spinlock to protect..") Cc: stable@vger.kernel.org # 4.14+ Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 02a9bacb239b..5b06f6906a27 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1001,6 +1001,7 @@ void xprt_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; + unsigned int connect_cookie; int status, numreqs; dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); @@ -1024,6 +1025,7 @@ void xprt_transmit(struct rpc_task *task) } else if (!req->rq_bytes_sent) return; + connect_cookie = xprt->connect_cookie; req->rq_xtime = ktime_get(); status = xprt->ops->send_request(task); trace_xprt_transmit(xprt, req->rq_xid, status); @@ -1047,20 +1049,28 @@ void xprt_transmit(struct rpc_task *task) xprt->stat.bklog_u += xprt->backlog.qlen; xprt->stat.sending_u += xprt->sending.qlen; xprt->stat.pending_u += xprt->pending.qlen; + spin_unlock_bh(&xprt->transport_lock); - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else { + req->rq_connect_cookie = connect_cookie; + if (rpc_reply_expected(task) && !READ_ONCE(req->rq_reply_bytes_recvd)) { /* - * Sleep on the pending queue since - * we're expecting a reply. + * Sleep on the pending queue if we're expecting a reply. + * The spinlock ensures atomicity between the test of + * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). */ - if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task)) + spin_lock(&xprt->recv_lock); + if (!req->rq_reply_bytes_recvd) { rpc_sleep_on(&xprt->pending, task, xprt_timer); - req->rq_connect_cookie = xprt->connect_cookie; + /* + * Send an extra queue wakeup call if the + * connection was dropped in case the call to + * rpc_sleep_on() raced. + */ + if (!xprt_connected(xprt)) + xprt_wake_pending_tasks(xprt, -ENOTCONN); + } + spin_unlock(&xprt->recv_lock); } - spin_unlock_bh(&xprt->transport_lock); } static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) -- cgit v1.2.3 From 343723dd51ef1025a860e54df9472b5ba21ee3d9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 15 Dec 2017 12:40:12 +0100 Subject: net: sched: fix clsact init error path Since in qdisc_create, the destroy op is called when init fails, we don't do cleanup in init and leave it up to destroy. This fixes use-after-free when trying to put already freed block. Fixes: 6e40cf2d4dee ("net: sched: use extended variants of block_get/put in ingress and clsact qdiscs") Signed-off-by: Jiri Pirko Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_api.c | 4 ++-- net/sched/sch_ingress.c | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f40256a3e7f0..b91ea03e3afa 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -351,6 +351,8 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, { struct tcf_chain *chain; + if (!block) + return; /* Hold a refcnt for all chains, except 0, so that they don't disappear * while we are iterating. */ @@ -377,8 +379,6 @@ void tcf_block_put(struct tcf_block *block) { struct tcf_block_ext_info ei = {0, }; - if (!block) - return; tcf_block_put_ext(block, block->q, &ei); } diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 5ecc38f35d47..5e1cd2e5df87 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -190,7 +190,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); if (err) - goto err_egress_block_get; + return err; net_inc_ingress_queue(); net_inc_egress_queue(); @@ -198,10 +198,6 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) sch->flags |= TCQ_F_CPUSTATS; return 0; - -err_egress_block_get: - tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); - return err; } static void clsact_destroy(struct Qdisc *sch) -- cgit v1.2.3 From b59e6979a86384e68b0ab6ffeab11f0034fba82d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 15 Dec 2017 12:40:13 +0100 Subject: net: sched: fix static key imbalance in case of ingress/clsact_init error Move static key increments to the beginning of the init function so they pair 1:1 with decrements in ingress/clsact_destroy, which is called in case ingress/clsact_init fails. Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure") Signed-off-by: Jiri Pirko Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_ingress.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 5e1cd2e5df87..fc1286f499c1 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -68,6 +68,8 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; + net_inc_ingress_queue(); + mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; @@ -78,7 +80,6 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) if (err) return err; - net_inc_ingress_queue(); sch->flags |= TCQ_F_CPUSTATS; return 0; @@ -172,6 +173,9 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; + net_inc_ingress_queue(); + net_inc_egress_queue(); + mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress); q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; @@ -192,9 +196,6 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) if (err) return err; - net_inc_ingress_queue(); - net_inc_egress_queue(); - sch->flags |= TCQ_F_CPUSTATS; return 0; -- cgit v1.2.3 From 588753f1eb18978512b1c9b85fddb457d46f9033 Mon Sep 17 00:00:00 2001 From: Brendan McGrath Date: Wed, 13 Dec 2017 22:14:57 +1100 Subject: ipv6: icmp6: Allow icmp messages to be looped back One example of when an ICMPv6 packet is required to be looped back is when a host acts as both a Multicast Listener and a Multicast Router. A Multicast Router will listen on address ff02::16 for MLDv2 messages. Currently, MLDv2 messages originating from a Multicast Listener running on the same host as the Multicast Router are not being delivered to the Multicast Router. This is due to dst.input being assigned the default value of dst_discard. This results in the packet being looped back but discarded before being delivered to the Multicast Router. This patch sets dst.input to ip6_input to ensure a looped back packet is delivered to the Multicast Router. Signed-off-by: Brendan McGrath Signed-off-by: David S. Miller --- net/ipv6/route.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7a8d1500d374..2bc91c349273 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2336,6 +2336,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, } rt->dst.flags |= DST_HOST; + rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; -- cgit v1.2.3 From 234833991e14681f61cbfd93e65a5c976089cf11 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 18 Dec 2017 17:34:16 +0100 Subject: tipc: fix lost member events bug Group messages are not supposed to be returned to sender when the destination socket disappears. This is done correctly for regular traffic messages, by setting the 'dest_droppable' bit in the header. But we forget to do that in group protocol messages. This has the effect that such messages may sometimes bounce back to the sender, be perceived as a legitimate peer message, and wreak general havoc for the rest of the session. In particular, we have seen that a member in state LEAVING may go back to state RECLAIMED or REMITTED, hence causing suppression of an otherwise expected 'member down' event to the user. We fix this by setting the 'dest_droppable' bit even in group protocol messages. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index 95fec2c057d6..efb5714e7a85 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -648,6 +648,7 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, } else if (mtyp == GRP_REMIT_MSG) { msg_set_grp_remitted(hdr, m->window); } + msg_set_dest_droppable(hdr, true); __skb_queue_tail(xmitq, skb); } -- cgit v1.2.3 From 3f42f5fe31c8715a34064bfd7b788488d1ea2f7c Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 18 Dec 2017 18:13:34 +0100 Subject: tipc: remove leaving group member from all lists A group member going into state LEAVING should never go back to any other state before it is finally deleted. However, this might happen if the socket needs to send out a RECLAIM message during this interval. Since we forget to remove the leaving member from the group's 'active' or 'pending' list, the member might be selected for reclaiming, change state to RECLAIMING, and get stuck in this state instead of being deleted. This might lead to suppression of the expected 'member down' event to the receiver. We fix this by removing the member from all lists, except the RB tree, at the moment it goes into state LEAVING. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index efb5714e7a85..b96ec429bb9b 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -699,6 +699,9 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); + list_del_init(&m->list); + list_del_init(&m->congested); + *usr_wakeup = true; /* Wait until WITHDRAW event is received */ if (m->state != MBR_LEAVING) { @@ -710,8 +713,6 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, ehdr = buf_msg(m->event_msg); msg_set_grp_bc_seqno(ehdr, m->bc_syncpt); __skb_queue_tail(inputq, m->event_msg); - *usr_wakeup = true; - list_del_init(&m->congested); return; case GRP_ADV_MSG: if (!m) @@ -863,6 +864,7 @@ void tipc_group_member_evt(struct tipc_group *grp, msg_set_grp_bc_seqno(hdr, m->bc_rcv_nxt); __skb_queue_tail(inputq, skb); } + list_del_init(&m->list); list_del_init(&m->congested); } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); -- cgit v1.2.3 From 5c468674d17056148da06218d4da5d04baf22eac Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 18 Dec 2017 14:07:25 +0800 Subject: sctp: fix the issue that a __u16 variable may overflow in sctp_ulpq_renege Now when reneging events in sctp_ulpq_renege(), the variable freed could be increased by a __u16 value twice while freed is of __u16 type. It means freed may overflow at the second addition. This patch is to fix it by using __u32 type for 'freed', while at it, also to remove 'if (chunk)' check, as all renege commands are generated in sctp_eat_data and it can't be NULL. Reported-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/ulpqueue.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index a71be33f3afe..e36ec5dd64c6 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -1084,29 +1084,21 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { - struct sctp_association *asoc; - __u16 needed, freed; - - asoc = ulpq->asoc; + struct sctp_association *asoc = ulpq->asoc; + __u32 freed = 0; + __u16 needed; - if (chunk) { - needed = ntohs(chunk->chunk_hdr->length); - needed -= sizeof(struct sctp_data_chunk); - } else - needed = SCTP_DEFAULT_MAXWINDOW; - - freed = 0; + needed = ntohs(chunk->chunk_hdr->length) - + sizeof(struct sctp_data_chunk); if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { freed = sctp_ulpq_renege_order(ulpq, needed); - if (freed < needed) { + if (freed < needed) freed += sctp_ulpq_renege_frags(ulpq, needed - freed); - } } /* If able to free enough room, accept this chunk. */ - if (chunk && (freed >= needed)) { - int retval; - retval = sctp_ulpq_tail_data(ulpq, chunk, gfp); + if (freed >= needed) { + int retval = sctp_ulpq_tail_data(ulpq, chunk, gfp); /* * Enter partial delivery if chunk has not been * delivered; otherwise, drain the reassembly queue. -- cgit v1.2.3 From d196975905b2bb227dc54547c03b3d9d0013805c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 18 Dec 2017 14:13:17 +0800 Subject: sctp: add SCTP_CID_RECONF conversion in sctp_cname Whenever a new type of chunk is added, the corresp conversion in sctp_cname should be added. Otherwise, in some places, pr_debug will print it as "unknown chunk". Fixes: cc16f00f6529 ("sctp: add support for generating stream reconf ssn reset request chunk") Signed-off-by: Xin Long Acked-by: Marcelo R. Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/debug.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sctp/debug.c b/net/sctp/debug.c index 3f619fdcbf0a..291c97b07058 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -78,6 +78,9 @@ const char *sctp_cname(const union sctp_subtype cid) case SCTP_CID_AUTH: return "AUTH"; + case SCTP_CID_RECONF: + return "RECONF"; + default: break; } -- cgit v1.2.3 From 84aeb437ab98a2bce3d4b2111c79723aedfceb33 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 18 Dec 2017 17:35:09 +0200 Subject: net: bridge: fix early call to br_stp_change_bridge_id and plug newlink leaks The early call to br_stp_change_bridge_id in bridge's newlink can cause a memory leak if an error occurs during the newlink because the fdb entries are not cleaned up if a different lladdr was specified, also another minor issue is that it generates fdb notifications with ifindex = 0. Another unrelated memory leak is the bridge sysfs entries which get added on NETDEV_REGISTER event, but are not cleaned up in the newlink error path. To remove this special case the call to br_stp_change_bridge_id is done after netdev register and we cleanup the bridge on changelink error via br_dev_delete to plug all leaks. This patch makes netlink bridge destruction on newlink error the same as dellink and ioctl del which is necessary since at that point we have a fully initialized bridge device. To reproduce the issue: $ ip l add br0 address 00:11:22:33:44:55 type bridge group_fwd_mask 1 RTNETLINK answers: Invalid argument $ rmmod bridge [ 1822.142525] ============================================================================= [ 1822.143640] BUG bridge_fdb_cache (Tainted: G O ): Objects remaining in bridge_fdb_cache on __kmem_cache_shutdown() [ 1822.144821] ----------------------------------------------------------------------------- [ 1822.145990] Disabling lock debugging due to kernel taint [ 1822.146732] INFO: Slab 0x0000000092a844b2 objects=32 used=2 fp=0x00000000fef011b0 flags=0x1ffff8000000100 [ 1822.147700] CPU: 2 PID: 13584 Comm: rmmod Tainted: G B O 4.15.0-rc2+ #87 [ 1822.148578] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 1822.150008] Call Trace: [ 1822.150510] dump_stack+0x78/0xa9 [ 1822.151156] slab_err+0xb1/0xd3 [ 1822.151834] ? __kmalloc+0x1bb/0x1ce [ 1822.152546] __kmem_cache_shutdown+0x151/0x28b [ 1822.153395] shutdown_cache+0x13/0x144 [ 1822.154126] kmem_cache_destroy+0x1c0/0x1fb [ 1822.154669] SyS_delete_module+0x194/0x244 [ 1822.155199] ? trace_hardirqs_on_thunk+0x1a/0x1c [ 1822.155773] entry_SYSCALL_64_fastpath+0x23/0x9a [ 1822.156343] RIP: 0033:0x7f929bd38b17 [ 1822.156859] RSP: 002b:00007ffd160e9a98 EFLAGS: 00000202 ORIG_RAX: 00000000000000b0 [ 1822.157728] RAX: ffffffffffffffda RBX: 00005578316ba090 RCX: 00007f929bd38b17 [ 1822.158422] RDX: 00007f929bd9ec60 RSI: 0000000000000800 RDI: 00005578316ba0f0 [ 1822.159114] RBP: 0000000000000003 R08: 00007f929bff5f20 R09: 00007ffd160e8a11 [ 1822.159808] R10: 00007ffd160e9860 R11: 0000000000000202 R12: 00007ffd160e8a80 [ 1822.160513] R13: 0000000000000000 R14: 0000000000000000 R15: 00005578316ba090 [ 1822.161278] INFO: Object 0x000000007645de29 @offset=0 [ 1822.161666] INFO: Object 0x00000000d5df2ab5 @offset=128 Fixes: 30313a3d5794 ("bridge: Handle IFLA_ADDRESS correctly when creating bridge device") Fixes: 5b8d5429daa0 ("bridge: netlink: register netdevice before executing changelink") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index d0ef0a8e8831..015f465c514b 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1262,19 +1262,20 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev, struct net_bridge *br = netdev_priv(dev); int err; + err = register_netdevice(dev); + if (err) + return err; + if (tb[IFLA_ADDRESS]) { spin_lock_bh(&br->lock); br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS])); spin_unlock_bh(&br->lock); } - err = register_netdevice(dev); - if (err) - return err; - err = br_changelink(dev, tb, data, extack); if (err) - unregister_netdevice(dev); + br_dev_delete(dev, NULL); + return err; } -- cgit v1.2.3 From acf568ee859f098279eadf551612f103afdacb4e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 15 Dec 2017 16:40:44 +1100 Subject: xfrm: Reinject transport-mode packets through tasklet This is an old bugbear of mine: https://www.mail-archive.com/netdev@vger.kernel.org/msg03894.html By crafting special packets, it is possible to cause recursion in our kernel when processing transport-mode packets at levels that are only limited by packet size. The easiest one is with DNAT, but an even worse one is where UDP encapsulation is used in which case you just have to insert an UDP encapsulation header in between each level of recursion. This patch avoids this problem by reinjecting tranport-mode packets through a tasklet. Fixes: b05e106698d9 ("[IPV4/6]: Netfilter IPsec input hooks") Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 +++ net/ipv4/xfrm4_input.c | 12 ++++++++++- net/ipv6/xfrm6_input.c | 10 ++++++++- net/xfrm/xfrm_input.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index dc28a98ce97c..ae35991b5877 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1570,6 +1570,9 @@ int xfrm_init_state(struct xfrm_state *x); int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); +int xfrm_trans_queue(struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)); int xfrm_output_resume(struct sk_buff *skb, int err); int xfrm_output(struct sock *sk, struct sk_buff *skb); int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index e50b7fea57ee..bcfc00e88756 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -23,6 +23,12 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm4_extract_header(skb); } +static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + return dst_input(skb); +} + static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -33,7 +39,11 @@ static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, iph->tos, skb->dev)) goto drop; } - return dst_input(skb); + + if (xfrm_trans_queue(skb, xfrm4_rcv_encap_finish2)) + goto drop; + + return 0; drop: kfree_skb(skb); return NET_RX_DROP; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index fe04e23af986..841f4a07438e 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -32,6 +32,14 @@ int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, } EXPORT_SYMBOL(xfrm6_rcv_spi); +static int xfrm6_transport_finish2(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + if (xfrm_trans_queue(skb, ip6_rcv_finish)) + __kfree_skb(skb); + return -1; +} + int xfrm6_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); @@ -56,7 +64,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, - ip6_rcv_finish); + xfrm6_transport_finish2); return -1; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index da6447389ffb..3f6f6f8c9fa5 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -8,15 +8,29 @@ * */ +#include +#include #include #include #include +#include #include #include #include #include #include +struct xfrm_trans_tasklet { + struct tasklet_struct tasklet; + struct sk_buff_head queue; +}; + +struct xfrm_trans_cb { + int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb); +}; + +#define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) + static struct kmem_cache *secpath_cachep __read_mostly; static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); @@ -25,6 +39,8 @@ static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1]; static struct gro_cells gro_cells; static struct net_device xfrm_napi_dev; +static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet); + int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo) { int err = 0; @@ -477,9 +493,41 @@ int xfrm_input_resume(struct sk_buff *skb, int nexthdr) } EXPORT_SYMBOL(xfrm_input_resume); +static void xfrm_trans_reinject(unsigned long data) +{ + struct xfrm_trans_tasklet *trans = (void *)data; + struct sk_buff_head queue; + struct sk_buff *skb; + + __skb_queue_head_init(&queue); + skb_queue_splice_init(&trans->queue, &queue); + + while ((skb = __skb_dequeue(&queue))) + XFRM_TRANS_SKB_CB(skb)->finish(dev_net(skb->dev), NULL, skb); +} + +int xfrm_trans_queue(struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)) +{ + struct xfrm_trans_tasklet *trans; + + trans = this_cpu_ptr(&xfrm_trans_tasklet); + + if (skb_queue_len(&trans->queue) >= netdev_max_backlog) + return -ENOBUFS; + + XFRM_TRANS_SKB_CB(skb)->finish = finish; + skb_queue_tail(&trans->queue, skb); + tasklet_schedule(&trans->tasklet); + return 0; +} +EXPORT_SYMBOL(xfrm_trans_queue); + void __init xfrm_input_init(void) { int err; + int i; init_dummy_netdev(&xfrm_napi_dev); err = gro_cells_init(&gro_cells, &xfrm_napi_dev); @@ -490,4 +538,13 @@ void __init xfrm_input_init(void) sizeof(struct sec_path), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + + for_each_possible_cpu(i) { + struct xfrm_trans_tasklet *trans; + + trans = &per_cpu(xfrm_trans_tasklet, i); + __skb_queue_head_init(&trans->queue); + tasklet_init(&trans->tasklet, xfrm_trans_reinject, + (unsigned long)trans); + } } -- cgit v1.2.3 From 5d32407396b0433f9b738fcfcb9599bcba7379ae Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 14 Dec 2017 14:33:38 +0100 Subject: cfg80211: always rewrite generated files from scratch Currently the certs C code generation appends to the generated files, which is most likely a leftover from commit 715a12334764 ("wireless: don't write C files on failures"). This causes duplicate code in the generated files if the certificates have their timestamps modified between builds and thereby trigger the generation rules. Fixes: 715a12334764 ("wireless: don't write C files on failures") Signed-off-by: Thierry Reding Signed-off-by: Johannes Berg --- net/wireless/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/Makefile b/net/wireless/Makefile index d7d6cb00c47b..b662be3422e1 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -43,7 +43,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) echo "$$allf"; \ echo '};'; \ echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ - ) >> $@) + ) > $@) $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509) @@ -66,4 +66,4 @@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ echo "$$allf"; \ echo '};'; \ echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);'; \ - ) >> $@) + ) > $@) -- cgit v1.2.3 From 04a7279ff12fc47b657f70731d401c0064f5838a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Dec 2017 09:26:17 +0100 Subject: cfg80211: ship certificates as hex files Not only does this remove the need for the hexdump code in most normal kernel builds (still there for the extra directory), but it also removes the need to ship binary files, which apparently is somewhat problematic, as Randy reported. While at it, also add the generated files to clean-files. Reported-by: Randy Dunlap Signed-off-by: Johannes Berg --- net/wireless/Makefile | 29 ++++--------- net/wireless/certs/sforshee.hex | 86 +++++++++++++++++++++++++++++++++++++++ net/wireless/certs/sforshee.x509 | Bin 680 -> 0 bytes 3 files changed, 95 insertions(+), 20 deletions(-) create mode 100644 net/wireless/certs/sforshee.hex delete mode 100644 net/wireless/certs/sforshee.x509 (limited to 'net') diff --git a/net/wireless/Makefile b/net/wireless/Makefile index b662be3422e1..1d84f91bbfb0 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -23,27 +23,14 @@ ifneq ($(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR),) cfg80211-y += extra-certs.o endif -$(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) +$(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.hex) @$(kecho) " GEN $@" - @(set -e; \ - allf=""; \ - for f in $^ ; do \ - # similar to hexdump -v -e '1/1 "0x%.2x," "\n"' \ - thisf=$$(od -An -v -tx1 < $$f | \ - sed -e 's/ /\n/g' | \ - sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | \ - sed -e 's/^/0x/;s/$$/,/'); \ - # file should not be empty - maybe command substitution failed? \ - test ! -z "$$thisf";\ - allf=$$allf$$thisf;\ - done; \ - ( \ - echo '#include "reg.h"'; \ - echo 'const u8 shipped_regdb_certs[] = {'; \ - echo "$$allf"; \ - echo '};'; \ - echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ - ) > $@) + @(echo '#include "reg.h"'; \ + echo 'const u8 shipped_regdb_certs[] = {'; \ + cat $^ ; \ + echo '};'; \ + echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ + ) > $@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509) @@ -67,3 +54,5 @@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ echo '};'; \ echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);'; \ ) > $@) + +clean-files += shipped-certs.c extra-certs.c diff --git a/net/wireless/certs/sforshee.hex b/net/wireless/certs/sforshee.hex new file mode 100644 index 000000000000..14ea66643ffa --- /dev/null +++ b/net/wireless/certs/sforshee.hex @@ -0,0 +1,86 @@ +/* Seth Forshee's regdb certificate */ +0x30, 0x82, 0x02, 0xa4, 0x30, 0x82, 0x01, 0x8c, +0x02, 0x09, 0x00, 0xb2, 0x8d, 0xdf, 0x47, 0xae, +0xf9, 0xce, 0xa7, 0x30, 0x0d, 0x06, 0x09, 0x2a, +0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x0b, +0x05, 0x00, 0x30, 0x13, 0x31, 0x11, 0x30, 0x0f, +0x06, 0x03, 0x55, 0x04, 0x03, 0x0c, 0x08, 0x73, +0x66, 0x6f, 0x72, 0x73, 0x68, 0x65, 0x65, 0x30, +0x20, 0x17, 0x0d, 0x31, 0x37, 0x31, 0x30, 0x30, +0x36, 0x31, 0x39, 0x34, 0x30, 0x33, 0x35, 0x5a, +0x18, 0x0f, 0x32, 0x31, 0x31, 0x37, 0x30, 0x39, +0x31, 0x32, 0x31, 0x39, 0x34, 0x30, 0x33, 0x35, +0x5a, 0x30, 0x13, 0x31, 0x11, 0x30, 0x0f, 0x06, +0x03, 0x55, 0x04, 0x03, 0x0c, 0x08, 0x73, 0x66, +0x6f, 0x72, 0x73, 0x68, 0x65, 0x65, 0x30, 0x82, +0x01, 0x22, 0x30, 0x0d, 0x06, 0x09, 0x2a, 0x86, +0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x01, 0x05, +0x00, 0x03, 0x82, 0x01, 0x0f, 0x00, 0x30, 0x82, +0x01, 0x0a, 0x02, 0x82, 0x01, 0x01, 0x00, 0xb5, +0x40, 0xe3, 0x9c, 0x28, 0x84, 0x39, 0x03, 0xf2, +0x39, 0xd7, 0x66, 0x2c, 0x41, 0x38, 0x15, 0xac, +0x7e, 0xa5, 0x83, 0x71, 0x25, 0x7e, 0x90, 0x7c, +0x68, 0xdd, 0x6f, 0x3f, 0xd9, 0xd7, 0x59, 0x38, +0x9f, 0x7c, 0x6a, 0x52, 0xc2, 0x03, 0x2a, 0x2d, +0x7e, 0x66, 0xf4, 0x1e, 0xb3, 0x12, 0x70, 0x20, +0x5b, 0xd4, 0x97, 0x32, 0x3d, 0x71, 0x8b, 0x3b, +0x1b, 0x08, 0x17, 0x14, 0x6b, 0x61, 0xc4, 0x57, +0x8b, 0x96, 0x16, 0x1c, 0xfd, 0x24, 0xd5, 0x0b, +0x09, 0xf9, 0x68, 0x11, 0x84, 0xfb, 0xca, 0x51, +0x0c, 0xd1, 0x45, 0x19, 0xda, 0x10, 0x44, 0x8a, +0xd9, 0xfe, 0x76, 0xa9, 0xfd, 0x60, 0x2d, 0x18, +0x0b, 0x28, 0x95, 0xb2, 0x2d, 0xea, 0x88, 0x98, +0xb8, 0xd1, 0x56, 0x21, 0xf0, 0x53, 0x1f, 0xf1, +0x02, 0x6f, 0xe9, 0x46, 0x9b, 0x93, 0x5f, 0x28, +0x90, 0x0f, 0xac, 0x36, 0xfa, 0x68, 0x23, 0x71, +0x57, 0x56, 0xf6, 0xcc, 0xd3, 0xdf, 0x7d, 0x2a, +0xd9, 0x1b, 0x73, 0x45, 0xeb, 0xba, 0x27, 0x85, +0xef, 0x7a, 0x7f, 0xa5, 0xcb, 0x80, 0xc7, 0x30, +0x36, 0xd2, 0x53, 0xee, 0xec, 0xac, 0x1e, 0xe7, +0x31, 0xf1, 0x36, 0xa2, 0x9c, 0x63, 0xc6, 0x65, +0x5b, 0x7f, 0x25, 0x75, 0x68, 0xa1, 0xea, 0xd3, +0x7e, 0x00, 0x5c, 0x9a, 0x5e, 0xd8, 0x20, 0x18, +0x32, 0x77, 0x07, 0x29, 0x12, 0x66, 0x1e, 0x36, +0x73, 0xe7, 0x97, 0x04, 0x41, 0x37, 0xb1, 0xb1, +0x72, 0x2b, 0xf4, 0xa1, 0x29, 0x20, 0x7c, 0x96, +0x79, 0x0b, 0x2b, 0xd0, 0xd8, 0xde, 0xc8, 0x6c, +0x3f, 0x93, 0xfb, 0xc5, 0xee, 0x78, 0x52, 0x11, +0x15, 0x1b, 0x7a, 0xf6, 0xe2, 0x68, 0x99, 0xe7, +0xfb, 0x46, 0x16, 0x84, 0xe3, 0xc7, 0xa1, 0xe6, +0xe0, 0xd2, 0x46, 0xd5, 0xe1, 0xc4, 0x5f, 0xa0, +0x66, 0xf4, 0xda, 0xc4, 0xff, 0x95, 0x1d, 0x02, +0x03, 0x01, 0x00, 0x01, 0x30, 0x0d, 0x06, 0x09, +0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, +0x0b, 0x05, 0x00, 0x03, 0x82, 0x01, 0x01, 0x00, +0x87, 0x03, 0xda, 0xf2, 0x82, 0xc2, 0xdd, 0xaf, +0x7c, 0x44, 0x2f, 0x86, 0xd3, 0x5f, 0x4c, 0x93, +0x48, 0xb9, 0xfe, 0x07, 0x17, 0xbb, 0x21, 0xf7, +0x25, 0x23, 0x4e, 0xaa, 0x22, 0x0c, 0x16, 0xb9, +0x73, 0xae, 0x9d, 0x46, 0x7c, 0x75, 0xd9, 0xc3, +0x49, 0x57, 0x47, 0xbf, 0x33, 0xb7, 0x97, 0xec, +0xf5, 0x40, 0x75, 0xc0, 0x46, 0x22, 0xf0, 0xa0, +0x5d, 0x9c, 0x79, 0x13, 0xa1, 0xff, 0xb8, 0xa3, +0x2f, 0x7b, 0x8e, 0x06, 0x3f, 0xc8, 0xb6, 0xe4, +0x6a, 0x28, 0xf2, 0x34, 0x5c, 0x23, 0x3f, 0x32, +0xc0, 0xe6, 0xad, 0x0f, 0xac, 0xcf, 0x55, 0x74, +0x47, 0x73, 0xd3, 0x01, 0x85, 0xb7, 0x0b, 0x22, +0x56, 0x24, 0x7d, 0x9f, 0x09, 0xa9, 0x0e, 0x86, +0x9e, 0x37, 0x5b, 0x9c, 0x6d, 0x02, 0xd9, 0x8c, +0xc8, 0x50, 0x6a, 0xe2, 0x59, 0xf3, 0x16, 0x06, +0xea, 0xb2, 0x42, 0xb5, 0x58, 0xfe, 0xba, 0xd1, +0x81, 0x57, 0x1a, 0xef, 0xb2, 0x38, 0x88, 0x58, +0xf6, 0xaa, 0xc4, 0x2e, 0x8b, 0x5a, 0x27, 0xe4, +0xa5, 0xe8, 0xa4, 0xca, 0x67, 0x5c, 0xac, 0x72, +0x67, 0xc3, 0x6f, 0x13, 0xc3, 0x2d, 0x35, 0x79, +0xd7, 0x8a, 0xe7, 0xf5, 0xd4, 0x21, 0x30, 0x4a, +0xd5, 0xf6, 0xa3, 0xd9, 0x79, 0x56, 0xf2, 0x0f, +0x10, 0xf7, 0x7d, 0xd0, 0x51, 0x93, 0x2f, 0x47, +0xf8, 0x7d, 0x4b, 0x0a, 0x84, 0x55, 0x12, 0x0a, +0x7d, 0x4e, 0x3b, 0x1f, 0x2b, 0x2f, 0xfc, 0x28, +0xb3, 0x69, 0x34, 0xe1, 0x80, 0x80, 0xbb, 0xe2, +0xaf, 0xb9, 0xd6, 0x30, 0xf1, 0x1d, 0x54, 0x87, +0x23, 0x99, 0x9f, 0x51, 0x03, 0x4c, 0x45, 0x7d, +0x02, 0x65, 0x73, 0xab, 0xfd, 0xcf, 0x94, 0xcc, +0x0d, 0x3a, 0x60, 0xfd, 0x3c, 0x14, 0x2f, 0x16, +0x33, 0xa9, 0x21, 0x1f, 0xcb, 0x50, 0xb1, 0x8f, +0x03, 0xee, 0xa0, 0x66, 0xa9, 0x16, 0x79, 0x14, diff --git a/net/wireless/certs/sforshee.x509 b/net/wireless/certs/sforshee.x509 deleted file mode 100644 index c6f8f9d6b988..000000000000 Binary files a/net/wireless/certs/sforshee.x509 and /dev/null differ -- cgit v1.2.3 From cfddd4c33c254954927942599d299b3865743146 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 18 Dec 2017 14:24:35 +0800 Subject: ip_gre: remove the incorrect mtu limit for ipgre tap ipgre tap driver calls ether_setup(), after commit 61e84623ace3 ("net: centralize net_device min/max MTU checking"), the range of mtu is [min_mtu, max_mtu], which is [68, 1500] by default. It causes the dev mtu of the ipgre tap device to not be greater than 1500, this limit value is not correct for ipgre tap device. Besides, it's .change_mtu already does the right check. So this patch is just to set max_mtu as 0, and leave the check to it's .change_mtu. Fixes: 61e84623ace3 ("net: centralize net_device min/max MTU checking") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9c1735632c8c..45ffd3d045d2 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1310,6 +1310,7 @@ static const struct net_device_ops erspan_netdev_ops = { static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &gre_tap_netdev_ops; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; -- cgit v1.2.3 From 2c52129a7d74d017320804c6928de770815c5f4a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 18 Dec 2017 14:25:09 +0800 Subject: ip6_gre: remove the incorrect mtu limit for ipgre tap The same fix as the patch "ip_gre: remove the incorrect mtu limit for ipgre tap" is also needed for ip6_gre. Fixes: 61e84623ace3 ("net: centralize net_device min/max MTU checking") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4cfd8e0696fe..416c8913f132 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1308,6 +1308,7 @@ static void ip6gre_tap_setup(struct net_device *dev) ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &ip6gre_tap_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; -- cgit v1.2.3 From c9fefa08190fc879fb2e681035d7774e0a8c5170 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 18 Dec 2017 14:26:21 +0800 Subject: ip6_tunnel: get the min mtu properly in ip6_tnl_xmit Now it's using IPV6_MIN_MTU as the min mtu in ip6_tnl_xmit, but IPV6_MIN_MTU actually only works when the inner packet is ipv6. With IPV6_MIN_MTU for ipv4 packets, the new pmtu for inner dst couldn't be set less than 1280. It would cause tx_err and the packet to be dropped when the outer dst pmtu is close to 1280. Jianlin found it by running ipv4 traffic with the topo: (client) gre6 <---> eth1 (route) eth2 <---> gre6 (server) After changing eth2 mtu to 1300, the performance became very low, or the connection was even broken. The issue also affects ip4ip6 and ip6ip6 tunnels. So if the inner packet is ipv4, 576 should be considered as the min mtu. Note that for ip4ip6 and ip6ip6 tunnels, the inner packet can only be ipv4 or ipv6, but for gre6 tunnel, it may also be ARP. This patch using 576 as the min mtu for non-ipv6 packet works for all those cases. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index db84f523656d..931c38f6ff4a 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1123,8 +1123,13 @@ route_lookup: max_headroom += 8; mtu -= 8; } - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; + if (skb->protocol == htons(ETH_P_IPV6)) { + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + } else if (mtu < 576) { + mtu = 576; + } + if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { -- cgit v1.2.3 From 3db096011722fd8717e57687ae94b6917a11c9cc Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 18 Dec 2017 20:03:05 +0100 Subject: tipc: fix list sorting bug in function tipc_group_update_member() When, during a join operation, or during message transmission, a group member needs to be added to the group's 'congested' list, we sort it into the list in ascending order, according to its current advertised window size. However, we miss the case when the member is already on that list. This will have the result that the member, after the window size has been decremented, might be at the wrong position in that list. This again may have the effect that we during broadcast and multicast transmissions miss the fact that a destination is not yet ready for reception, and we end up sending anyway. From this point on, the behavior during the remaining session is unpredictable, e.g., with underflowing window sizes. We now correct this bug by unconditionally removing the member from the list before (re-)sorting it in. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index b96ec429bb9b..bbc004eaa31a 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -351,8 +351,7 @@ void tipc_group_update_member(struct tipc_member *m, int len) if (m->window >= ADV_IDLE) return; - if (!list_empty(&m->congested)) - return; + list_del_init(&m->congested); /* Sort member into congested members' list */ list_for_each_entry_safe(_m, tmp, &grp->congested, congested) { -- cgit v1.2.3 From d03a45572efa068fa64db211d6d45222660e76c5 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 19 Dec 2017 15:17:13 +0100 Subject: ipv4: fib: Fix metrics match when deleting a route The recently added fib_metrics_match() causes a regression for routes with both RTAX_FEATURES and RTAX_CC_ALGO if the latter has TCP_CONG_NEEDS_ECN flag set: | # ip link add d0 type dummy | # ip link set d0 up | # ip route add 172.29.29.0/24 dev d0 features ecn congctl dctcp | # ip route del 172.29.29.0/24 dev d0 features ecn congctl dctcp | RTNETLINK answers: No such process During route insertion, fib_convert_metrics() detects that the given CC algo requires ECN and hence sets DST_FEATURE_ECN_CA bit in RTAX_FEATURES. During route deletion though, fib_metrics_match() compares stored RTAX_FEATURES value with that from userspace (which obviously has no knowledge about DST_FEATURE_ECN_CA) and fails. Fixes: 5f9ae3d9e7e4a ("ipv4: do metrics match when looking up and deleting a route") Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f04d944f8abe..c586597da20d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -698,7 +698,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { int type = nla_type(nla); - u32 val; + u32 fi_val, val; if (!type) continue; @@ -715,7 +715,11 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) val = nla_get_u32(nla); } - if (fi->fib_metrics->metrics[type - 1] != val) + fi_val = fi->fib_metrics->metrics[type - 1]; + if (type == RTAX_FEATURES) + fi_val &= ~DST_FEATURE_ECN_CA; + + if (fi_val != val) return false; } -- cgit v1.2.3 From 21b5944350052d2583e82dd59b19a9ba94a007f0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 19 Dec 2017 11:27:56 -0600 Subject: net: Fix double free and memory corruption in get_net_ns_by_id() (I can trivially verify that that idr_remove in cleanup_net happens after the network namespace count has dropped to zero --EWB) Function get_net_ns_by_id() does not check for net::count after it has found a peer in netns_ids idr. It may dereference a peer, after its count has already been finaly decremented. This leads to double free and memory corruption: put_net(peer) rtnl_lock() atomic_dec_and_test(&peer->count) [count=0] ... __put_net(peer) get_net_ns_by_id(net, id) spin_lock(&cleanup_list_lock) list_add(&net->cleanup_list, &cleanup_list) spin_unlock(&cleanup_list_lock) queue_work() peer = idr_find(&net->netns_ids, id) | get_net(peer) [count=1] | ... | (use after final put) v ... cleanup_net() ... spin_lock(&cleanup_list_lock) ... list_replace_init(&cleanup_list, ..) ... spin_unlock(&cleanup_list_lock) ... ... ... ... put_net(peer) ... atomic_dec_and_test(&peer->count) [count=0] ... spin_lock(&cleanup_list_lock) ... list_add(&net->cleanup_list, &cleanup_list) ... spin_unlock(&cleanup_list_lock) ... queue_work() ... rtnl_unlock() rtnl_lock() ... for_each_net(tmp) { ... id = __peernet2id(tmp, peer) ... spin_lock_irq(&tmp->nsid_lock) ... idr_remove(&tmp->netns_ids, id) ... ... ... net_drop_ns() ... net_free(peer) ... } ... | v cleanup_net() ... (Second free of peer) Also, put_net() on the right cpu may reorder with left's cpu list_replace_init(&cleanup_list, ..), and then cleanup_list will be corrupted. Since cleanup_net() is executed in worker thread, while put_net(peer) can happen everywhere, there should be enough time for concurrent get_net_ns_by_id() to pick the peer up, and the race does not seem to be unlikely. The patch fixes the problem in standard way. (Also, there is possible problem in peernet2id_alloc(), which requires check for net::count under nsid_lock and maybe_get_net(peer), but in current stable kernel it's used under rtnl_lock() and it has to be safe. Openswitch begun to use peernet2id_alloc(), and possibly it should be fixed too. While this is not in stable kernel yet, so I'll send a separate message to netdev@ later). Cc: Nicolas Dichtel Signed-off-by: Kirill Tkhai Fixes: 0c7aecd4bde4 "netns: add rtnl cmd to add and get peer netns ids" Reviewed-by: Andrey Ryabinin Reviewed-by: "Eric W. Biederman" Signed-off-by: Eric W. Biederman Reviewed-by: Eric Dumazet Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b797832565d3..60a71be75aea 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -267,7 +267,7 @@ struct net *get_net_ns_by_id(struct net *net, int id) spin_lock_bh(&net->nsid_lock); peer = idr_find(&net->netns_ids, id); if (peer) - get_net(peer); + peer = maybe_get_net(peer); spin_unlock_bh(&net->nsid_lock); rcu_read_unlock(); -- cgit v1.2.3 From 102740bd9436a3a6ba129af3a48271d794009fa5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 19 Dec 2017 13:32:13 -0800 Subject: cls_bpf: fix offload assumptions after callback conversion cls_bpf used to take care of tracking what offload state a filter is in, i.e. it would track if offload request succeeded or not. This information would then be used to issue correct requests to the driver, e.g. requests for statistics only on offloaded filters, removing only filters which were offloaded, using add instead of replace if previous filter was not added etc. This tracking of offload state no longer functions with the new callback infrastructure. There could be multiple entities trying to offload the same filter. Throw out all the tracking and corresponding commands and simply pass to the drivers both old and new bpf program. Drivers will have to deal with offload state tracking by themselves. Fixes: 3f7889c4c79b ("net: sched: cls_bpf: call block callbacks for offload") Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 12 +--- include/net/pkt_cls.h | 5 +- net/sched/cls_bpf.c | 93 +++++++++++---------------- 3 files changed, 43 insertions(+), 67 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index e379b78e86ef..a4cf62ba4604 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -110,16 +110,10 @@ static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type, return -EOPNOTSUPP; } - switch (cls_bpf->command) { - case TC_CLSBPF_REPLACE: - return nfp_net_bpf_offload(nn, cls_bpf->prog, true); - case TC_CLSBPF_ADD: - return nfp_net_bpf_offload(nn, cls_bpf->prog, false); - case TC_CLSBPF_DESTROY: - return nfp_net_bpf_offload(nn, NULL, true); - default: + if (cls_bpf->command != TC_CLSBPF_OFFLOAD) return -EOPNOTSUPP; - } + + return nfp_net_bpf_offload(nn, cls_bpf->prog, cls_bpf->oldprog); } static int nfp_bpf_setup_tc_block(struct net_device *netdev, diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 0105445cab83..8e08b6da72f3 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -694,9 +694,7 @@ struct tc_cls_matchall_offload { }; enum tc_clsbpf_command { - TC_CLSBPF_ADD, - TC_CLSBPF_REPLACE, - TC_CLSBPF_DESTROY, + TC_CLSBPF_OFFLOAD, TC_CLSBPF_STATS, }; @@ -705,6 +703,7 @@ struct tc_cls_bpf_offload { enum tc_clsbpf_command command; struct tcf_exts *exts; struct bpf_prog *prog; + struct bpf_prog *oldprog; const char *name; bool exts_integrated; u32 gen_flags; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 6fe798c2df1a..8d78e7f4ecc3 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -42,7 +42,6 @@ struct cls_bpf_prog { struct list_head link; struct tcf_result res; bool exts_integrated; - bool offloaded; u32 gen_flags; struct tcf_exts exts; u32 handle; @@ -148,33 +147,37 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) } static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, - enum tc_clsbpf_command cmd) + struct cls_bpf_prog *oldprog) { - bool addorrep = cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE; struct tcf_block *block = tp->chain->block; - bool skip_sw = tc_skip_sw(prog->gen_flags); struct tc_cls_bpf_offload cls_bpf = {}; + struct cls_bpf_prog *obj; + bool skip_sw; int err; + skip_sw = prog && tc_skip_sw(prog->gen_flags); + obj = prog ?: oldprog; + tc_cls_common_offload_init(&cls_bpf.common, tp); - cls_bpf.command = cmd; - cls_bpf.exts = &prog->exts; - cls_bpf.prog = prog->filter; - cls_bpf.name = prog->bpf_name; - cls_bpf.exts_integrated = prog->exts_integrated; - cls_bpf.gen_flags = prog->gen_flags; + cls_bpf.command = TC_CLSBPF_OFFLOAD; + cls_bpf.exts = &obj->exts; + cls_bpf.prog = prog ? prog->filter : NULL; + cls_bpf.oldprog = oldprog ? oldprog->filter : NULL; + cls_bpf.name = obj->bpf_name; + cls_bpf.exts_integrated = obj->exts_integrated; + cls_bpf.gen_flags = obj->gen_flags; err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw); - if (addorrep) { + if (prog) { if (err < 0) { - cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY); + cls_bpf_offload_cmd(tp, oldprog, prog); return err; } else if (err > 0) { prog->gen_flags |= TCA_CLS_FLAGS_IN_HW; } } - if (addorrep && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW)) + if (prog && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW)) return -EINVAL; return 0; @@ -183,38 +186,17 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct cls_bpf_prog *oldprog) { - struct cls_bpf_prog *obj = prog; - enum tc_clsbpf_command cmd; - bool skip_sw; - int ret; - - skip_sw = tc_skip_sw(prog->gen_flags) || - (oldprog && tc_skip_sw(oldprog->gen_flags)); - - if (oldprog && oldprog->offloaded) { - if (!tc_skip_hw(prog->gen_flags)) { - cmd = TC_CLSBPF_REPLACE; - } else if (!tc_skip_sw(prog->gen_flags)) { - obj = oldprog; - cmd = TC_CLSBPF_DESTROY; - } else { - return -EINVAL; - } - } else { - if (tc_skip_hw(prog->gen_flags)) - return skip_sw ? -EINVAL : 0; - cmd = TC_CLSBPF_ADD; - } - - ret = cls_bpf_offload_cmd(tp, obj, cmd); - if (ret) - return ret; + if (prog && oldprog && prog->gen_flags != oldprog->gen_flags) + return -EINVAL; - obj->offloaded = true; - if (oldprog) - oldprog->offloaded = false; + if (prog && tc_skip_hw(prog->gen_flags)) + prog = NULL; + if (oldprog && tc_skip_hw(oldprog->gen_flags)) + oldprog = NULL; + if (!prog && !oldprog) + return 0; - return 0; + return cls_bpf_offload_cmd(tp, prog, oldprog); } static void cls_bpf_stop_offload(struct tcf_proto *tp, @@ -222,25 +204,26 @@ static void cls_bpf_stop_offload(struct tcf_proto *tp, { int err; - if (!prog->offloaded) - return; - - err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY); - if (err) { + err = cls_bpf_offload_cmd(tp, NULL, prog); + if (err) pr_err("Stopping hardware offload failed: %d\n", err); - return; - } - - prog->offloaded = false; } static void cls_bpf_offload_update_stats(struct tcf_proto *tp, struct cls_bpf_prog *prog) { - if (!prog->offloaded) - return; + struct tcf_block *block = tp->chain->block; + struct tc_cls_bpf_offload cls_bpf = {}; + + tc_cls_common_offload_init(&cls_bpf.common, tp); + cls_bpf.command = TC_CLSBPF_STATS; + cls_bpf.exts = &prog->exts; + cls_bpf.prog = prog->filter; + cls_bpf.name = prog->bpf_name; + cls_bpf.exts_integrated = prog->exts_integrated; + cls_bpf.gen_flags = prog->gen_flags; - cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS); + tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, false); } static int cls_bpf_init(struct tcf_proto *tp) -- cgit v1.2.3 From bb25c3855a12cc58e33cd7ee9b69943790fe35f7 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 20 Dec 2017 11:03:15 +0100 Subject: tipc: remove joining group member from congested list When we receive a JOIN message from a peer member, the message may contain an advertised window value ADV_IDLE that permits removing the member in question from the tipc_group::congested list. However, since the removal has been made conditional on that the advertised window is *not* ADV_IDLE, we miss this case. This has the effect that a sender sometimes may enter a state of permanent, false, broadcast congestion. We fix this by unconditinally removing the member from the congested list before calling tipc_member_update(), which might potentially sort it into the list again. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index bbc004eaa31a..7ebbdeb2a90e 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -689,10 +689,8 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, msg_set_grp_bc_seqno(ehdr, m->bc_syncpt); __skb_queue_tail(inputq, m->event_msg); } - if (m->window < ADV_IDLE) - tipc_group_update_member(m, 0); - else - list_del_init(&m->congested); + list_del_init(&m->congested); + tipc_group_update_member(m, 0); return; case GRP_LEAVE_MSG: if (!m) -- cgit v1.2.3 From b4681c2829e24943aadd1a7bb3a30d41d0a20050 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 20 Dec 2017 19:34:19 +0200 Subject: ipv4: Fix use-after-free when flushing FIB tables Since commit 0ddcf43d5d4a ("ipv4: FIB Local/MAIN table collapse") the local table uses the same trie allocated for the main table when custom rules are not in use. When a net namespace is dismantled, the main table is flushed and freed (via an RCU callback) before the local table. In case the callback is invoked before the local table is iterated, a use-after-free can occur. Fix this by iterating over the FIB tables in reverse order, so that the main table is always freed after the local table. v3: Reworded comment according to Alex's suggestion. v2: Add a comment to make the fix more explicit per Dave's and Alex's feedback. Fixes: 0ddcf43d5d4a ("ipv4: FIB Local/MAIN table collapse") Signed-off-by: Ido Schimmel Reported-by: Fengguang Wu Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f52d27a422c3..08259d078b1c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1298,14 +1298,19 @@ err_table_hash_alloc: static void ip_fib_net_exit(struct net *net) { - unsigned int i; + int i; rtnl_lock(); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif - for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + /* Destroy the tables in reverse order to guarantee that the + * local table, ID 255, is destroyed before the main table, ID + * 254. This is necessary as the local table may contain + * references to data contained in the main table. + */ + for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) { struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; struct fib_table *tb; -- cgit v1.2.3 From 24c0df82ef7919e4d10cf2e4e65d368eb2e8ea21 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Dec 2017 12:01:21 +0100 Subject: netfilter: nf_tables: fix chain filter in nf_tables_dump_rules() ctx->chain may be null now that we have very large object names, so we cannot check for ctx->chain[0] here. Fixes: b7263e071aba7 ("netfilter: nf_tables: Allow table names of up to 255 chars") Signed-off-by: Pablo Neira Ayuso Acked-by: Phil Sutter --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 10798b357481..8d4526651661 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2072,7 +2072,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, continue; list_for_each_entry_rcu(chain, &table->chains, list) { - if (ctx && ctx->chain[0] && + if (ctx && ctx->chain && strcmp(ctx->chain, chain->name) != 0) continue; -- cgit v1.2.3 From 58acfd714e6b02e8617448b431c2b64a2f1f0792 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 20 Dec 2017 12:28:25 +0200 Subject: ipv6: Honor specified parameters in fibmatch lookup Currently, parameters such as oif and source address are not taken into account during fibmatch lookup. Example (IPv4 for reference) before patch: $ ip -4 route show 192.0.2.0/24 dev dummy0 proto kernel scope link src 192.0.2.1 198.51.100.0/24 dev dummy1 proto kernel scope link src 198.51.100.1 $ ip -6 route show 2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium 2001:db8:2::/64 dev dummy1 proto kernel metric 256 pref medium fe80::/64 dev dummy0 proto kernel metric 256 pref medium fe80::/64 dev dummy1 proto kernel metric 256 pref medium $ ip -4 route get fibmatch 192.0.2.2 oif dummy0 192.0.2.0/24 dev dummy0 proto kernel scope link src 192.0.2.1 $ ip -4 route get fibmatch 192.0.2.2 oif dummy1 RTNETLINK answers: No route to host $ ip -6 route get fibmatch 2001:db8:1::2 oif dummy0 2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium $ ip -6 route get fibmatch 2001:db8:1::2 oif dummy1 2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium After: $ ip -6 route get fibmatch 2001:db8:1::2 oif dummy0 2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium $ ip -6 route get fibmatch 2001:db8:1::2 oif dummy1 RTNETLINK answers: Network is unreachable The problem stems from the fact that the necessary route lookup flags are not set based on these parameters. Instead of duplicating the same logic for fibmatch, we can simply resolve the original route from its copy and dump it instead. Fixes: 18c3a61c4264 ("net: ipv6: RTM_GETROUTE: return matched fib result when requested") Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2bc91c349273..0458b761f3c5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4298,19 +4298,13 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!ipv6_addr_any(&fl6.saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; - if (!fibmatch) - dst = ip6_route_input_lookup(net, dev, &fl6, flags); - else - dst = ip6_route_lookup(net, &fl6, 0); + dst = ip6_route_input_lookup(net, dev, &fl6, flags); rcu_read_unlock(); } else { fl6.flowi6_oif = oif; - if (!fibmatch) - dst = ip6_route_output(net, NULL, &fl6); - else - dst = ip6_route_lookup(net, &fl6, 0); + dst = ip6_route_output(net, NULL, &fl6); } @@ -4327,6 +4321,15 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } + if (fibmatch && rt->dst.from) { + struct rt6_info *ort = container_of(rt->dst.from, + struct rt6_info, dst); + + dst_hold(&ort->dst); + ip6_rt_put(rt); + rt = ort; + } + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); -- cgit v1.2.3 From c48e74736fccf25fb32bb015426359e1c2016e3b Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Wed, 20 Dec 2017 15:09:22 -0500 Subject: openvswitch: Fix pop_vlan action for double tagged frames skb_vlan_pop() expects skb->protocol to be a valid TPID for double tagged frames. So set skb->protocol to the TPID and let skb_vlan_pop() shift the true ethertype into position for us. Fixes: 5108bbaddc37 ("openvswitch: add processing of L3 packets") Signed-off-by: Eric Garver Reviewed-by: Jiri Benc Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index dbe2379329c5..f039064ce922 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -579,6 +579,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return -EINVAL; skb_reset_network_header(skb); + key->eth.type = skb->protocol; } else { eth = eth_hdr(skb); ether_addr_copy(key->eth.src, eth->h_source); @@ -592,15 +593,23 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) if (unlikely(parse_vlan(skb, key))) return -ENOMEM; - skb->protocol = parse_ethertype(skb); - if (unlikely(skb->protocol == htons(0))) + key->eth.type = parse_ethertype(skb); + if (unlikely(key->eth.type == htons(0))) return -ENOMEM; + /* Multiple tagged packets need to retain TPID to satisfy + * skb_vlan_pop(), which will later shift the ethertype into + * skb->protocol. + */ + if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT)) + skb->protocol = key->eth.cvlan.tpid; + else + skb->protocol = key->eth.type; + skb_reset_network_header(skb); __skb_push(skb, skb->data - skb_mac_header(skb)); } skb_reset_mac_len(skb); - key->eth.type = skb->protocol; /* Network layer. */ if (key->eth.type == htons(ETH_P_IP)) { -- cgit v1.2.3 From 513674b5a2c9c7a67501506419da5c3c77ac6f08 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 20 Dec 2017 12:10:21 -0800 Subject: net: reevalulate autoflowlabel setting after sysctl setting sysctl.ip6.auto_flowlabels is default 1. In our hosts, we set it to 2. If sockopt doesn't set autoflowlabel, outcome packets from the hosts are supposed to not include flowlabel. This is true for normal packet, but not for reset packet. The reason is ipv6_pinfo.autoflowlabel is set in sock creation. Later if we change sysctl.ip6.auto_flowlabels, the ipv6_pinfo.autoflowlabel isn't changed, so the sock will keep the old behavior in terms of auto flowlabel. Reset packet is suffering from this problem, because reset packet is sent from a special control socket, which is created at boot time. Since sysctl.ipv6.auto_flowlabels is 1 by default, the control socket will always have its ipv6_pinfo.autoflowlabel set, even after user set sysctl.ipv6.auto_flowlabels to 1, so reset packset will always have flowlabel. Normal sock created before sysctl setting suffers from the same issue. We can't even turn off autoflowlabel unless we kill all socks in the hosts. To fix this, if IPV6_AUTOFLOWLABEL sockopt is used, we use the autoflowlabel setting from user, otherwise we always call ip6_default_np_autolabel() which has the new settings of sysctl. Note, this changes behavior a little bit. Before commit 42240901f7c4 (ipv6: Implement different admin modes for automatic flow labels), the autoflowlabel behavior of a sock isn't sticky, eg, if sysctl changes, existing connection will change autoflowlabel behavior. After that commit, autoflowlabel behavior is sticky in the whole life of the sock. With this patch, the behavior isn't sticky again. Cc: Martin KaFai Lau Cc: Eric Dumazet Cc: Tom Herbert Signed-off-by: Shaohua Li Signed-off-by: David S. Miller --- include/linux/ipv6.h | 3 ++- net/ipv6/af_inet6.c | 1 - net/ipv6/ip6_output.c | 12 ++++++++++-- net/ipv6/ipv6_sockglue.c | 1 + 4 files changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index cb18c6290ca8..8415bf1a9776 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -273,7 +273,8 @@ struct ipv6_pinfo { * 100: prefer care-of address */ dontfrag:1, - autoflowlabel:1; + autoflowlabel:1, + autoflowlabel_set:1; __u8 min_hopcount; __u8 tclass; __be32 rcv_flowinfo; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c26f71234b9c..c9441ca45399 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -210,7 +210,6 @@ lookup_protocol: np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->autoflowlabel = ip6_default_np_autolabel(net); np->repflow = net->ipv6.sysctl.flowlabel_reflect; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 5110a418cc4d..f7dd51c42314 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -166,6 +166,14 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } +static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) +{ + if (!np->autoflowlabel_set) + return ip6_default_np_autolabel(net); + else + return np->autoflowlabel; +} + /* * xmit an sk_buff (used by TCP, SCTP and DCCP) * Note : socket lock is not held for SYNACK packets, but might be modified @@ -230,7 +238,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hlimit = ip6_dst_hoplimit(dst); ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel, fl6)); + ip6_autoflowlabel(net, np), fl6)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -1626,7 +1634,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, ip6_flow_hdr(hdr, v6_cork->tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel, fl6)); + ip6_autoflowlabel(net, np), fl6)); hdr->hop_limit = v6_cork->hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index b9404feabd78..2d4680e0376f 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -886,6 +886,7 @@ pref_skip_coa: break; case IPV6_AUTOFLOWLABEL: np->autoflowlabel = valbool; + np->autoflowlabel_set = 1; retv = 0; break; case IPV6_RECVFRAGSIZE: -- cgit v1.2.3 From 268b790679422a89e9ab0685d9f291edae780c98 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 20 Dec 2017 17:37:49 -0500 Subject: skbuff: orphan frags before zerocopy clone Call skb_zerocopy_clone after skb_orphan_frags, to avoid duplicate calls to skb_uarg(skb)->callback for the same data. skb_zerocopy_clone associates skb_shinfo(skb)->uarg from frag_skb with each segment. This is only safe for uargs that do refcounting, which is those that pass skb_orphan_frags without dropping their shared frags. For others, skb_orphan_frags drops the user frags and sets the uarg to NULL, after which sock_zerocopy_clone has no effect. Qemu hangs were reported due to duplicate vhost_net_zerocopy_callback calls for the same data causing the vhost_net_ubuf_ref_>refcount to drop below zero. Link: http://lkml.kernel.org/r/ Fixes: 1f8b977ab32d ("sock: enable MSG_ZEROCOPY") Reported-by: Andreas Hartmann Reported-by: David Hill Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a592ca025fc4..edf40ac0cd07 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3654,8 +3654,6 @@ normal: skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & SKBTX_SHARED_FRAG; - if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC)) - goto err; while (pos < offset + len) { if (i >= nfrags) { @@ -3681,6 +3679,8 @@ normal: if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) goto err; + if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) + goto err; *nskb_frag = *frag; __skb_frag_ref(nskb_frag); -- cgit v1.2.3 From b90ddd568792bcb0054eaf0f61785c8f80c3bd1c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 20 Dec 2017 17:37:50 -0500 Subject: skbuff: skb_copy_ubufs must release uarg even without user frags skb_copy_ubufs creates a private copy of frags[] to release its hold on user frags, then calls uarg->callback to notify the owner. Call uarg->callback even when no frags exist. This edge case can happen when zerocopy_sg_from_iter finds enough room in skb_headlen to copy all the data. Fixes: 3ece782693c4 ("sock: skb_copy_ubufs support for compound pages") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index edf40ac0cd07..a3cb0be4c6f3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1178,7 +1178,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) u32 d_off; if (!num_frags) - return 0; + goto release; if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) return -EINVAL; @@ -1238,6 +1238,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); skb_shinfo(skb)->nr_frags = new_frags; +release: skb_zcopy_clear(skb, false); return 0; } -- cgit v1.2.3 From 8bea728dce8972e534e6b99fd550f7b5cc3864e8 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 25 Dec 2017 11:34:54 +0800 Subject: netfilter: nf_tables: fix potential NULL-ptr deref in nf_tables_dump_obj_done() If there is no NFTA_OBJ_TABLE and NFTA_OBJ_TYPE, the c.data will be NULL in nf_tables_getobj(). So before free filter->table in nf_tables_dump_obj_done(), we need to check if filter is NULL first. Fixes: e46abbcc05aa ("netfilter: nf_tables: Allow table names of up to 255 chars") Signed-off-by: Hangbin Liu Acked-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8d4526651661..07bd4138c84e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4665,8 +4665,10 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) { struct nft_obj_filter *filter = cb->data; - kfree(filter->table); - kfree(filter); + if (filter) { + kfree(filter->table); + kfree(filter); + } return 0; } -- cgit v1.2.3 From e5a9336adb317db55eb3fe8200856096f3c71109 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Wed, 20 Dec 2017 19:36:03 +0300 Subject: ip6_gre: fix device features for ioctl setup When ip6gre is created using ioctl, its features, such as scatter-gather, GSO and tx-checksumming will be turned off: # ip -f inet6 tunnel add gre6 mode ip6gre remote fd00::1 # ethtool -k gre6 (truncated output) tx-checksumming: off scatter-gather: off tcp-segmentation-offload: off generic-segmentation-offload: off [requested on] But when netlink is used, they will be enabled: # ip link add gre6 type ip6gre remote fd00::1 # ethtool -k gre6 (truncated output) tx-checksumming: on scatter-gather: on tcp-segmentation-offload: on generic-segmentation-offload: on This results in a loss of performance when gre6 is created via ioctl. The issue was found with LTP/gre tests. Fix it by moving the setup of device features to a separate function and invoke it with ndo_init callback because both netlink and ioctl will eventually call it via register_netdevice(): register_netdevice() - ndo_init() callback -> ip6gre_tunnel_init() or ip6gre_tap_init() - ip6gre_tunnel_init_common() - ip6gre_tnl_init_features() The moved code also contains two minor style fixes: * removed needless tab from GRE6_FEATURES on NETIF_F_HIGHDMA line. * fixed the issue reported by checkpatch: "Unnecessary parentheses around 'nt->encap.type == TUNNEL_ENCAP_NONE'" Fixes: ac4eb009e477 ("ip6gre: Add support for basic offloads offloads excluding GSO") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 57 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 416c8913f132..772695960890 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1014,6 +1014,36 @@ static void ip6gre_tunnel_setup(struct net_device *dev) eth_random_addr(dev->perm_addr); } +#define GRE6_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_HW_CSUM) + +static void ip6gre_tnl_init_features(struct net_device *dev) +{ + struct ip6_tnl *nt = netdev_priv(dev); + + dev->features |= GRE6_FEATURES; + dev->hw_features |= GRE6_FEATURES; + + if (!(nt->parms.o_flags & TUNNEL_SEQ)) { + /* TCP offload with GRE SEQ is not supported, nor + * can we support 2 levels of outer headers requiring + * an update. + */ + if (!(nt->parms.o_flags & TUNNEL_CSUM) || + nt->encap.type == TUNNEL_ENCAP_NONE) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } + + /* Can use a lockless transmit, unless we generate + * output sequences + */ + dev->features |= NETIF_F_LLTX; + } +} + static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; @@ -1048,6 +1078,8 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; + ip6gre_tnl_init_features(dev); + return 0; } @@ -1298,11 +1330,6 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_get_iflink = ip6_tnl_get_iflink, }; -#define GRE6_FEATURES (NETIF_F_SG | \ - NETIF_F_FRAGLIST | \ - NETIF_F_HIGHDMA | \ - NETIF_F_HW_CSUM) - static void ip6gre_tap_setup(struct net_device *dev) { @@ -1383,26 +1410,6 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, nt->net = dev_net(dev); ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); - dev->features |= GRE6_FEATURES; - dev->hw_features |= GRE6_FEATURES; - - if (!(nt->parms.o_flags & TUNNEL_SEQ)) { - /* TCP offload with GRE SEQ is not supported, nor - * can we support 2 levels of outer headers requiring - * an update. - */ - if (!(nt->parms.o_flags & TUNNEL_CSUM) || - (nt->encap.type == TUNNEL_ENCAP_NONE)) { - dev->features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; - } - - /* Can use a lockless transmit, unless we generate - * output sequences - */ - dev->features |= NETIF_F_LLTX; - } - err = register_netdevice(dev); if (err) goto out; -- cgit v1.2.3 From b2fb01f426883a794ed80be9110675a2d8356347 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 20 Dec 2017 23:26:24 -0800 Subject: net_sched: fix a missing rcu barrier in mini_qdisc_pair_swap() The rcu_barrier_bh() in mini_qdisc_pair_swap() is to wait for flying RCU callback installed by a previous mini_qdisc_pair_swap(), however we miss it on the tp_head==NULL path, which leads to that the RCU callback still uses miniq_old->rcu after it is freed together with qdisc in qdisc_graft(). So just add it on that path too. Fixes: 46209401f8f6 ("net: core: introduce mini_Qdisc and eliminate usage of tp->q for clsact fastpath ") Reported-by: Jakub Kicinski Tested-by: Jakub Kicinski Cc: Jiri Pirko Cc: John Fastabend Signed-off-by: Cong Wang Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index cd1b200acae7..661c7144b53a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1040,6 +1040,8 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, if (!tp_head) { RCU_INIT_POINTER(*miniqp->p_miniq, NULL); + /* Wait for flying RCU callback before it is freed. */ + rcu_barrier_bh(); return; } @@ -1055,7 +1057,7 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, rcu_assign_pointer(*miniqp->p_miniq, miniq); if (miniq_old) - /* This is counterpart of the rcu barrier above. We need to + /* This is counterpart of the rcu barriers above. We need to * block potential new user of miniq_old until all readers * are not seeing it. */ -- cgit v1.2.3 From 0a3d805c9c503e05d6e5d3868c53e92a06589dcf Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 21 Dec 2017 13:07:11 +0100 Subject: tipc: base group replicast ack counter on number of actual receivers In commit 2f487712b893 ("tipc: guarantee that group broadcast doesn't bypass group unicast") we introduced a mechanism that requires the first (replicated) broadcast sent after a unicast to be acknowledged by all receivers before permitting sending of the next (true) broadcast. The counter for keeping track of the number of acknowledges to expect is based on the tipc_group::member_cnt variable. But this misses that some of the known members may not be ready for reception, and will never acknowledge the message, either because they haven't fully joined the group or because they are leaving the group. Such members are identified by not fulfilling the condition tested for in the function tipc_group_is_enabled(). We now set the counter for the actual number of acks to receive at the moment the message is sent, by just counting the number of recipients satisfying the tipc_group_is_enabled() test. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index 7ebbdeb2a90e..e5b03f08f076 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -368,18 +368,20 @@ void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack) u16 prev = grp->bc_snd_nxt - 1; struct tipc_member *m; struct rb_node *n; + u16 ackers = 0; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); if (tipc_group_is_enabled(m)) { tipc_group_update_member(m, len); m->bc_acked = prev; + ackers++; } } /* Mark number of acknowledges to expect, if any */ if (ack) - grp->bc_ackers = grp->member_cnt; + grp->bc_ackers = ackers; grp->bc_snd_nxt++; } -- cgit v1.2.3 From 4853f128c13ed2731625dff2410b7fdbe540fb26 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 21 Dec 2017 13:13:59 +0100 Subject: net: sched: fix possible null pointer deref in tcf_block_put We need to check block for being null in both tcf_block_put and tcf_block_put_ext. Fixes: 343723dd51ef ("net: sched: fix clsact init error path") Reported-by: Prashant Bhole Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index b91ea03e3afa..b9d63d2246e6 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -379,6 +379,8 @@ void tcf_block_put(struct tcf_block *block) { struct tcf_block_ext_info ei = {0, }; + if (!block) + return; tcf_block_put_ext(block, block->q, &ei); } -- cgit v1.2.3 From 3a33a19bf88cdfc6d982972bc6ffcf7a62c1015e Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 21 Dec 2017 14:36:34 +0100 Subject: tipc: fix memory leak of group member when peer node is lost When a group member receives a member WITHDRAW event, this might have two reasons: either the peer member is leaving the group, or the link to the member's node has been lost. In the latter case we need to issue a DOWN event to the user right away, and let function tipc_group_filter_msg() perform delete of the member item. However, in this case we miss to change the state of the member item to MBR_LEAVING, so the member item is not deleted, and we have a memory leak. We now separate better between the four sub-cases of a WITHRAW event and make sure that each case is handled correctly. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index e5b03f08f076..8e12ab55346b 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -850,17 +850,26 @@ void tipc_group_member_evt(struct tipc_group *grp, *usr_wakeup = true; m->usr_pending = false; node_up = tipc_node_is_up(net, node); - - /* Hold back event if more messages might be expected */ - if (m->state != MBR_LEAVING && node_up) { - m->event_msg = skb; - tipc_group_decr_active(grp, m); - m->state = MBR_LEAVING; - } else { - if (node_up) + m->event_msg = NULL; + + if (node_up) { + /* Hold back event if a LEAVE msg should be expected */ + if (m->state != MBR_LEAVING) { + m->event_msg = skb; + tipc_group_decr_active(grp, m); + m->state = MBR_LEAVING; + } else { msg_set_grp_bc_seqno(hdr, m->bc_syncpt); - else + __skb_queue_tail(inputq, skb); + } + } else { + if (m->state != MBR_LEAVING) { + tipc_group_decr_active(grp, m); + m->state = MBR_LEAVING; msg_set_grp_bc_seqno(hdr, m->bc_rcv_nxt); + } else { + msg_set_grp_bc_seqno(hdr, m->bc_syncpt); + } __skb_queue_tail(inputq, skb); } list_del_init(&m->list); -- cgit v1.2.3 From 14e138a86f6347c6199f610576d2e11c03bec5f0 Mon Sep 17 00:00:00 2001 From: Avinash Repaka Date: Thu, 21 Dec 2017 20:17:04 -0800 Subject: RDS: Check cmsg_len before dereferencing CMSG_DATA RDS currently doesn't check if the length of the control message is large enough to hold the required data, before dereferencing the control message data. This results in following crash: BUG: KASAN: stack-out-of-bounds in rds_rdma_bytes net/rds/send.c:1013 [inline] BUG: KASAN: stack-out-of-bounds in rds_sendmsg+0x1f02/0x1f90 net/rds/send.c:1066 Read of size 8 at addr ffff8801c928fb70 by task syzkaller455006/3157 CPU: 0 PID: 3157 Comm: syzkaller455006 Not tainted 4.15.0-rc3+ #161 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x25b/0x340 mm/kasan/report.c:409 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430 rds_rdma_bytes net/rds/send.c:1013 [inline] rds_sendmsg+0x1f02/0x1f90 net/rds/send.c:1066 sock_sendmsg_nosec net/socket.c:628 [inline] sock_sendmsg+0xca/0x110 net/socket.c:638 ___sys_sendmsg+0x320/0x8b0 net/socket.c:2018 __sys_sendmmsg+0x1ee/0x620 net/socket.c:2108 SYSC_sendmmsg net/socket.c:2139 [inline] SyS_sendmmsg+0x35/0x60 net/socket.c:2134 entry_SYSCALL_64_fastpath+0x1f/0x96 RIP: 0033:0x43fe49 RSP: 002b:00007fffbe244ad8 EFLAGS: 00000217 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fe49 RDX: 0000000000000001 RSI: 000000002020c000 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000217 R12: 00000000004017b0 R13: 0000000000401840 R14: 0000000000000000 R15: 0000000000000000 To fix this, we verify that the cmsg_len is large enough to hold the data to be read, before proceeding further. Reported-by: syzbot Signed-off-by: Avinash Repaka Acked-by: Santosh Shilimkar Reviewed-by: Yuval Shaia Signed-off-by: David S. Miller --- net/rds/send.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index b52cdc8ae428..f72466c63f0c 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1009,6 +1009,9 @@ static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes) continue; if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) { + if (cmsg->cmsg_len < + CMSG_LEN(sizeof(struct rds_rdma_args))) + return -EINVAL; args = CMSG_DATA(cmsg); *rdma_bytes += args->remote_vec.bytes; } -- cgit v1.2.3 From 19142551b2be4a9e13838099fde1351386e5e007 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Fri, 22 Dec 2017 09:35:16 +0200 Subject: tipc: error path leak fixes in tipc_enable_bearer() Fix memory leak in tipc_enable_bearer() if enable_media() fails, and cleanup with bearer_disable() if tipc_mon_create() fails. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tommi Rantala Signed-off-by: David S. Miller --- net/tipc/bearer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 47ec121574ce..c8001471da6c 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -324,6 +324,7 @@ restart: if (res) { pr_warn("Bearer <%s> rejected, enable failure (%d)\n", name, -res); + kfree(b); return -EINVAL; } @@ -347,8 +348,10 @@ restart: if (skb) tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr); - if (tipc_mon_create(net, bearer_id)) + if (tipc_mon_create(net, bearer_id)) { + bearer_disable(net, b); return -ENOMEM; + } pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n", name, -- cgit v1.2.3 From 642a8439ddd8423b92f2e71960afe21ee1f66bb6 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Fri, 22 Dec 2017 09:35:17 +0200 Subject: tipc: fix tipc_mon_delete() oops in tipc_enable_bearer() error path Calling tipc_mon_delete() before the monitor has been created will oops. This can happen in tipc_enable_bearer() error path if tipc_disc_create() fails. [ 48.589074] BUG: unable to handle kernel paging request at 0000000000001008 [ 48.590266] IP: tipc_mon_delete+0xea/0x270 [tipc] [ 48.591223] PGD 1e60c5067 P4D 1e60c5067 PUD 1eb0cf067 PMD 0 [ 48.592230] Oops: 0000 [#1] SMP KASAN [ 48.595610] CPU: 5 PID: 1199 Comm: tipc Tainted: G B 4.15.0-rc4-pc64-dirty #5 [ 48.597176] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014 [ 48.598489] RIP: 0010:tipc_mon_delete+0xea/0x270 [tipc] [ 48.599347] RSP: 0018:ffff8801d827f668 EFLAGS: 00010282 [ 48.600705] RAX: ffff8801ee813f00 RBX: 0000000000000204 RCX: 0000000000000000 [ 48.602183] RDX: 1ffffffff1de6a75 RSI: 0000000000000297 RDI: 0000000000000297 [ 48.604373] RBP: 0000000000000000 R08: 0000000000000000 R09: fffffbfff1dd1533 [ 48.605607] R10: ffffffff8eafbb05 R11: fffffbfff1dd1534 R12: 0000000000000050 [ 48.607082] R13: dead000000000200 R14: ffffffff8e73f310 R15: 0000000000001020 [ 48.608228] FS: 00007fc686484800(0000) GS:ffff8801f5540000(0000) knlGS:0000000000000000 [ 48.610189] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 48.611459] CR2: 0000000000001008 CR3: 00000001dda70002 CR4: 00000000003606e0 [ 48.612759] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 48.613831] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 48.615038] Call Trace: [ 48.615635] tipc_enable_bearer+0x415/0x5e0 [tipc] [ 48.620623] tipc_nl_bearer_enable+0x1ab/0x200 [tipc] [ 48.625118] genl_family_rcv_msg+0x36b/0x570 [ 48.631233] genl_rcv_msg+0x5a/0xa0 [ 48.631867] netlink_rcv_skb+0x1cc/0x220 [ 48.636373] genl_rcv+0x24/0x40 [ 48.637306] netlink_unicast+0x29c/0x350 [ 48.639664] netlink_sendmsg+0x439/0x590 [ 48.642014] SYSC_sendto+0x199/0x250 [ 48.649912] do_syscall_64+0xfd/0x2c0 [ 48.650651] entry_SYSCALL64_slow_path+0x25/0x25 [ 48.651843] RIP: 0033:0x7fc6859848e3 [ 48.652539] RSP: 002b:00007ffd25dff938 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 48.654003] RAX: ffffffffffffffda RBX: 00007ffd25dff990 RCX: 00007fc6859848e3 [ 48.655303] RDX: 0000000000000054 RSI: 00007ffd25dff990 RDI: 0000000000000003 [ 48.656512] RBP: 00007ffd25dff980 R08: 00007fc685c35fc0 R09: 000000000000000c [ 48.657697] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000d13010 [ 48.658840] R13: 00007ffd25e009c0 R14: 0000000000000000 R15: 0000000000000000 [ 48.662972] RIP: tipc_mon_delete+0xea/0x270 [tipc] RSP: ffff8801d827f668 [ 48.664073] CR2: 0000000000001008 [ 48.664576] ---[ end trace e811818d54d5ce88 ]--- Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tommi Rantala Signed-off-by: David S. Miller --- net/tipc/monitor.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 8e884ed06d4b..32dc33a94bc7 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -642,9 +642,13 @@ void tipc_mon_delete(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); struct tipc_monitor *mon = tipc_monitor(net, bearer_id); - struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *self; struct tipc_peer *peer, *tmp; + if (!mon) + return; + + self = get_self(net, bearer_id); write_lock_bh(&mon->lock); tn->monitors[bearer_id] = NULL; list_for_each_entry_safe(peer, tmp, &self->list, list) { -- cgit v1.2.3 From 8cb38a602478e9f806571f6920b0a3298aabf042 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 22 Dec 2017 10:15:20 -0800 Subject: sctp: Replace use of sockets_allocated with specified macro. The patch(180d8cd942ce) replaces all uses of struct sock fields' memory_pressure, memory_allocated, sockets_allocated, and sysctl_mem to accessor macros. But the sockets_allocated field of sctp sock is not replaced at all. Then replace it now for unifying the code. Fixes: 180d8cd942ce ("foundations of per-cgroup memory pressure controlling.") Cc: Glauber Costa Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- net/sctp/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 3253f724a995..b4fb6e4886d2 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4498,7 +4498,7 @@ static int sctp_init_sock(struct sock *sk) SCTP_DBG_OBJCNT_INC(sock); local_bh_disable(); - percpu_counter_inc(&sctp_sockets_allocated); + sk_sockets_allocated_inc(sk); sock_prot_inuse_add(net, sk->sk_prot, 1); /* Nothing can fail after this block, otherwise @@ -4542,7 +4542,7 @@ static void sctp_destroy_sock(struct sock *sk) } sctp_endpoint_free(sp->ep); local_bh_disable(); - percpu_counter_dec(&sctp_sockets_allocated); + sk_sockets_allocated_dec(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); } -- cgit v1.2.3 From 517d7c79bdb39864e617960504bdc1aa560c75c6 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Thu, 28 Dec 2017 12:03:06 +0100 Subject: tipc: fix hanging poll() for stream sockets In commit 42b531de17d2f6 ("tipc: Fix missing connection request handling"), we replaced unconditional wakeup() with condtional wakeup for clients with flags POLLIN | POLLRDNORM | POLLRDBAND. This breaks the applications which do a connect followed by poll with POLLOUT flag. These applications are not woken when the connection is ESTABLISHED and hence sleep forever. In this commit, we fix it by including the POLLOUT event for sockets in TIPC_CONNECTING state. Fixes: 42b531de17d2f6 ("tipc: Fix missing connection request handling") Acked-by: Jon Maloy Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 41127d0b925e..3b4084480377 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -727,11 +727,11 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, switch (sk->sk_state) { case TIPC_ESTABLISHED: + case TIPC_CONNECTING: if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk)) revents |= POLLOUT; /* fall thru' */ case TIPC_LISTEN: - case TIPC_CONNECTING: if (!skb_queue_empty(&sk->sk_receive_queue)) revents |= POLLIN | POLLRDNORM; break; -- cgit v1.2.3 From f72c4ac695573699dde5b71da1c3b9ef80440616 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 28 Dec 2017 12:38:13 -0500 Subject: skbuff: in skb_copy_ubufs unclone before releasing zerocopy skb_copy_ubufs must unclone before it is safe to modify its skb_shared_info with skb_zcopy_clear. Commit b90ddd568792 ("skbuff: skb_copy_ubufs must release uarg even without user frags") ensures that all skbs release their zerocopy state, even those without frags. But I forgot an edge case where such an skb arrives that is cloned. The stack does not build such packets. Vhost/tun skbs have their frags orphaned before cloning. TCP skbs only attach zerocopy state when a frag is added. But if TCP packets can be trimmed or linearized, this might occur. Tracing the code I found no instance so far (e.g., skb_linearize ends up calling skb_zcopy_clear if !skb->data_len). Still, it is non-obvious that no path exists. And it is fragile to rely on this. Fixes: b90ddd568792 ("skbuff: skb_copy_ubufs must release uarg even without user frags") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a3cb0be4c6f3..08f574081315 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1177,12 +1177,12 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) int i, new_frags; u32 d_off; - if (!num_frags) - goto release; - if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) return -EINVAL; + if (!num_frags) + goto release; + new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < new_frags; i++) { page = alloc_page(gfp_mask); -- cgit v1.2.3 From d66fa9ec53c43bba9fa973c16419f6061b7cc3ea Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 28 Dec 2017 11:00:44 -0800 Subject: strparser: Call sock_owned_by_user_nocheck strparser wants to check socket ownership without producing any warnings. As indicated by the comment in the code, it is permissible for owned_by_user to return true. Fixes: 43a0c6751a322847 ("strparser: Stream parser for messages") Reported-by: syzbot Reported-and-tested-by: Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/strparser/strparser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index c5fda15ba319..1fdab5c4eda8 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -401,7 +401,7 @@ void strp_data_ready(struct strparser *strp) * allows a thread in BH context to safely check if the process * lock is held. In this case, if the lock is held, queue work. */ - if (sock_owned_by_user(strp->sk)) { + if (sock_owned_by_user_nocheck(strp->sk)) { queue_work(strp_wq, &strp->work); return; } -- cgit v1.2.3 From 257a4b018d1b514a1cc738e3ca11b566d8f3a3d8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 Dec 2017 17:34:44 +1100 Subject: xfrm: Forbid state updates from changing encap type Currently we allow state updates to competely replace the contents of x->encap. This is bad because on the user side ESP only sets up header lengths depending on encap_type once when the state is first created. This could result in the header lengths getting out of sync with the actual state configuration. In practice key managers will never do a state update to change the encapsulation type. Only the port numbers need to be changed as the peer NAT entry is updated. Therefore this patch adds a check in xfrm_state_update to forbid any changes to the encap_type. Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 500b3391f474..1e80f68e2266 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1534,8 +1534,12 @@ out: err = -EINVAL; spin_lock_bh(&x1->lock); if (likely(x1->km.state == XFRM_STATE_VALID)) { - if (x->encap && x1->encap) + if (x->encap && x1->encap && + x->encap->encap_type == x1->encap->encap_type) memcpy(x1->encap, x->encap, sizeof(*x1->encap)); + else if (x->encap || x1->encap) + goto fail; + if (x->coaddr && x1->coaddr) { memcpy(x1->coaddr, x->coaddr, sizeof(*x1->coaddr)); } @@ -1552,6 +1556,8 @@ out: x->km.state = XFRM_STATE_DEAD; __xfrm_state_put(x); } + +fail: spin_unlock_bh(&x1->lock); xfrm_state_put(x1); -- cgit v1.2.3 From 862591bf4f519d1b8d859af720fafeaebdd0162a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 27 Dec 2017 23:25:45 +0100 Subject: xfrm: skip policies marked as dead while rehashing syzkaller triggered following KASAN splat: BUG: KASAN: slab-out-of-bounds in xfrm_hash_rebuild+0xdbe/0xf00 net/xfrm/xfrm_policy.c:618 read of size 2 at addr ffff8801c8e92fe4 by task kworker/1:1/23 [..] Workqueue: events xfrm_hash_rebuild [..] __asan_report_load2_noabort+0x14/0x20 mm/kasan/report.c:428 xfrm_hash_rebuild+0xdbe/0xf00 net/xfrm/xfrm_policy.c:618 process_one_work+0xbbf/0x1b10 kernel/workqueue.c:2112 worker_thread+0x223/0x1990 kernel/workqueue.c:2246 [..] The reproducer triggers: 1016 if (error) { 1017 list_move_tail(&walk->walk.all, &x->all); 1018 goto out; 1019 } in xfrm_policy_walk() via pfkey (it sets tiny rcv space, dump callback returns -ENOBUFS). In this case, *walk is located the pfkey socket struct, so this socket becomes visible in the global policy list. It looks like this is intentional -- phony walker has walk.dead set to 1 and all other places skip such "policies". Ccing original authors of the two commits that seem to expose this issue (first patch missed ->dead check, second patch adds pfkey sockets to policies dumper list). Fixes: 880a6fab8f6ba5b ("xfrm: configure policy hash table thresholds by netlink") Fixes: 12a169e7d8f4b1c ("ipsec: Put dumpers on the dump list") Cc: Herbert Xu Cc: Timo Teras Cc: Christophe Gouault Reported-by: syzbot Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 70aa5cb0c659..2ef6db98e9ba 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -609,7 +609,8 @@ static void xfrm_hash_rebuild(struct work_struct *work) /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { - if (xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) { + if (policy->walk.dead || + xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) { /* skip socket policies */ continue; } -- cgit v1.2.3 From 06b335cb51af018d5feeff5dd4fd53847ddb675a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 29 Dec 2017 18:13:05 -0600 Subject: af_key: fix buffer overread in verify_address_len() If a message sent to a PF_KEY socket ended with one of the extensions that takes a 'struct sadb_address' but there were not enough bytes remaining in the message for the ->sa_family member of the 'struct sockaddr' which is supposed to follow, then verify_address_len() read past the end of the message, into uninitialized memory. Fix it by returning -EINVAL in this case. This bug was found using syzkaller with KMSAN. Reproducer: #include #include #include int main() { int sock = socket(PF_KEY, SOCK_RAW, PF_KEY_V2); char buf[24] = { 0 }; struct sadb_msg *msg = (void *)buf; struct sadb_address *addr = (void *)(msg + 1); msg->sadb_msg_version = PF_KEY_V2; msg->sadb_msg_type = SADB_DELETE; msg->sadb_msg_len = 3; addr->sadb_address_len = 1; addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; write(sock, buf, 24); } Reported-by: Alexander Potapenko Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Steffen Klassert --- net/key/af_key.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index 3dffb892d52c..596499cc8b2f 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -401,6 +401,11 @@ static int verify_address_len(const void *p) #endif int len; + if (sp->sadb_address_len < + DIV_ROUND_UP(sizeof(*sp) + offsetofend(typeof(*addr), sa_family), + sizeof(uint64_t))) + return -EINVAL; + switch (addr->sa_family) { case AF_INET: len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin), sizeof(uint64_t)); -- cgit v1.2.3 From 4e765b4972af7b07adcb1feb16e7a525ce1f6b28 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 29 Dec 2017 18:15:23 -0600 Subject: af_key: fix buffer overread in parse_exthdrs() If a message sent to a PF_KEY socket ended with an incomplete extension header (fewer than 4 bytes remaining), then parse_exthdrs() read past the end of the message, into uninitialized memory. Fix it by returning -EINVAL in this case. Reproducer: #include #include #include int main() { int sock = socket(PF_KEY, SOCK_RAW, PF_KEY_V2); char buf[17] = { 0 }; struct sadb_msg *msg = (void *)buf; msg->sadb_msg_version = PF_KEY_V2; msg->sadb_msg_type = SADB_DELETE; msg->sadb_msg_len = 2; write(sock, buf, 17); } Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Steffen Klassert --- net/key/af_key.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index 596499cc8b2f..d40861a048fe 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -516,6 +516,9 @@ static int parse_exthdrs(struct sk_buff *skb, const struct sadb_msg *hdr, void * uint16_t ext_type; int ext_len; + if (len < sizeof(*ehdr)) + return -EINVAL; + ext_len = ehdr->sadb_ext_len; ext_len *= sizeof(uint64_t); ext_type = ehdr->sadb_ext_type; -- cgit v1.2.3 From 2f10a61cee8fdb9f8da90f5db687e1862b22cf06 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Sun, 31 Dec 2017 16:18:56 +0100 Subject: xfrm: fix rcu usage in xfrm_get_type_offload request_module can sleep, thus we cannot hold rcu_read_lock() while calling it. The function also jumps back and takes rcu_read_lock() again (in xfrm_state_get_afinfo()), resulting in an imbalance. This codepath is triggered whenever a new offloaded state is created. Fixes: ffdb5211da1c ("xfrm: Auto-load xfrm offload modules") Reported-by: syzbot+ca425f44816d749e8eb49755567a75ee48cf4a30@syzkaller.appspotmail.com Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1e80f68e2266..429957412633 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -313,13 +313,14 @@ retry: if ((type && !try_module_get(type->owner))) type = NULL; + rcu_read_unlock(); + if (!type && try_load) { request_module("xfrm-offload-%d-%d", family, proto); try_load = 0; goto retry; } - rcu_read_unlock(); return type; } -- cgit v1.2.3 From 55a5ec9b77106ffc05e8c40d7568432bf4696d7b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 2 Jan 2018 11:45:07 -0500 Subject: Revert "net: core: dev_get_valid_name is now the same as dev_alloc_name_ns" This reverts commit 87c320e51519a83c496ab7bfb4e96c8f9c001e89. Changing the error return code in some situations turns out to be harmful in practice. In particular Michael Ellerman reports that DHCP fails on his powerpc machines, and this revert gets things working again. Johannes Berg agrees that this revert is the best course of action for now. Fixes: 029b6d140550 ("Revert "net: core: maybe return -EEXIST in __dev_alloc_name"") Reported-by: Michael Ellerman Signed-off-by: David S. Miller --- net/core/dev.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 01ee854454a8..0e0ba36eeac9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1146,7 +1146,19 @@ EXPORT_SYMBOL(dev_alloc_name); int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { - return dev_alloc_name_ns(net, dev, name); + BUG_ON(!net); + + if (!dev_valid_name(name)) + return -EINVAL; + + if (strchr(name, '%')) + return dev_alloc_name_ns(net, dev, name); + else if (__dev_get_by_name(net, name)) + return -EEXIST; + else if (dev->name != name) + strlcpy(dev->name, name, IFNAMSIZ); + + return 0; } EXPORT_SYMBOL(dev_get_valid_name); -- cgit v1.2.3 From 23263ec86a5f44312d2899323872468752324107 Mon Sep 17 00:00:00 2001 From: Eli Cooper Date: Mon, 25 Dec 2017 10:43:49 +0800 Subject: ip6_tunnel: disable dst caching if tunnel is dual-stack When an ip6_tunnel is in mode 'any', where the transport layer protocol can be either 4 or 41, dst_cache must be disabled. This is because xfrm policies might apply to only one of the two protocols. Caching dst would cause xfrm policies for one protocol incorrectly used for the other. Signed-off-by: Eli Cooper Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 931c38f6ff4a..b263c809d8d4 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1074,10 +1074,11 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } - } else if (!(t->parms.flags & - (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { - /* enable the cache only only if the routing decision does - * not depend on the current inner header value + } else if (t->parms.proto != 0 && !(t->parms.flags & + (IP6_TNL_F_USE_ORIG_TCLASS | + IP6_TNL_F_USE_ORIG_FWMARK))) { + /* enable the cache only if neither the outer protocol nor the + * routing decision depends on the current inner header value */ use_cache = true; } -- cgit v1.2.3 From 2fa771be953a17f8e0a9c39103464c2574444c62 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 25 Dec 2017 14:45:12 +0800 Subject: ip6_tunnel: allow ip6gre dev mtu to be set below 1280 Commit 582442d6d5bc ("ipv6: Allow the MTU of ipip6 tunnel to be set below 1280") fixed a mtu setting issue. It works for ipip6 tunnel. But ip6gre dev updates the mtu also with ip6_tnl_change_mtu. Since the inner packet over ip6gre can be ipv4 and it's mtu should also be allowed to set below 1280, the same issue also exists on ip6gre. This patch is to fix it by simply changing to check if parms.proto is IPPROTO_IPV6 in ip6_tnl_change_mtu instead, to make ip6gre to go to 'else' branch. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index b263c809d8d4..9a7cf355bc8c 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1677,11 +1677,11 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); - if (tnl->parms.proto == IPPROTO_IPIP) { - if (new_mtu < ETH_MIN_MTU) + if (tnl->parms.proto == IPPROTO_IPV6) { + if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { - if (new_mtu < IPV6_MIN_MTU) + if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (new_mtu > 0xFFF8 - dev->hard_header_len) -- cgit v1.2.3 From 3bb23421a504f01551b7cb9dff0e41dbf16656b0 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 26 Dec 2017 07:48:51 +0200 Subject: net/sched: Fix update of lastuse in act modules implementing stats_update We need to update lastuse to to the most updated value between what is already set and the new value. If HW matching fails, i.e. because of an issue, the stats are not updated but it could be that software did match and updated lastuse. Fixes: 5712bf9c5c30 ("net/sched: act_mirred: Use passed lastuse argument") Fixes: 9fea47d93bcc ("net/sched: act_gact: Update statistics when offloaded to hardware") Signed-off-by: Roi Dayan Reviewed-by: Paul Blakey Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_gact.c | 2 +- net/sched/act_mirred.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index e29a48ef7fc3..a0ac42b3ed06 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -159,7 +159,7 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, if (action == TC_ACT_SHOT) this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; - tm->lastuse = lastuse; + tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 8b3e59388480..08b61849c2a2 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -239,7 +239,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, struct tcf_t *tm = &m->tcf_tm; _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - tm->lastuse = lastuse; + tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, -- cgit v1.2.3 From 71891e2dab6b55a870f8f7735e44a2963860b5c6 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 29 Dec 2017 10:02:52 -0800 Subject: ethtool: do not print warning for applications using legacy API In kernel log ths message appears on every boot: "warning: `NetworkChangeNo' uses legacy ethtool link settings API, link modes are only partially reported" When ethtool link settings API changed, it started complaining about usages of old API. Ironically, the original patch was from google but the application using the legacy API is chrome. Linux ABI is fixed as much as possible. The kernel must not break it and should not complain about applications using legacy API's. This patch just removes the warning since using legacy API's in Linux is perfectly acceptable. Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") Signed-off-by: Stephen Hemminger Signed-off-by: David Decotigny Signed-off-by: David S. Miller --- net/core/ethtool.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index f8fcf450a36e..8225416911ae 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -770,15 +770,6 @@ static int ethtool_set_link_ksettings(struct net_device *dev, return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); } -static void -warn_incomplete_ethtool_legacy_settings_conversion(const char *details) -{ - char name[sizeof(current->comm)]; - - pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n", - get_task_comm(name, current), details); -} - /* Query device for its ethtool_cmd settings. * * Backward compatibility note: for compatibility with legacy ethtool, @@ -805,10 +796,8 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) &link_ksettings); if (err < 0) return err; - if (!convert_link_ksettings_to_legacy_settings(&cmd, - &link_ksettings)) - warn_incomplete_ethtool_legacy_settings_conversion( - "link modes are only partially reported"); + convert_link_ksettings_to_legacy_settings(&cmd, + &link_ksettings); /* send a sensible cmd tag back to user */ cmd.cmd = ETHTOOL_GSET; -- cgit v1.2.3 From f9c935db8086231a35b7f5c2a53e3f1e10f388ee Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 29 Dec 2017 19:48:02 +0100 Subject: tipc: fix problems with multipoint-to-point flow control In commit 04d7b574b245 ("tipc: add multipoint-to-point flow control") we introduced a protocol for preventing buffer overflow when many group members try to simultaneously send messages to the same receiving member. Stress test of this mechanism has revealed a couple of related bugs: - When the receiving member receives an advertisement REMIT message from one of the senders, it will sometimes prematurely activate a pending member and send it the remitted advertisement, although the upper limit for active senders has been reached. This leads to accumulation of illegal advertisements, and eventually to messages being dropped because of receive buffer overflow. - When the receiving member leaves REMITTED state while a received message is being read, we miss to look at the pending queue, to activate the oldest pending peer. This leads to some pending senders being starved out, and never getting the opportunity to profit from the remitted advertisement. We fix the former in the function tipc_group_proto_rcv() by returning directly from the function once it becomes clear that the remitting peer cannot leave REMITTED state at that point. We fix the latter in the function tipc_group_update_rcv_win() by looking up and activate the longest pending peer when it becomes clear that the remitting peer now can leave REMITTED state. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index 8e12ab55346b..5f4ffae807ee 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -109,7 +109,8 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, static void tipc_group_decr_active(struct tipc_group *grp, struct tipc_member *m) { - if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING) + if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING || + m->state == MBR_REMITTED) grp->active_cnt--; } @@ -562,7 +563,7 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, int max_active = grp->max_active; int reclaim_limit = max_active * 3 / 4; int active_cnt = grp->active_cnt; - struct tipc_member *m, *rm; + struct tipc_member *m, *rm, *pm; m = tipc_group_find_member(grp, node, port); if (!m) @@ -605,6 +606,17 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, pr_warn_ratelimited("Rcv unexpected msg after REMIT\n"); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } + grp->active_cnt--; + list_del_init(&m->list); + if (list_empty(&grp->pending)) + return; + + /* Set oldest pending member to active and advertise */ + pm = list_first_entry(&grp->pending, struct tipc_member, list); + pm->state = MBR_ACTIVE; + list_move_tail(&pm->list, &grp->active); + grp->active_cnt++; + tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_RECLAIMING: case MBR_DISCOVERED: @@ -742,14 +754,14 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, if (!m || m->state != MBR_RECLAIMING) return; - list_del_init(&m->list); - grp->active_cnt--; remitted = msg_grp_remitted(hdr); /* Messages preceding the REMIT still in receive queue */ if (m->advertised > remitted) { m->state = MBR_REMITTED; in_flight = m->advertised - remitted; + m->advertised = ADV_IDLE + in_flight; + return; } /* All messages preceding the REMIT have been read */ if (m->advertised <= remitted) { @@ -761,6 +773,8 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); m->advertised = ADV_IDLE + in_flight; + grp->active_cnt--; + list_del_init(&m->list); /* Set oldest pending member to active and advertise */ if (list_empty(&grp->pending)) -- cgit v1.2.3 From c095508770aebf1b9218e77026e48345d719b17c Mon Sep 17 00:00:00 2001 From: Mohamed Ghannam Date: Tue, 2 Jan 2018 19:44:34 +0000 Subject: RDS: Heap OOB write in rds_message_alloc_sgs() When args->nr_local is 0, nr_pages gets also 0 due some size calculation via rds_rm_size(), which is later used to allocate pages for DMA, this bug produces a heap Out-Of-Bound write access to a specific memory region. Signed-off-by: Mohamed Ghannam Signed-off-by: David S. Miller --- net/rds/rdma.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index bc2f1e0977d6..94729d9da437 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -525,6 +525,9 @@ int rds_rdma_extra_size(struct rds_rdma_args *args) local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + if (args->nr_local == 0) + return -EINVAL; + /* figure out the number of pages in the vector */ for (i = 0; i < args->nr_local; i++) { if (copy_from_user(&vec, &local_vec[i], -- cgit v1.2.3 From 79d0895140e937ba111e6420b4cd83ee75efa788 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 2 Jan 2018 19:44:37 -0200 Subject: sctp: fix error path in sctp_stream_init syzbot noticed a NULL pointer dereference panic in sctp_stream_free() which was caused by an incomplete error handling in sctp_stream_init(). By not clearing stream->outcnt, it made a for() in sctp_stream_free() think that it had elements to free, but not, leading to the panic. As suggested by Xin Long, this patch also simplifies the error path by moving it to the only if() that uses it. See-also: https://www.spinics.net/lists/netdev/msg473756.html See-also: https://www.spinics.net/lists/netdev/msg465024.html Reported-by: syzbot Fixes: f952be79cebd ("sctp: introduce struct sctp_stream_out_ext") Signed-off-by: Marcelo Ricardo Leitner Reviewed-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/stream.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 76ea66be0bbe..524dfeb94c41 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -156,9 +156,9 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, sctp_stream_outq_migrate(stream, NULL, outcnt); sched->sched_all(stream); - i = sctp_stream_alloc_out(stream, outcnt, gfp); - if (i) - return i; + ret = sctp_stream_alloc_out(stream, outcnt, gfp); + if (ret) + goto out; stream->outcnt = outcnt; for (i = 0; i < stream->outcnt; i++) @@ -170,19 +170,17 @@ in: if (!incnt) goto out; - i = sctp_stream_alloc_in(stream, incnt, gfp); - if (i) { - ret = -ENOMEM; - goto free; + ret = sctp_stream_alloc_in(stream, incnt, gfp); + if (ret) { + sched->free(stream); + kfree(stream->out); + stream->out = NULL; + stream->outcnt = 0; + goto out; } stream->incnt = incnt; - goto out; -free: - sched->free(stream); - kfree(stream->out); - stream->out = NULL; out: return ret; } -- cgit v1.2.3 From 3ea15452ee85754f70f3b9fa1f23165ef2e77ba7 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 3 Jan 2018 11:00:31 +0800 Subject: nl80211: Check for the required netlink attribute presence nl80211_nan_add_func() does not check if the required attribute NL80211_NAN_FUNC_FOLLOW_UP_DEST is present when processing NL80211_CMD_ADD_NAN_FUNCTION request. This request can be issued by users with CAP_NET_ADMIN privilege and may result in NULL dereference and a system crash. Add a check for the required attribute presence. Signed-off-by: Hao Chen Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 213d0c498c97..2b3dbcd40e46 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -11361,7 +11361,8 @@ static int nl80211_nan_add_func(struct sk_buff *skb, break; case NL80211_NAN_FUNC_FOLLOW_UP: if (!tb[NL80211_NAN_FUNC_FOLLOW_UP_ID] || - !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]) { + !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] || + !tb[NL80211_NAN_FUNC_FOLLOW_UP_DEST]) { err = -EINVAL; goto out; } -- cgit v1.2.3 From 736a80bbfda709fb3631f5f62056f250a38e5804 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 4 Jan 2018 15:51:53 +0100 Subject: mac80211: mesh: drop frames appearing to be from us If there are multiple mesh stations with the same MAC address, they will both get confused and start throwing warnings. Obviously in this case nothing can actually work anyway, so just drop frames that look like they're from ourselves early on. Reported-by: Gui Iribarren Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 70e9d2ca8bbe..4daafb07602f 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3632,6 +3632,8 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) } return true; case NL80211_IFTYPE_MESH_POINT: + if (ether_addr_equal(sdata->vif.addr, hdr->addr2)) + return false; if (multicast) return true; return ether_addr_equal(sdata->vif.addr, hdr->addr1); -- cgit v1.2.3 From 06e7e776ca4d36547e503279aeff996cbb292c16 Mon Sep 17 00:00:00 2001 From: Ben Seri Date: Fri, 8 Dec 2017 15:14:47 +0100 Subject: Bluetooth: Prevent stack info leak from the EFS element. In the function l2cap_parse_conf_rsp and in the function l2cap_parse_conf_req the following variable is declared without initialization: struct l2cap_conf_efs efs; In addition, when parsing input configuration parameters in both of these functions, the switch case for handling EFS elements may skip the memcpy call that will write to the efs variable: ... case L2CAP_CONF_EFS: if (olen == sizeof(efs)) memcpy(&efs, (void *)val, olen); ... The olen in the above if is attacker controlled, and regardless of that if, in both of these functions the efs variable would eventually be added to the outgoing configuration request that is being built: l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), (unsigned long) &efs); So by sending a configuration request, or response, that contains an L2CAP_CONF_EFS element, but with an element length that is not sizeof(efs) - the memcpy to the uninitialized efs variable can be avoided, and the uninitialized variable would be returned to the attacker (16 bytes). This issue has been assigned CVE-2017-1000410 Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Cc: stable Signed-off-by: Ben Seri Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/l2cap_core.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 43ba91c440bc..fc6615d59165 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -3363,9 +3363,10 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data break; case L2CAP_CONF_EFS: - remote_efs = 1; - if (olen == sizeof(efs)) + if (olen == sizeof(efs)) { + remote_efs = 1; memcpy(&efs, (void *) val, olen); + } break; case L2CAP_CONF_EWS: @@ -3584,16 +3585,17 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, break; case L2CAP_CONF_EFS: - if (olen == sizeof(efs)) + if (olen == sizeof(efs)) { memcpy(&efs, (void *)val, olen); - if (chan->local_stype != L2CAP_SERV_NOTRAFIC && - efs.stype != L2CAP_SERV_NOTRAFIC && - efs.stype != chan->local_stype) - return -ECONNREFUSED; + if (chan->local_stype != L2CAP_SERV_NOTRAFIC && + efs.stype != L2CAP_SERV_NOTRAFIC && + efs.stype != chan->local_stype) + return -ECONNREFUSED; - l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs, endptr - ptr); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), + (unsigned long) &efs, endptr - ptr); + } break; case L2CAP_CONF_FCS: -- cgit v1.2.3 From f428fe4a04cc339166c8bbd489789760de3a0cee Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 2 Jan 2018 23:27:33 -0800 Subject: rtnetlink: give a user socket to get_target_net() This function is used from two places: rtnl_dump_ifinfo and rtnl_getlink. In rtnl_getlink(), we give a request skb into get_target_net(), but in rtnl_dump_ifinfo, we give a response skb into get_target_net(). The problem here is that NETLINK_CB() isn't initialized for the response skb. In both cases we can get a user socket and give it instead of skb into get_target_net(). This bug was found by syzkaller with this call-trace: kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Modules linked in: CPU: 1 PID: 3149 Comm: syzkaller140561 Not tainted 4.15.0-rc4-mm1+ #47 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__netlink_ns_capable+0x8b/0x120 net/netlink/af_netlink.c:868 RSP: 0018:ffff8801c880f348 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff8443f900 RDX: 000000000000007b RSI: ffffffff86510f40 RDI: 00000000000003d8 RBP: ffff8801c880f360 R08: 0000000000000000 R09: 1ffff10039101e4f R10: 0000000000000000 R11: 0000000000000001 R12: ffffffff86510f40 R13: 000000000000000c R14: 0000000000000004 R15: 0000000000000011 FS: 0000000001a1a880(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020151000 CR3: 00000001c9511005 CR4: 00000000001606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: netlink_ns_capable+0x26/0x30 net/netlink/af_netlink.c:886 get_target_net+0x9d/0x120 net/core/rtnetlink.c:1765 rtnl_dump_ifinfo+0x2e5/0xee0 net/core/rtnetlink.c:1806 netlink_dump+0x48c/0xce0 net/netlink/af_netlink.c:2222 __netlink_dump_start+0x4f0/0x6d0 net/netlink/af_netlink.c:2319 netlink_dump_start include/linux/netlink.h:214 [inline] rtnetlink_rcv_msg+0x7f0/0xb10 net/core/rtnetlink.c:4485 netlink_rcv_skb+0x21e/0x460 net/netlink/af_netlink.c:2441 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4540 netlink_unicast_kernel net/netlink/af_netlink.c:1308 [inline] netlink_unicast+0x4be/0x6a0 net/netlink/af_netlink.c:1334 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1897 Cc: Jiri Benc Fixes: 79e1ad148c84 ("rtnetlink: use netnsid to query interface") Signed-off-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dabba2a91fc8..778d7f03404a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1681,18 +1681,18 @@ static bool link_dump_filtered(struct net_device *dev, return false; } -static struct net *get_target_net(struct sk_buff *skb, int netnsid) +static struct net *get_target_net(struct sock *sk, int netnsid) { struct net *net; - net = get_net_ns_by_id(sock_net(skb->sk), netnsid); + net = get_net_ns_by_id(sock_net(sk), netnsid); if (!net) return ERR_PTR(-EINVAL); /* For now, the caller is required to have CAP_NET_ADMIN in * the user namespace owning the target net ns. */ - if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) { put_net(net); return ERR_PTR(-EACCES); } @@ -1733,7 +1733,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) ifla_policy, NULL) >= 0) { if (tb[IFLA_IF_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); - tgt_net = get_target_net(skb, netnsid); + tgt_net = get_target_net(skb->sk, netnsid); if (IS_ERR(tgt_net)) { tgt_net = net; netnsid = -1; @@ -2883,7 +2883,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[IFLA_IF_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); - tgt_net = get_target_net(skb, netnsid); + tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } -- cgit v1.2.3 From 7d11f77f84b27cef452cee332f4e469503084737 Mon Sep 17 00:00:00 2001 From: Mohamed Ghannam Date: Wed, 3 Jan 2018 21:06:06 +0000 Subject: RDS: null pointer dereference in rds_atomic_free_op set rm->atomic.op_active to 0 when rds_pin_pages() fails or the user supplied address is invalid, this prevents a NULL pointer usage in rds_atomic_free_op() Signed-off-by: Mohamed Ghannam Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/rdma.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 94729d9da437..634cfcb7bba6 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -877,6 +877,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, err: if (page) put_page(page); + rm->atomic.op_active = 0; kfree(rm->atomic.op_notifier); return ret; -- cgit v1.2.3 From 7bbfe00e025240505db3e04c3b296d7c023b2a26 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 3 Jan 2018 14:11:59 -0800 Subject: ipv6: fix general protection fault in fib6_add() In fib6_add(), pn could be NULL if fib6_add_1() failed to return a fib6 node. Checking pn != fn before accessing pn->leaf makes sure pn is not NULL. This fixes the following GPF reported by syzkaller: general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 3201 Comm: syzkaller001778 Not tainted 4.15.0-rc5+ #151 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:fib6_add+0x736/0x15a0 net/ipv6/ip6_fib.c:1244 RSP: 0018:ffff8801c7626a70 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 0000000000000020 RCX: ffffffff84794465 RDX: 0000000000000004 RSI: ffff8801d38935f0 RDI: 0000000000000282 RBP: ffff8801c7626da0 R08: 1ffff10038ec4c35 R09: 0000000000000000 R10: ffff8801c7626c68 R11: 0000000000000000 R12: 00000000fffffffe R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000009 FS: 0000000000000000(0000) GS:ffff8801db200000(0063) knlGS:0000000009b70840 CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 CR2: 0000000020be1000 CR3: 00000001d585a006 CR4: 00000000001606f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1006 ip6_route_multipath_add+0xd14/0x16c0 net/ipv6/route.c:3833 inet6_rtm_newroute+0xdc/0x160 net/ipv6/route.c:3957 rtnetlink_rcv_msg+0x733/0x1020 net/core/rtnetlink.c:4411 netlink_rcv_skb+0x21e/0x460 net/netlink/af_netlink.c:2408 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4423 netlink_unicast_kernel net/netlink/af_netlink.c:1275 [inline] netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1301 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1864 sock_sendmsg_nosec net/socket.c:636 [inline] sock_sendmsg+0xca/0x110 net/socket.c:646 sock_write_iter+0x31a/0x5d0 net/socket.c:915 call_write_iter include/linux/fs.h:1772 [inline] do_iter_readv_writev+0x525/0x7f0 fs/read_write.c:653 do_iter_write+0x154/0x540 fs/read_write.c:932 compat_writev+0x225/0x420 fs/read_write.c:1246 do_compat_writev+0x115/0x220 fs/read_write.c:1267 C_SYSC_writev fs/read_write.c:1278 [inline] compat_SyS_writev+0x26/0x30 fs/read_write.c:1274 do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline] do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389 entry_SYSENTER_compat+0x54/0x63 arch/x86/entry/entry_64_compat.S:125 Reported-by: syzbot Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Signed-off-by: Wei Wang Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f5285f4e1d08..d11a5578e4f8 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1241,23 +1241,28 @@ out: * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ - struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, - lockdep_is_held(&table->tb6_lock)); - if (pn != fn && pn_leaf == rt) { - pn_leaf = NULL; - RCU_INIT_POINTER(pn->leaf, NULL); - atomic_dec(&rt->rt6i_ref); - } - if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { - pn_leaf = fib6_find_prefix(info->nl_net, table, pn); -#if RT6_DEBUG >= 2 - if (!pn_leaf) { - WARN_ON(!pn_leaf); - pn_leaf = info->nl_net->ipv6.ip6_null_entry; + if (pn != fn) { + struct rt6_info *pn_leaf = + rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (pn_leaf == rt) { + pn_leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); + atomic_dec(&rt->rt6i_ref); } + if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn_leaf = fib6_find_prefix(info->nl_net, table, + pn); +#if RT6_DEBUG >= 2 + if (!pn_leaf) { + WARN_ON(!pn_leaf); + pn_leaf = + info->nl_net->ipv6.ip6_null_entry; + } #endif - atomic_inc(&pn_leaf->rt6i_ref); - rcu_assign_pointer(pn->leaf, pn_leaf); + atomic_inc(&pn_leaf->rt6i_ref); + rcu_assign_pointer(pn->leaf, pn_leaf); + } } #endif goto failure; -- cgit v1.2.3 From d16b46e4fd8bc6063624605f25b8c0835bb1fbe3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 4 Jan 2018 22:25:07 +1100 Subject: xfrm: Use __skb_queue_tail in xfrm_trans_queue We do not need locking in xfrm_trans_queue because it is designed to use per-CPU buffers. However, the original code incorrectly used skb_queue_tail which takes the lock. This patch switches it to __skb_queue_tail instead. Reported-and-tested-by: Artem Savkov Fixes: acf568ee859f ("xfrm: Reinject transport-mode packets...") Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 3f6f6f8c9fa5..5b2409746ae0 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -518,7 +518,7 @@ int xfrm_trans_queue(struct sk_buff *skb, return -ENOBUFS; XFRM_TRANS_SKB_CB(skb)->finish = finish; - skb_queue_tail(&trans->queue, skb); + __skb_queue_tail(&trans->queue, skb); tasklet_schedule(&trans->tasklet); return 0; } -- cgit v1.2.3 From 040ee69226f8a96b7943645d68f41d5d44b5ff7d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 Dec 2017 20:20:38 -0500 Subject: fix "netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'" Descriptor table is a shared object; it's not a place where you can stick temporary references to files, especially when we don't need an opened file at all. Cc: stable@vger.kernel.org # v4.14 Fixes: 98589a0998b8 ("netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'") Signed-off-by: Al Viro --- include/linux/bpf.h | 10 ++++++++++ kernel/bpf/inode.c | 40 +++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 2 +- net/netfilter/xt_bpf.c | 14 ++------------ 4 files changed, 52 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e55e4255a210..b63a592ad29d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -419,6 +419,8 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) attr->numa_node : NUMA_NO_NODE; } +struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -506,6 +508,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, { return 0; } + +static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, + enum bpf_prog_type type) +{ + return ERR_PTR(-EOPNOTSUPP); +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, @@ -514,6 +522,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, return bpf_prog_get_type_dev(ufd, type, false); } +bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); + int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 01aaef1a77c5..5bb5e49ef4c3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -368,7 +368,45 @@ out: putname(pname); return ret; } -EXPORT_SYMBOL_GPL(bpf_obj_get_user); + +static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + int ret = inode_permission(inode, MAY_READ | MAY_WRITE); + if (ret) + return ERR_PTR(ret); + + if (inode->i_op == &bpf_map_iops) + return ERR_PTR(-EINVAL); + if (inode->i_op != &bpf_prog_iops) + return ERR_PTR(-EACCES); + + prog = inode->i_private; + + ret = security_bpf_prog(prog); + if (ret < 0) + return ERR_PTR(ret); + + if (!bpf_prog_get_ok(prog, &type, false)) + return ERR_PTR(-EINVAL); + + return bpf_prog_inc(prog); +} + +struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + struct path path; + int ret = kern_path(name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + prog = __get_prog_inode(d_backing_inode(path.dentry), type); + if (!IS_ERR(prog)) + touch_atime(&path); + path_put(&path); + return prog; +} +EXPORT_SYMBOL(bpf_prog_get_type_path); static void bpf_evict_inode(struct inode *inode) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c4cfeaa8d5e..5cb783fc8224 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1057,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static bool bpf_prog_get_ok(struct bpf_prog *prog, +bool bpf_prog_get_ok(struct bpf_prog *prog, enum bpf_prog_type *attach_type, bool attach_drv) { /* not an attachment, just a refcount inc, always allow */ diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 041da0d9c06f..fa2ca0a13619 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -52,18 +52,8 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) { - mm_segment_t oldfs = get_fs(); - int retval, fd; - - set_fs(KERNEL_DS); - fd = bpf_obj_get_user(path, 0); - set_fs(oldfs); - if (fd < 0) - return fd; - - retval = __bpf_mt_check_fd(fd, ret); - sys_close(fd); - return retval; + *ret = bpf_prog_get_type_path(path, BPF_PROG_TYPE_SOCKET_FILTER); + return PTR_ERR_OR_ZERO(*ret); } static int bpf_mt_check(const struct xt_mtchk_param *par) -- cgit v1.2.3 From bcfd09f7837f5240c30fd2f52ee7293516641faa Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 5 Jan 2018 22:12:32 +1100 Subject: xfrm: Return error on unknown encap_type in init_state Currently esp will happily create an xfrm state with an unknown encap type for IPv4, without setting the necessary state parameters. This patch fixes it by returning -EINVAL. There is a similar problem in IPv6 where if the mode is unknown we will skip initialisation while returning zero. However, this is harmless as the mode has already been checked further up the stack. This patch removes this anomaly by aligning the IPv6 behaviour with IPv4 and treating unknown modes (which cannot actually happen) as transport mode. Fixes: 38320c70d282 ("[IPSEC]: Use crypto_aead and authenc in ESP") Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 1 + net/ipv6/esp6.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index d57aa64fa7c7..61fe6e4d23fc 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -981,6 +981,7 @@ static int esp_init_state(struct xfrm_state *x) switch (encap->encap_type) { default: + err = -EINVAL; goto error; case UDP_ENCAP_ESPINUDP: x->props.header_len += sizeof(struct udphdr); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index a902ff8f59be..1a7f00cd4803 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -890,13 +890,12 @@ static int esp6_init_state(struct xfrm_state *x) x->props.header_len += IPV4_BEET_PHMAXLEN + (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); break; + default: case XFRM_MODE_TRANSPORT: break; case XFRM_MODE_TUNNEL: x->props.header_len += sizeof(struct ipv6hdr); break; - default: - goto error; } align = ALIGN(crypto_aead_blocksize(aead), 4); -- cgit v1.2.3 From b1bdcb59b64f806ef08d25a85c39ffb3ad841ce6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 6 Jan 2018 01:13:08 +0100 Subject: xfrm: don't call xfrm_policy_cache_flush while holding spinlock xfrm_policy_cache_flush can sleep, so it cannot be called while holding a spinlock. We could release the lock first, but I don't see why we need to invoke this function here in first place, the packet path won't reuse an xdst entry unless its still valid. While at it, add an annotation to xfrm_policy_cache_flush, it would have probably caught this bug sooner. Fixes: ec30d78c14a813 ("xfrm: add xdst pcpu cache") Reported-by: syzbot+e149f7d1328c26f9c12f@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 2ef6db98e9ba..bc5eae12fb09 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -975,8 +975,6 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) } if (!cnt) err = -ESRCH; - else - xfrm_policy_cache_flush(); out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; @@ -1744,6 +1742,8 @@ void xfrm_policy_cache_flush(void) bool found = 0; int cpu; + might_sleep(); + local_bh_disable(); rcu_read_lock(); for_each_possible_cpu(cpu) { -- cgit v1.2.3 From cc35c3d1edf7a8373a1a5daa80a912dec96a9cd5 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 5 Jan 2018 11:17:17 -0200 Subject: sctp: do not retransmit upon FragNeeded if PMTU discovery is disabled Currently, if PMTU discovery is disabled on a given transport, but the configured value is higher than the actual PMTU, it is likely that we will get some icmp Frag Needed. The issue is, if PMTU discovery is disabled, we won't update the information and will issue a retransmission immediately, which may very well trigger another ICMP, and another retransmission, leading to a loop. The fix is to simply not trigger immediate retransmissions if PMTU discovery is disabled on the given transport. Changes from v2: - updated stale comment, noticed by Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/input.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 621b5ca3fd1c..9320661cc41d 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -399,20 +399,20 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, return; } - if (t->param_flags & SPP_PMTUD_ENABLE) { - /* Update transports view of the MTU */ - sctp_transport_update_pmtu(t, pmtu); + if (!(t->param_flags & SPP_PMTUD_ENABLE)) + /* We can't allow retransmitting in such case, as the + * retransmission would be sized just as before, and thus we + * would get another icmp, and retransmit again. + */ + return; - /* Update association pmtu. */ - sctp_assoc_sync_pmtu(asoc); - } + /* Update transports view of the MTU */ + sctp_transport_update_pmtu(t, pmtu); - /* Retransmit with the new pmtu setting. - * Normally, if PMTU discovery is disabled, an ICMP Fragmentation - * Needed will never be sent, but if a message was sent before - * PMTU discovery was disabled that was larger than the PMTU, it - * would not be fragmented, so it must be re-transmitted fragmented. - */ + /* Update association pmtu. */ + sctp_assoc_sync_pmtu(asoc); + + /* Retransmit with the new pmtu setting. */ sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } -- cgit v1.2.3 From b6c5734db07079c9410147b32407f2366d584e6c Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 5 Jan 2018 11:17:18 -0200 Subject: sctp: fix the handling of ICMP Frag Needed for too small MTUs syzbot reported a hang involving SCTP, on which it kept flooding dmesg with the message: [ 246.742374] sctp: sctp_transport_update_pmtu: Reported pmtu 508 too low, using default minimum of 512 That happened because whenever SCTP hits an ICMP Frag Needed, it tries to adjust to the new MTU and triggers an immediate retransmission. But it didn't consider the fact that MTUs smaller than the SCTP minimum MTU allowed (512) would not cause the PMTU to change, and issued the retransmission anyway (thus leading to another ICMP Frag Needed, and so on). As IPv4 (ip_rt_min_pmtu=556) and IPv6 (IPV6_MIN_MTU=1280) minimum MTU are higher than that, sctp_transport_update_pmtu() is changed to re-fetch the PMTU that got set after our request, and with that, detect if there was an actual change or not. The fix, thus, skips the immediate retransmission if the received ICMP resulted in no change, in the hope that SCTP will select another path. Note: The value being used for the minimum MTU (512, SCTP_DEFAULT_MINSEGMENT) is not right and instead it should be (576, SCTP_MIN_PMTU), but such change belongs to another patch. Changes from v1: - do not disable PMTU discovery, in the light of commit 06ad391919b2 ("[SCTP] Don't disable PMTU discovery when mtu is small") and as suggested by Xin Long. - changed the way to break the rtx loop by detecting if the icmp resulted in a change or not Changes from v2: none See-also: https://lkml.org/lkml/2017/12/22/811 Reported-by: syzbot Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 +- net/sctp/input.c | 8 ++++++-- net/sctp/transport.c | 29 +++++++++++++++++++---------- 3 files changed, 26 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 2f8f93da5dc2..9a5ccf03a59b 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -966,7 +966,7 @@ void sctp_transport_burst_limited(struct sctp_transport *); void sctp_transport_burst_reset(struct sctp_transport *); unsigned long sctp_transport_timeout(struct sctp_transport *); void sctp_transport_reset(struct sctp_transport *t); -void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu); +bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu); void sctp_transport_immediate_rtx(struct sctp_transport *); void sctp_transport_dst_release(struct sctp_transport *t); void sctp_transport_dst_confirm(struct sctp_transport *t); diff --git a/net/sctp/input.c b/net/sctp/input.c index 9320661cc41d..141c9c466ec1 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -406,8 +406,12 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, */ return; - /* Update transports view of the MTU */ - sctp_transport_update_pmtu(t, pmtu); + /* Update transports view of the MTU. Return if no update was needed. + * If an update wasn't needed/possible, it also doesn't make sense to + * try to retransmit now. + */ + if (!sctp_transport_update_pmtu(t, pmtu)) + return; /* Update association pmtu. */ sctp_assoc_sync_pmtu(asoc); diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 1e5a22430cf5..47f82bd794d9 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -248,28 +248,37 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } -void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) +bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) { struct dst_entry *dst = sctp_transport_dst_check(t); + bool change = true; if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { - pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n", - __func__, pmtu, SCTP_DEFAULT_MINSEGMENT); - /* Use default minimum segment size and disable - * pmtu discovery on this transport. - */ - t->pathmtu = SCTP_DEFAULT_MINSEGMENT; - } else { - t->pathmtu = pmtu; + pr_warn_ratelimited("%s: Reported pmtu %d too low, using default minimum of %d\n", + __func__, pmtu, SCTP_DEFAULT_MINSEGMENT); + /* Use default minimum segment instead */ + pmtu = SCTP_DEFAULT_MINSEGMENT; } + pmtu = SCTP_TRUNC4(pmtu); if (dst) { dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu); dst = sctp_transport_dst_check(t); } - if (!dst) + if (!dst) { t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk); + dst = t->dst; + } + + if (dst) { + /* Re-fetch, as under layers may have a higher minimum size */ + pmtu = SCTP_TRUNC4(dst_mtu(dst)); + change = t->pathmtu != pmtu; + } + t->pathmtu = pmtu; + + return change; } /* Caches the dst entry and source address for a transport's destination -- cgit v1.2.3 From 374d1b5a81f7f9cc5e7f095ac3d5aff3f6600376 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 5 Jan 2018 08:35:47 +0100 Subject: esp: Fix GRO when the headers not fully in the linear part of the skb. The GRO layer does not necessarily pull the complete headers into the linear part of the skb, a part may remain on the first page fragment. This can lead to a crash if we try to pull the headers, so make sure we have them on the linear part before pulling. Fixes: 7785bba299a8 ("esp: Add a software GRO codepath") Reported-by: syzbot+82bbd65569c49c6c0c4d@syzkaller.appspotmail.com Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 3 ++- net/ipv6/esp6_offload.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index f8b918c766b0..b1338e576d00 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -38,7 +38,8 @@ static struct sk_buff **esp4_gro_receive(struct sk_buff **head, __be32 spi; int err; - skb_pull(skb, offset); + if (!pskb_pull(skb, offset)) + return NULL; if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) goto out; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 333a478aa161..dd9627490c7c 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -60,7 +60,8 @@ static struct sk_buff **esp6_gro_receive(struct sk_buff **head, int nhoff; int err; - skb_pull(skb, offset); + if (!pskb_pull(skb, offset)) + return NULL; if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) goto out; -- cgit v1.2.3 From b8fd0823e0770c2d5fdbd865bccf0d5e058e5287 Mon Sep 17 00:00:00 2001 From: Andrii Vladyka Date: Thu, 4 Jan 2018 13:09:17 +0200 Subject: net: core: fix module type in sock_diag_bind Use AF_INET6 instead of AF_INET in IPv6-related code path Signed-off-by: Andrii Vladyka Signed-off-by: David S. Miller --- net/core/sock_diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 217f4e3b82f6..146b50e30659 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -288,7 +288,7 @@ static int sock_diag_bind(struct net *net, int group) case SKNLGRP_INET6_UDP_DESTROY: if (!sock_diag_handlers[AF_INET6]) request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET); + NETLINK_SOCK_DIAG, AF_INET6); break; } return 0; -- cgit v1.2.3 From 3dc2fa47549aca71773afdd12a78d31802bb22b4 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Mon, 8 Jan 2018 19:43:00 +0800 Subject: net: caif: use strlcpy() instead of strncpy() gcc-8 reports net/caif/caif_dev.c: In function 'caif_enroll_dev': ./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may be truncated copying 15 bytes from a string of length 15 [-Wstringop-truncation] net/caif/cfctrl.c: In function 'cfctrl_linkup_request': ./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may be truncated copying 15 bytes from a string of length 15 [-Wstringop-truncation] net/caif/cfcnfg.c: In function 'caif_connect_client': ./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may be truncated copying 15 bytes from a string of length 15 [-Wstringop-truncation] The compiler require that the input param 'len' of strncpy() should be greater than the length of the src string, so that '\0' is copied as well. We can just use strlcpy() to avoid this warning. Signed-off-by: Xiongfeng Wang Signed-off-by: David S. Miller --- net/caif/caif_dev.c | 5 ++--- net/caif/cfcnfg.c | 10 ++++------ net/caif/cfctrl.c | 4 ++-- 3 files changed, 8 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 2d38b6e34203..e0adcd123f48 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -334,9 +334,8 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, mutex_lock(&caifdevs->lock); list_add_rcu(&caifd->list, &caifdevs->list); - strncpy(caifd->layer.name, dev->name, - sizeof(caifd->layer.name) - 1); - caifd->layer.name[sizeof(caifd->layer.name) - 1] = 0; + strlcpy(caifd->layer.name, dev->name, + sizeof(caifd->layer.name)); caifd->layer.transmit = transmit; cfcnfg_add_phy_layer(cfg, dev, diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index 273cb07f57d8..8f00bea093b9 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -268,17 +268,15 @@ static int caif_connect_req_to_link_param(struct cfcnfg *cnfg, case CAIFPROTO_RFM: l->linktype = CFCTRL_SRV_RFM; l->u.datagram.connid = s->sockaddr.u.rfm.connection_id; - strncpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, - sizeof(l->u.rfm.volume)-1); - l->u.rfm.volume[sizeof(l->u.rfm.volume)-1] = 0; + strlcpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, + sizeof(l->u.rfm.volume)); break; case CAIFPROTO_UTIL: l->linktype = CFCTRL_SRV_UTIL; l->endpoint = 0x00; l->chtype = 0x00; - strncpy(l->u.utility.name, s->sockaddr.u.util.service, - sizeof(l->u.utility.name)-1); - l->u.utility.name[sizeof(l->u.utility.name)-1] = 0; + strlcpy(l->u.utility.name, s->sockaddr.u.util.service, + sizeof(l->u.utility.name)); caif_assert(sizeof(l->u.utility.name) > 10); l->u.utility.paramlen = s->param.size; if (l->u.utility.paramlen > sizeof(l->u.utility.params)) diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index f5afda1abc76..655ed7032150 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -258,8 +258,8 @@ int cfctrl_linkup_request(struct cflayer *layer, tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs); cfpkt_add_body(pkt, &tmp16, 2); memset(utility_name, 0, sizeof(utility_name)); - strncpy(utility_name, param->u.utility.name, - UTILITY_NAME_LENGTH - 1); + strlcpy(utility_name, param->u.utility.name, + UTILITY_NAME_LENGTH); cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH); tmp8 = param->u.utility.paramlen; cfpkt_add_body(pkt, &tmp8, 1); -- cgit v1.2.3 From 20b50d79974ea3192e8c3ab7faf4e536e5f14d8f Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Mon, 8 Jan 2018 15:54:44 +0100 Subject: net: ipv4: emulate READ_ONCE() on ->hdrincl bit-field in raw_sendmsg() Commit 8f659a03a0ba ("net: ipv4: fix for a race condition in raw_sendmsg") fixed the issue of possibly inconsistent ->hdrincl handling due to concurrent updates by reading this bit-field member into a local variable and using the thus stabilized value in subsequent tests. However, aforementioned commit also adds the (correct) comment that /* hdrincl should be READ_ONCE(inet->hdrincl) * but READ_ONCE() doesn't work with bit fields */ because as it stands, the compiler is free to shortcut or even eliminate the local variable at its will. Note that I have not seen anything like this happening in reality and thus, the concern is a theoretical one. However, in order to be on the safe side, emulate a READ_ONCE() on the bit-field by doing it on the local 'hdrincl' variable itself: int hdrincl = inet->hdrincl; hdrincl = READ_ONCE(hdrincl); This breaks the chain in the sense that the compiler is not allowed to replace subsequent reads from hdrincl with reloads from inet->hdrincl. Fixes: 8f659a03a0ba ("net: ipv4: fix for a race condition in raw_sendmsg") Signed-off-by: Nicolai Stange Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv4/raw.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 125c1eab3eaa..5e570aa9e43b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -520,9 +520,11 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out; /* hdrincl should be READ_ONCE(inet->hdrincl) - * but READ_ONCE() doesn't work with bit fields + * but READ_ONCE() doesn't work with bit fields. + * Doing this indirectly yields the same result. */ hdrincl = inet->hdrincl; + hdrincl = READ_ONCE(hdrincl); /* * Check the flags. */ -- cgit v1.2.3 From 4512c43eac7e007d982e7ea45152ea6f3f4d1921 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 8 Jan 2018 10:34:00 -0800 Subject: ipv6: remove null_entry before adding default route In the current code, when creating a new fib6 table, tb6_root.leaf gets initialized to net->ipv6.ip6_null_entry. If a default route is being added with rt->rt6i_metric = 0xffffffff, fib6_add() will add this route after net->ipv6.ip6_null_entry. As null_entry is shared, it could cause problem. In order to fix it, set fn->leaf to NULL before calling fib6_add_rt2node() when trying to add the first default route. And reset fn->leaf to null_entry when adding fails or when deleting the last default route. syzkaller reported the following issue which is fixed by this commit: WARNING: suspicious RCU usage 4.15.0-rc5+ #171 Not tainted ----------------------------- net/ipv6/ip6_fib.c:1702 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 4 locks held by swapper/0/0: #0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] lockdep_copy_map include/linux/lockdep.h:178 [inline] #0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] call_timer_fn+0x1c6/0x820 kernel/time/timer.c:1310 #1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] spin_lock_bh include/linux/spinlock.h:315 [inline] #1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] fib6_run_gc+0x9d/0x3c0 net/ipv6/ip6_fib.c:2007 #2: (rcu_read_lock){....}, at: [<0000000091db762d>] __fib6_clean_all+0x0/0x3a0 net/ipv6/ip6_fib.c:1560 #3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] spin_lock_bh include/linux/spinlock.h:315 [inline] #3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] __fib6_clean_all+0x1d0/0x3a0 net/ipv6/ip6_fib.c:1948 stack backtrace: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc5+ #171 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4585 fib6_del+0xcaa/0x11b0 net/ipv6/ip6_fib.c:1701 fib6_clean_node+0x3aa/0x4f0 net/ipv6/ip6_fib.c:1892 fib6_walk_continue+0x46c/0x8a0 net/ipv6/ip6_fib.c:1815 fib6_walk+0x91/0xf0 net/ipv6/ip6_fib.c:1863 fib6_clean_tree+0x1e6/0x340 net/ipv6/ip6_fib.c:1933 __fib6_clean_all+0x1f4/0x3a0 net/ipv6/ip6_fib.c:1949 fib6_clean_all net/ipv6/ip6_fib.c:1960 [inline] fib6_run_gc+0x16b/0x3c0 net/ipv6/ip6_fib.c:2016 fib6_gc_timer_cb+0x20/0x30 net/ipv6/ip6_fib.c:2033 call_timer_fn+0x228/0x820 kernel/time/timer.c:1320 expire_timers kernel/time/timer.c:1357 [inline] __run_timers+0x7ee/0xb70 kernel/time/timer.c:1660 run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686 __do_softirq+0x2d7/0xb85 kernel/softirq.c:285 invoke_softirq kernel/softirq.c:365 [inline] irq_exit+0x1cc/0x200 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:540 [inline] smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052 apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:904 Reported-by: syzbot Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index d11a5578e4f8..9dcc3924a975 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -640,6 +640,11 @@ static struct fib6_node *fib6_add_1(struct net *net, if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); rt6_release(leaf); + /* remove null_entry in the root node */ + } else if (fn->fn_flags & RTN_TL_ROOT && + rcu_access_pointer(fn->leaf) == + net->ipv6.ip6_null_entry) { + RCU_INIT_POINTER(fn->leaf, NULL); } return fn; @@ -1270,13 +1275,17 @@ out: return err; failure: - /* fn->leaf could be NULL if fn is an intermediate node and we - * failed to add the new route to it in both subtree creation - * failure and fib6_add_rt2node() failure case. - * In both cases, fib6_repair_tree() should be called to fix - * fn->leaf. + /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: + * 1. fn is an intermediate node and we failed to add the new + * route to it in both subtree creation failure and fib6_add_rt2node() + * failure case. + * 2. fn is the root node in the table and we fail to add the first + * default route to it. */ - if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) + if (fn && + (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || + (fn->fn_flags & RTN_TL_ROOT && + !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); /* Always release dst as dst->__refcnt is guaranteed * to be taken before entering this function @@ -1531,6 +1540,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_walker *w; int iter = 0; + /* Set fn->leaf to null_entry for root node. */ + if (fn->fn_flags & RTN_TL_ROOT) { + rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry); + return fn; + } + for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); @@ -1685,10 +1700,15 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, } read_unlock(&net->ipv6.fib6_walker_lock); - /* If it was last route, expunge its radix tree node */ + /* If it was last route, call fib6_repair_tree() to: + * 1. For root node, put back null_entry as how the table was created. + * 2. For other nodes, expunge its radix tree node. + */ if (!rcu_access_pointer(fn->leaf)) { - fn->fn_flags &= ~RTN_RTINFO; - net->ipv6.rt6_stats->fib_route_nodes--; + if (!(fn->fn_flags & RTN_TL_ROOT)) { + fn->fn_flags &= ~RTN_RTINFO; + net->ipv6.rt6_stats->fib_route_nodes--; + } fn = fib6_repair_tree(net, table, fn); } -- cgit v1.2.3 From 290af86629b25ffd1ed6232c4e9107da031705cb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Jan 2018 10:04:29 -0800 Subject: bpf: introduce BPF_JIT_ALWAYS_ON config The BPF interpreter has been used as part of the spectre 2 attack CVE-2017-5715. A quote from goolge project zero blog: "At this point, it would normally be necessary to locate gadgets in the host kernel code that can be used to actually leak data by reading from an attacker-controlled location, shifting and masking the result appropriately and then using the result of that as offset to an attacker-controlled address for a load. But piecing gadgets together and figuring out which ones work in a speculation context seems annoying. So instead, we decided to use the eBPF interpreter, which is built into the host kernel - while there is no legitimate way to invoke it from inside a VM, the presence of the code in the host kernel's text section is sufficient to make it usable for the attack, just like with ordinary ROP gadgets." To make attacker job harder introduce BPF_JIT_ALWAYS_ON config option that removes interpreter from the kernel in favor of JIT-only mode. So far eBPF JIT is supported by: x64, arm64, arm32, sparc64, s390, powerpc64, mips64 The start of JITed program is randomized and code page is marked as read-only. In addition "constant blinding" can be turned on with net.core.bpf_jit_harden v2->v3: - move __bpf_prog_ret0 under ifdef (Daniel) v1->v2: - fix init order, test_bpf and cBPF (Daniel's feedback) - fix offloaded bpf (Jakub's feedback) - add 'return 0' dummy in case something can invoke prog->bpf_func - retarget bpf tree. For bpf-next the patch would need one extra hunk. It will be sent when the trees are merged back to net-next Considered doing: int bpf_jit_enable __read_mostly = BPF_EBPF_JIT_DEFAULT; but it seems better to land the patch as-is and in bpf-next remove bpf_jit_enable global variable from all JITs, consolidate in one place and remove this jit_init() function. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- init/Kconfig | 7 +++++++ kernel/bpf/core.c | 19 +++++++++++++++++++ lib/test_bpf.c | 11 +++++++---- net/core/filter.c | 6 ++---- net/core/sysctl_net_core.c | 6 ++++++ net/socket.c | 9 +++++++++ 6 files changed, 50 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/init/Kconfig b/init/Kconfig index 2934249fba46..5e2a4a391ba9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1392,6 +1392,13 @@ config BPF_SYSCALL Enable the bpf() system call that allows to manipulate eBPF programs and maps via file descriptors. +config BPF_JIT_ALWAYS_ON + bool "Permanently enable BPF JIT and remove BPF interpreter" + depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT + help + Enables BPF JIT and removes BPF interpreter to avoid + speculative execution of BPF instructions by the interpreter + config USERFAULTFD bool "Enable userfaultfd() system call" select ANON_INODES diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 86b50aa26ee8..51ec2dda7f08 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +#ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; +#else +static unsigned int __bpf_prog_ret0(const void *ctx, + const struct bpf_insn *insn) +{ + return 0; +} +#endif + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { +#ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else + fp->bpf_func = __bpf_prog_ret0; +#endif /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) */ if (!bpf_prog_is_dev_bound(fp->aux)) { fp = bpf_int_jit_compile(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) { + *err = -ENOTSUPP; + return fp; + } +#endif } else { *err = bpf_prog_offload_compile(fp); if (*err) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 9e9748089270..f369889e521d 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -6250,9 +6250,8 @@ static struct bpf_prog *generate_filter(int which, int *err) return NULL; } } - /* We don't expect to fail. */ if (*err) { - pr_cont("FAIL to attach err=%d len=%d\n", + pr_cont("FAIL to prog_create err=%d len=%d\n", *err, fprog.len); return NULL; } @@ -6276,6 +6275,10 @@ static struct bpf_prog *generate_filter(int which, int *err) * checks. */ fp = bpf_prog_select_runtime(fp, err); + if (*err) { + pr_cont("FAIL to select_runtime err=%d\n", *err); + return NULL; + } break; } @@ -6461,8 +6464,8 @@ static __init int test_bpf(void) pass_cnt++; continue; } - - return err; + err_cnt++; + continue; } pr_cont("jited:%u ", fp->jited); diff --git a/net/core/filter.c b/net/core/filter.c index 6a85e67fafce..d339ef170df6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1054,11 +1054,9 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) */ goto out_err_free; - /* We are guaranteed to never error here with cBPF to eBPF - * transitions, since there's no issue with type compatibility - * checks on program arrays. - */ fp = bpf_prog_select_runtime(fp, &err); + if (err) + goto out_err_free; kfree(old_prog); return fp; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cbc3dde4cfcc..a47ad6cd41c0 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -325,7 +325,13 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, +#ifndef CONFIG_BPF_JIT_ALWAYS_ON .proc_handler = proc_dointvec +#else + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, +#endif }, # ifdef CONFIG_HAVE_EBPF_JIT { diff --git a/net/socket.c b/net/socket.c index 05f361faec45..78acd6ce74c7 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2619,6 +2619,15 @@ out_fs: core_initcall(sock_init); /* early initcall */ +static int __init jit_init(void) +{ +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + bpf_jit_enable = 1; +#endif + return 0; +} +pure_initcall(jit_init); + #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { -- cgit v1.2.3 From 1e532d2b49645e7cb76d5af6cb5bc4ec93d861ae Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 10 Jan 2018 09:33:26 +0100 Subject: af_key: Fix memory leak in key_notify_policy. We leak the allocated out_skb in case pfkey_xfrm_policy2msg() fails. Fix this by freeing it on error. Reported-by: Dmitry Vyukov Signed-off-by: Steffen Klassert --- net/key/af_key.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index d40861a048fe..7e2e7188e7f4 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2202,8 +2202,10 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_ev return PTR_ERR(out_skb); err = pfkey_xfrm_policy2msg(out_skb, xp, dir); - if (err < 0) + if (err < 0) { + kfree_skb(out_skb); return err; + } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = PF_KEY_V2; -- cgit v1.2.3 From 76a4201191814a0061cb5c861fafb9ecaa764846 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 10 Jan 2018 12:14:28 +0100 Subject: xfrm: Fix a race in the xdst pcpu cache. We need to run xfrm_resolve_and_create_bundle() with bottom halves off. Otherwise we may reuse an already released dst_enty when the xfrm lookup functions are called from process context. Fixes: c30d78c14a813db39a647b6a348b428 ("xfrm: add xdst pcpu cache") Reported-by: Darius Ski Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index bc5eae12fb09..bd6b0e7a0ee4 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2063,8 +2063,11 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, if (num_xfrms <= 0) goto make_dummy_bundle; + local_bh_disable(); xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, - xflo->dst_orig); + xflo->dst_orig); + local_bh_enable(); + if (IS_ERR(xdst)) { err = PTR_ERR(xdst); if (err != -EAGAIN) @@ -2151,9 +2154,12 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, goto no_transform; } + local_bh_disable(); xdst = xfrm_resolve_and_create_bundle( pols, num_pols, fl, family, dst_orig); + local_bh_enable(); + if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); err = PTR_ERR(xdst); -- cgit v1.2.3 From 2e83acb970684008baee471427270c029a76ddbd Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 8 Jan 2018 19:02:27 -0200 Subject: sctp: GFP_ATOMIC is not needed in sctp_setsockopt_events So replace it with GFP_USER and also add __GFP_NOWARN. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b4fb6e4886d2..54c046783a89 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2277,7 +2277,7 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval, if (asoc && sctp_outq_is_empty(&asoc->outqueue)) { event = sctp_ulpevent_make_sender_dry_event(asoc, - GFP_ATOMIC); + GFP_USER | __GFP_NOWARN); if (!event) return -ENOMEM; -- cgit v1.2.3 From 5960cefab9df76600a1a7d4ff592c59e14616e88 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 8 Jan 2018 19:02:28 -0200 Subject: sctp: add a ceiling to optlen in some sockopts Hangbin Liu reported that some sockopt calls could cause the kernel to log a warning on memory allocation failure if the user supplied a large optlen value. That is because some of them called memdup_user() without a ceiling on optlen, allowing it to try to allocate really large buffers. This patch adds a ceiling by limiting optlen to the maximum allowed that would still make sense for these sockopt. Reported-by: Hangbin Liu Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 54c046783a89..022b94f11fd8 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3498,6 +3498,8 @@ static int sctp_setsockopt_hmac_ident(struct sock *sk, if (optlen < sizeof(struct sctp_hmacalgo)) return -EINVAL; + optlen = min_t(unsigned int, optlen, sizeof(struct sctp_hmacalgo) + + SCTP_AUTH_NUM_HMACS * sizeof(u16)); hmacs = memdup_user(optval, optlen); if (IS_ERR(hmacs)) @@ -3536,6 +3538,11 @@ static int sctp_setsockopt_auth_key(struct sock *sk, if (optlen <= sizeof(struct sctp_authkey)) return -EINVAL; + /* authkey->sca_keylength is u16, so optlen can't be bigger than + * this. + */ + optlen = min_t(unsigned int, optlen, USHRT_MAX + + sizeof(struct sctp_authkey)); authkey = memdup_user(optval, optlen); if (IS_ERR(authkey)) @@ -3893,6 +3900,9 @@ static int sctp_setsockopt_reset_streams(struct sock *sk, if (optlen < sizeof(*params)) return -EINVAL; + /* srs_number_streams is u16, so optlen can't be bigger than this. */ + optlen = min_t(unsigned int, optlen, USHRT_MAX + + sizeof(__u16) * sizeof(*params)); params = memdup_user(optval, optlen); if (IS_ERR(params)) -- cgit v1.2.3 From c76f97c99ae6d26d14c7f0e50e074382bfbc9f98 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 8 Jan 2018 19:02:29 -0200 Subject: sctp: make use of pre-calculated len Some sockopt handling functions were calculating the length of the buffer to be written to userspace and then calculating it again when actually writing the buffer, which could lead to some write not using an up-to-date length. This patch updates such places to just make use of the len variable. Also, replace some sizeof(type) to sizeof(var). Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 022b94f11fd8..9b01e994f661 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5025,7 +5025,7 @@ static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optv len = sizeof(int); if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval, &sctp_sk(sk)->autoclose, sizeof(int))) + if (copy_to_user(optval, &sctp_sk(sk)->autoclose, len)) return -EFAULT; return 0; } @@ -5655,6 +5655,9 @@ copy_getaddrs: err = -EFAULT; goto out; } + /* XXX: We should have accounted for sizeof(struct sctp_getaddrs) too, + * but we can't change it anymore. + */ if (put_user(bytes_copied, optlen)) err = -EFAULT; out: @@ -6091,7 +6094,7 @@ static int sctp_getsockopt_maxseg(struct sock *sk, int len, params.assoc_id = 0; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); - if (copy_from_user(¶ms, optval, sizeof(params))) + if (copy_from_user(¶ms, optval, len)) return -EFAULT; } else return -EINVAL; @@ -6261,7 +6264,9 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, if (len < sizeof(struct sctp_authkeyid)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(struct sctp_authkeyid))) + + len = sizeof(struct sctp_authkeyid); + if (copy_from_user(&val, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, val.scact_assoc_id); @@ -6273,7 +6278,6 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, else val.scact_keynumber = ep->active_key_id; - len = sizeof(struct sctp_authkeyid); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) @@ -6299,7 +6303,7 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, if (len < sizeof(struct sctp_authchunks)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks))) + if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; to = p->gauth_chunks; @@ -6344,7 +6348,7 @@ static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, if (len < sizeof(struct sctp_authchunks)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks))) + if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; to = p->gauth_chunks; -- cgit v1.2.3 From b0d55b5bc77755501be9de2c935d106ff8dba9ac Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Tue, 9 Jan 2018 19:58:18 +0800 Subject: caif_usb: use strlcpy() instead of strncpy() gcc-8 reports net/caif/caif_usb.c: In function 'cfusbl_device_notify': ./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may be truncated copying 15 bytes from a string of length 15 [-Wstringop-truncation] The compiler require that the input param 'len' of strncpy() should be greater than the length of the src string, so that '\0' is copied as well. We can just use strlcpy() to avoid this warning. Signed-off-by: Xiongfeng Wang Signed-off-by: David S. Miller --- net/caif/caif_usb.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 5cd44f001f64..1a082a946045 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -176,9 +176,7 @@ static int cfusbl_device_notify(struct notifier_block *me, unsigned long what, dev_add_pack(&caif_usb_type); pack_added = true; - strncpy(layer->name, dev->name, - sizeof(layer->name) - 1); - layer->name[sizeof(layer->name) - 1] = 0; + strlcpy(layer->name, dev->name, sizeof(layer->name)); return 0; } -- cgit v1.2.3 From 78bbb15f2239bc8e663aa20bbe1987c91a0b75f6 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 9 Jan 2018 13:40:41 -0800 Subject: 8021q: fix a memory leak for VLAN 0 device A vlan device with vid 0 is allow to creat by not able to be fully cleaned up by unregister_vlan_dev() which checks for vlan_id!=0. Also, VLAN 0 is probably not a valid number and it is kinda "reserved" for HW accelerating devices, but it is probably too late to reject it from creation even if makes sense. Instead, just remove the check in unregister_vlan_dev(). Reported-by: Dmitry Vyukov Fixes: ad1afb003939 ("vlan_dev: VLAN 0 should be treated as "no vlan tag" (802.1p packet)") Cc: Vlad Yasevich Cc: Ben Hutchings Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/8021q/vlan.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 8dfdd94e430f..bad01b14a4ad 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -111,12 +111,7 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) vlan_gvrp_uninit_applicant(real_dev); } - /* Take it out of our own structures, but be sure to interlock with - * HW accelerating devices or SW vlan input packet processing if - * VLAN is not 0 (leave it there for 802.1p). - */ - if (vlan_id) - vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); + vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); /* Get rid of the vlan's reference to real_dev */ dev_put(real_dev); -- cgit v1.2.3 From 862c03ee1deb7e19e0f9931682e0294ecd1fcaf9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 Jan 2018 03:45:49 -0800 Subject: ipv6: fix possible mem leaks in ipv6_make_skb() ip6_setup_cork() might return an error, while memory allocations have been done and must be rolled back. Fixes: 6422398c2ab0 ("ipv6: introduce ipv6_make_skb") Signed-off-by: Eric Dumazet Cc: Vlad Yasevich Reported-by: Mike Maloney Acked-by: Mike Maloney Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f7dd51c42314..688ba5f7516b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1735,9 +1735,10 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork.base.opt = NULL; v6_cork.opt = NULL; err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); - if (err) + if (err) { + ip6_cork_release(&cork, &v6_cork); return ERR_PTR(err); - + } if (ipc6->dontfrag < 0) ipc6->dontfrag = inet6_sk(sk)->dontfrag; -- cgit v1.2.3 From ccc12b11c5332c84442ef120dcd631523be75089 Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Wed, 10 Jan 2018 13:35:49 +0000 Subject: ipv6: sr: fix TLVs not being copied using setsockopt Function ipv6_push_rthdr4 allows to add an IPv6 Segment Routing Header to a socket through setsockopt, but the current implementation doesn't copy possible TLVs at the end of the SRH received from userspace. Therefore, the execution of the following branch if (sr_has_hmac(sr_phdr)) { ... } will never complete since the len and type fields of a possible HMAC TLV are not copied, hence seg6_get_tlv_hmac will return an error, and the HMAC will not be computed. This commit adds a memcpy in case TLVs have been appended to the SRH. Fixes: a149e7c7ce81 ("ipv6: sr: add support for SRH injection through setsockopt") Acked-by: David Lebrun Signed-off-by: Mathieu Xhonneux Signed-off-by: David S. Miller --- net/ipv6/exthdrs.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 83bd75713535..bc68eb661970 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -925,6 +925,15 @@ static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto, sr_phdr->segments[0] = **addr_p; *addr_p = &sr_ihdr->segments[sr_ihdr->segments_left]; + if (sr_ihdr->hdrlen > hops * 2) { + int tlvs_offset, tlvs_length; + + tlvs_offset = (1 + hops * 2) << 3; + tlvs_length = (sr_ihdr->hdrlen - hops * 2) << 3; + memcpy((char *)sr_phdr + tlvs_offset, + (char *)sr_ihdr + tlvs_offset, tlvs_length); + } + #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(sr_phdr)) { struct net *net = NULL; -- cgit v1.2.3 From ce4bb04cae8924792ed92f4af2793b77fc986f0e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Jan 2018 18:47:05 -0500 Subject: Fix a leak in socket(2) when we fail to allocate a file descriptor. Got broken by "make sock_alloc_file() do sock_release() on failures" - cleanup after sock_map_fd() failure got pulled all the way into sock_alloc_file(), but it used to serve the case when sock_map_fd() failed *before* getting to sock_alloc_file() as well, and that got lost. Trivial to fix, fortunately. Fixes: 8e1611e23579 (make sock_alloc_file() do sock_release() on failures) Reported-by: Dmitry Vyukov Signed-off-by: Al Viro --- net/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 42d8e9c9ccd5..82433a2200ec 100644 --- a/net/socket.c +++ b/net/socket.c @@ -432,8 +432,10 @@ static int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; int fd = get_unused_fd_flags(flags); - if (unlikely(fd < 0)) + if (unlikely(fd < 0)) { + sock_release(sock); return fd; + } newfile = sock_alloc_file(sock, flags, NULL); if (likely(!IS_ERR(newfile))) { -- cgit v1.2.3 From 68fda450a7df51cff9e5a4d4a4d9d0d5f2589153 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 12 Jan 2018 18:59:52 -0800 Subject: bpf: fix 32-bit divide by zero due to some JITs doing if (src_reg == 0) check in 64-bit mode for div/mod operations mask upper 32-bits of src register before doing the check Fixes: 622582786c9e ("net: filter: x86: internal BPF JIT") Fixes: 7a12b5031c6b ("sparc64: Add eBPF JIT.") Reported-by: syzbot+48340bb518e88849e2e3@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 18 ++++++++++++++++++ net/core/filter.c | 4 ++++ 2 files changed, 22 insertions(+) (limited to 'net') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 20eb04fd155e..b7448347e6b6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4445,6 +4445,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + /* due to JIT bugs clear upper 32-bits of src register + * before div/mod operation + */ + insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); + insn_buf[1] = *insn; + cnt = 2; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; diff --git a/net/core/filter.c b/net/core/filter.c index d339ef170df6..1c0eb436671f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -458,6 +458,10 @@ do_pass: convert_bpf_extensions(fp, &insn)) break; + if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) + *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); + *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; -- cgit v1.2.3 From a48a52b7bea81c046fe1c1288f84d0eba214cba0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Jan 2018 09:12:05 +0100 Subject: cfg80211: fully initialize old channel for event Paul reported that he got a report about undefined behaviour that seems to me to originate in using uninitialized memory when the channel structure here is used in the event code in nl80211 later. He never reported whether this fixed it, and I wasn't able to trigger this so far, but we should do the right thing and fully initialize the on-stack structure anyway. Reported-by: Paul Menzel Signed-off-by: Johannes Berg --- net/wireless/reg.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 78e71b0390be..7b42f0bacfd8 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1769,8 +1769,7 @@ static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx, if (wiphy->regulatory_flags & REGULATORY_DISABLE_BEACON_HINTS) return; - chan_before.center_freq = chan->center_freq; - chan_before.flags = chan->flags; + chan_before = *chan; if (chan->flags & IEEE80211_CHAN_NO_IR) { chan->flags &= ~IEEE80211_CHAN_NO_IR; -- cgit v1.2.3 From 7a94b8c2eee7083ddccd0515830f8c81a8e44b1a Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Mon, 15 Jan 2018 08:12:15 +0100 Subject: nl80211: take RCU read lock when calling ieee80211_bss_get_ie() As ieee80211_bss_get_ie() derefences an RCU to return ssid_ie, both the call to this function and any operation on this variable need protection by the RCU read lock. Fixes: 44905265bc15 ("nl80211: don't expose wdev->ssid for most interfaces") Signed-off-by: Dominik Brodowski Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2b3dbcd40e46..ed87a97fcb0b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2618,12 +2618,13 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag const u8 *ssid_ie; if (!wdev->current_bss) break; + rcu_read_lock(); ssid_ie = ieee80211_bss_get_ie(&wdev->current_bss->pub, WLAN_EID_SSID); - if (!ssid_ie) - break; - if (nla_put(msg, NL80211_ATTR_SSID, ssid_ie[1], ssid_ie + 2)) - goto nla_put_failure_locked; + if (ssid_ie && + nla_put(msg, NL80211_ATTR_SSID, ssid_ie[1], ssid_ie + 2)) + goto nla_put_failure_rcu_locked; + rcu_read_unlock(); break; } default: @@ -2635,6 +2636,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag genlmsg_end(msg, hdr); return 0; + nla_put_failure_rcu_locked: + rcu_read_unlock(); nla_put_failure_locked: wdev_unlock(wdev); nla_put_failure: -- cgit v1.2.3 From 51a1aaa631c90223888d8beac4d649dc11d2ca55 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Jan 2018 09:32:36 +0100 Subject: mac80211_hwsim: validate number of different channels When creating a new radio on the fly, hwsim allows this to be done with an arbitrary number of channels, but cfg80211 only supports a limited number of simultaneous channels, leading to a warning. Fix this by validating the number - this requires moving the define for the maximum out to a visible header file. Reported-by: syzbot+8dd9051ff19940290931@syzkaller.appspotmail.com Fixes: b59ec8dd4394 ("mac80211_hwsim: fix number of channels in interface combinations") Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 5 +++++ include/net/cfg80211.h | 2 ++ net/wireless/core.h | 2 -- 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index ccd573e53c92..f6d4a50f1bdb 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3121,6 +3121,11 @@ static int hwsim_new_radio_nl(struct sk_buff *msg, struct genl_info *info) if (info->attrs[HWSIM_ATTR_CHANNELS]) param.channels = nla_get_u32(info->attrs[HWSIM_ATTR_CHANNELS]); + if (param.channels > CFG80211_MAX_NUM_DIFFERENT_CHANNELS) { + GENL_SET_ERR_MSG(info, "too many channels specified"); + return -EINVAL; + } + if (info->attrs[HWSIM_ATTR_NO_VIF]) param.no_vif = true; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cb4d92b79cd9..fb94a8bd8ab5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -815,6 +815,8 @@ struct cfg80211_csa_settings { u8 count; }; +#define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10 + /** * struct iface_combination_params - input parameters for interface combinations * diff --git a/net/wireless/core.h b/net/wireless/core.h index d2f7e8b8a097..eaff636169c2 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -507,8 +507,6 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); -#define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10 - #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS #define CFG80211_DEV_WARN_ON(cond) WARN_ON(cond) #else -- cgit v1.2.3 From 59b179b48ce2a6076448a44531242ac2b3f6cef2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Jan 2018 09:58:27 +0100 Subject: cfg80211: check dev_set_name() return value syzbot reported a warning from rfkill_alloc(), and after a while I think that the reason is that it was doing fault injection and the dev_set_name() failed, leaving the name NULL, and we didn't check the return value and got to rfkill_alloc() with a NULL name. Since we really don't want a NULL name, we ought to check the return value. Fixes: fb28ad35906a ("net: struct device - replace bus_id with dev_name(), dev_set_name()") Reported-by: syzbot+1ddfb3357e1d7bb5b5d3@syzkaller.appspotmail.com Signed-off-by: Johannes Berg --- net/wireless/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index fdde0d98fde1..a6f3cac8c640 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -439,6 +439,8 @@ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv, if (rv) goto use_default_name; } else { + int rv; + use_default_name: /* NOTE: This is *probably* safe w/out holding rtnl because of * the restrictions on phy names. Probably this call could @@ -446,7 +448,11 @@ use_default_name: * phyX. But, might should add some locking and check return * value, and use a different name if this one exists? */ - dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx); + rv = dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx); + if (rv < 0) { + kfree(rdev); + return NULL; + } } INIT_LIST_HEAD(&rdev->wiphy.wdev_list); -- cgit v1.2.3 From d542296a4d0d9f41d0186edcac2baba1b674d02f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 8 Jan 2018 08:23:18 -0800 Subject: 9p: add missing module license for xen transport The 9P of Xen module is missing required license and module information. See https://bugzilla.kernel.org/show_bug.cgi?id=198109 Reported-by: Alan Bartlett Fixes: 868eb122739a ("xen/9pfs: introduce Xen 9pfs transport driver") Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/9p/trans_xen.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 325c56043007..086a4abdfa7c 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -543,3 +543,7 @@ static void p9_trans_xen_exit(void) return xenbus_unregister_driver(&xen_9pfs_front_driver); } module_exit(p9_trans_xen_exit); + +MODULE_AUTHOR("Stefano Stabellini "); +MODULE_DESCRIPTION("Xen Transport for 9P"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 749439bfac6e1a2932c582e2699f91d329658196 Mon Sep 17 00:00:00 2001 From: Mike Maloney Date: Wed, 10 Jan 2018 12:45:10 -0500 Subject: ipv6: fix udpv6 sendmsg crash caused by too small MTU The logic in __ip6_append_data() assumes that the MTU is at least large enough for the headers. A device's MTU may be adjusted after being added while sendmsg() is processing data, resulting in __ip6_append_data() seeing any MTU. For an mtu smaller than the size of the fragmentation header, the math results in a negative 'maxfraglen', which causes problems when refragmenting any previous skb in the skb_write_queue, leaving it possibly malformed. Instead sendmsg returns EINVAL when the mtu is calculated to be less than IPV6_MIN_MTU. Found by syzkaller: kernel BUG at ./include/linux/skbuff.h:2064! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 14216 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #2 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d0b68580 task.stack: ffff8801ac6b8000 RIP: 0010:__skb_pull include/linux/skbuff.h:2064 [inline] RIP: 0010:__ip6_make_skb+0x18cf/0x1f70 net/ipv6/ip6_output.c:1617 RSP: 0018:ffff8801ac6bf570 EFLAGS: 00010216 RAX: 0000000000010000 RBX: 0000000000000028 RCX: ffffc90003cce000 RDX: 00000000000001b8 RSI: ffffffff839df06f RDI: ffff8801d9478ca0 RBP: ffff8801ac6bf780 R08: ffff8801cc3f1dbc R09: 0000000000000000 R10: ffff8801ac6bf7a0 R11: 43cb4b7b1948a9e7 R12: ffff8801cc3f1dc8 R13: ffff8801cc3f1d40 R14: 0000000000001036 R15: dffffc0000000000 FS: 00007f43d740c700(0000) GS:ffff8801dc100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7834984000 CR3: 00000001d79b9000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ip6_finish_skb include/net/ipv6.h:911 [inline] udp_v6_push_pending_frames+0x255/0x390 net/ipv6/udp.c:1093 udpv6_sendmsg+0x280d/0x31a0 net/ipv6/udp.c:1363 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 SYSC_sendto+0x352/0x5a0 net/socket.c:1750 SyS_sendto+0x40/0x50 net/socket.c:1718 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4512e9 RSP: 002b:00007f43d740bc08 EFLAGS: 00000216 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00000000007180a8 RCX: 00000000004512e9 RDX: 000000000000002e RSI: 0000000020d08000 RDI: 0000000000000005 RBP: 0000000000000086 R08: 00000000209c1000 R09: 000000000000001c R10: 0000000000040800 R11: 0000000000000216 R12: 00000000004b9c69 R13: 00000000ffffffff R14: 0000000000000005 R15: 00000000202c2000 Code: 9e 01 fe e9 c5 e8 ff ff e8 7f 9e 01 fe e9 4a ea ff ff 48 89 f7 e8 52 9e 01 fe e9 aa eb ff ff e8 a8 b6 cf fd 0f 0b e8 a1 b6 cf fd <0f> 0b 49 8d 45 78 4d 8d 45 7c 48 89 85 78 fe ff ff 49 8d 85 ba RIP: __skb_pull include/linux/skbuff.h:2064 [inline] RSP: ffff8801ac6bf570 RIP: __ip6_make_skb+0x18cf/0x1f70 net/ipv6/ip6_output.c:1617 RSP: ffff8801ac6bf570 Reported-by: syzbot Signed-off-by: Mike Maloney Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 688ba5f7516b..8fe58a2d305c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1206,14 +1206,16 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, v6_cork->tclass = ipc6->tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(&rt->dst); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); else mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(rt->dst.path); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; } + if (mtu < IPV6_MIN_MTU) + return -EINVAL; cork->base.fragsize = mtu; if (dst_allfrag(rt->dst.path)) cork->base.flags |= IPCORK_ALLFRAG; -- cgit v1.2.3 From 59b36613e85fb16ebf9feaf914570879cd5c2a21 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 10 Jan 2018 12:50:25 -0800 Subject: tipc: fix a memory leak in tipc_nl_node_get_link() When tipc_node_find_by_name() fails, the nlmsg is not freed. While on it, switch to a goto label to properly free it. Fixes: be9c086715c ("tipc: narrow down exposure of struct tipc_node") Reported-by: Dmitry Vyukov Cc: Jon Maloy Cc: Ying Xue Signed-off-by: Cong Wang Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/node.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 507017fe0f1b..9036d8756e73 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1880,36 +1880,38 @@ int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) if (strcmp(name, tipc_bclink_name) == 0) { err = tipc_nl_add_bc_link(net, &msg); - if (err) { - nlmsg_free(msg.skb); - return err; - } + if (err) + goto err_free; } else { int bearer_id; struct tipc_node *node; struct tipc_link *link; node = tipc_node_find_by_name(net, name, &bearer_id); - if (!node) - return -EINVAL; + if (!node) { + err = -EINVAL; + goto err_free; + } tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { tipc_node_read_unlock(node); - nlmsg_free(msg.skb); - return -EINVAL; + err = -EINVAL; + goto err_free; } err = __tipc_nl_add_link(net, &msg, link, 0); tipc_node_read_unlock(node); - if (err) { - nlmsg_free(msg.skb); - return err; - } + if (err) + goto err_free; } return genlmsg_reply(msg.skb, info); + +err_free: + nlmsg_free(msg.skb); + return err; } int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) -- cgit v1.2.3 From cbbdf8433a5f117b1a2119ea30fc651b61ef7570 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 10 Jan 2018 13:00:39 -0800 Subject: netlink: extack needs to be reset each time through loop syzbot triggered the WARN_ON in netlink_ack testing the bad_attr value. The problem is that netlink_rcv_skb loops over the skb repeatedly invoking the callback and without resetting the extack leaving potentially stale data. Initializing each time through avoids the WARN_ON. Fixes: 2d4bc93368f5a ("netlink: extended ACK reporting") Reported-by: syzbot+315fa6766d0f7c359327@syzkaller.appspotmail.com Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 79cc1bf36e4a..47ef2d8683d6 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2384,7 +2384,7 @@ int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)) { - struct netlink_ext_ack extack = {}; + struct netlink_ext_ack extack; struct nlmsghdr *nlh; int err; @@ -2405,6 +2405,7 @@ int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; + memset(&extack, 0, sizeof(extack)); err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; -- cgit v1.2.3 From 6503a30440962f1e1ccb8868816b4e18201218d4 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Thu, 11 Jan 2018 18:36:26 +0900 Subject: net: ipv4: Make "ip route get" match iif lo rules again. Commit 3765d35ed8b9 ("net: ipv4: Convert inet_rtm_getroute to rcu versions of route lookup") broke "ip route get" in the presence of rules that specify iif lo. Host-originated traffic always has iif lo, because ip_route_output_key_hash and ip6_route_output_flags set the flow iif to LOOPBACK_IFINDEX. Thus, putting "iif lo" in an ip rule is a convenient way to select only originated traffic and not forwarded traffic. inet_rtm_getroute used to match these rules correctly because even though it sets the flow iif to 0, it called ip_route_output_key which overwrites iif with LOOPBACK_IFINDEX. But now that it calls ip_route_output_key_hash_rcu, the ifindex will remain 0 and not match the iif lo in the rule. As a result, "ip route get" will return ENETUNREACH. Fixes: 3765d35ed8b9 ("net: ipv4: Convert inet_rtm_getroute to rcu versions of route lookup") Tested: https://android.googlesource.com/kernel/tests/+/master/net/test/multinetwork_test.py passes again Signed-off-by: Lorenzo Colitti Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/route.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 43b69af242e1..4e153b23bcec 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2762,6 +2762,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { + fl4.flowi4_iif = LOOPBACK_IFINDEX; rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); err = 0; if (IS_ERR(rt)) -- cgit v1.2.3 From 37f47bc90c7481e7959703ad1defc4fc9f5d85e3 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 11 Jan 2018 14:22:06 -0200 Subject: sctp: avoid compiler warning on implicit fallthru These fall-through are expected. Signed-off-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 1 + net/sctp/outqueue.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 3b18085e3b10..5d4c15bf66d2 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -826,6 +826,7 @@ static int sctp_inet6_af_supported(sa_family_t family, struct sctp_sock *sp) case AF_INET: if (!__ipv6_only_sock(sctp_opt2sk(sp))) return 1; + /* fallthru */ default: return 0; } diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 7d67feeeffc1..c4ec99b20150 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -918,9 +918,9 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) break; case SCTP_CID_ABORT: - if (sctp_test_T_bit(chunk)) { + if (sctp_test_T_bit(chunk)) packet->vtag = asoc->c.my_vtag; - } + /* fallthru */ /* The following chunks are "response" chunks, i.e. * they are generated in response to something we * received. If we are sending these, then we can -- cgit v1.2.3 From 95ef498d977bf44ac094778fd448b98af158a3e6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2018 22:31:18 -0800 Subject: ipv6: ip6_make_skb() needs to clear cork.base.dst In my last patch, I missed fact that cork.base.dst was not initialized in ip6_make_skb() : If ip6_setup_cork() returns an error, we might attempt a dst_release() on some random pointer. Fixes: 862c03ee1deb ("ipv6: fix possible mem leaks in ipv6_make_skb()") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8fe58a2d305c..4f7d8de56611 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1735,6 +1735,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork.base.flags = 0; cork.base.addr = 0; cork.base.opt = NULL; + cork.base.dst = NULL; v6_cork.opt = NULL; err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); if (err) { -- cgit v1.2.3 From 30be8f8dba1bd2aff73e8447d59228471233a3d4 Mon Sep 17 00:00:00 2001 From: "r.hering@avm.de" Date: Fri, 12 Jan 2018 15:42:06 +0100 Subject: net/tls: Fix inverted error codes to avoid endless loop sendfile() calls can hang endless with using Kernel TLS if a socket error occurs. Socket error codes must be inverted by Kernel TLS before returning because they are stored with positive sign. If returned non-inverted they are interpreted as number of bytes sent, causing endless looping of the splice mechanic behind sendfile(). Signed-off-by: Robert Hering Signed-off-by: David S. Miller --- include/net/tls.h | 2 +- net/tls/tls_sw.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 936cfc5cab7d..9185e53a743c 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -170,7 +170,7 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) static inline void tls_err_abort(struct sock *sk) { - sk->sk_err = -EBADMSG; + sk->sk_err = EBADMSG; sk->sk_error_report(sk); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 73d19210dd49..9773571b6a34 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -391,7 +391,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) while (msg_data_left(msg)) { if (sk->sk_err) { - ret = sk->sk_err; + ret = -sk->sk_err; goto send_end; } @@ -544,7 +544,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, size_t copy, required_size; if (sk->sk_err) { - ret = sk->sk_err; + ret = -sk->sk_err; goto sendpage_end; } -- cgit v1.2.3 From 95a332088ecb113c2e8753fa3f1df9b0dda9beec Mon Sep 17 00:00:00 2001 From: William Tu Date: Fri, 12 Jan 2018 12:29:22 -0800 Subject: Revert "openvswitch: Add erspan tunnel support." This reverts commit ceaa001a170e43608854d5290a48064f57b565ed. The OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS attr should be designed as a nested attribute to support all ERSPAN v1 and v2's fields. The current attr is a be32 supporting only one field. Thus, this patch reverts it and later patch will redo it using nested attr. Signed-off-by: William Tu Cc: Jiri Benc Cc: Pravin Shelar Acked-by: Jiri Benc Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 - net/openvswitch/flow_netlink.c | 51 +--------------------------------------- 2 files changed, 1 insertion(+), 51 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 4265d7f9e1f2..dcfab5e3b55c 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -363,7 +363,6 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_IPV6_SRC, /* struct in6_addr src IPv6 address. */ OVS_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ OVS_TUNNEL_KEY_ATTR_PAD, - OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* be32 ERSPAN index. */ __OVS_TUNNEL_KEY_ATTR_MAX }; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 624ea74353dd..f143908b651d 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -49,7 +49,6 @@ #include #include #include -#include #include "flow_netlink.h" @@ -334,8 +333,7 @@ size_t ovs_tun_key_attr_size(void) * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. */ + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ - + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_DST */ - + nla_total_size(4); /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */ + + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ } static size_t ovs_nsh_key_attr_size(void) @@ -402,7 +400,6 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] .next = ovs_vxlan_ext_key_lens }, [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, - [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = sizeof(u32) }, }; static const struct ovs_len_tbl @@ -634,33 +631,6 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, return 0; } -static int erspan_tun_opt_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask, - bool log) -{ - unsigned long opt_key_offset; - struct erspan_metadata opts; - - BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); - - memset(&opts, 0, sizeof(opts)); - opts.index = nla_get_be32(attr); - - /* Index has only 20-bit */ - if (ntohl(opts.index) & ~INDEX_MASK) { - OVS_NLERR(log, "ERSPAN index number %x too large.", - ntohl(opts.index)); - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), is_mask); - opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts)); - SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts), - is_mask); - - return 0; -} - static int ip_tun_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool log) @@ -768,19 +738,6 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_PAD: break; - case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: - if (opts_type) { - OVS_NLERR(log, "Multiple metadata blocks provided"); - return -EINVAL; - } - - err = erspan_tun_opt_from_nlattr(a, match, is_mask, log); - if (err) - return err; - - tun_flags |= TUNNEL_ERSPAN_OPT; - opts_type = type; - break; default: OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); @@ -905,10 +862,6 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, else if (output->tun_flags & TUNNEL_VXLAN_OPT && vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) return -EMSGSIZE; - else if (output->tun_flags & TUNNEL_ERSPAN_OPT && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, - ((struct erspan_metadata *)tun_opts)->index)) - return -EMSGSIZE; } return 0; @@ -2533,8 +2486,6 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: break; - case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: - break; } }; -- cgit v1.2.3 From 096b9854c04df86f03b38a97d40b6506e5730919 Mon Sep 17 00:00:00 2001 From: Jim Westfall Date: Sun, 14 Jan 2018 04:18:50 -0800 Subject: net: Allow neigh contructor functions ability to modify the primary_key Use n->primary_key instead of pkey to account for the possibility that a neigh constructor function may have modified the primary_key value. Signed-off-by: Jim Westfall Signed-off-by: David S. Miller --- net/core/neighbour.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d1f5fe986edd..7f831711b6e0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -532,7 +532,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); - hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); + hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); @@ -544,7 +544,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, n1 != NULL; n1 = rcu_dereference_protected(n1->next, lockdep_is_held(&tbl->lock))) { - if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { + if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; -- cgit v1.2.3 From cd9ff4de0107c65d69d02253bb25d6db93c3dbc1 Mon Sep 17 00:00:00 2001 From: Jim Westfall Date: Sun, 14 Jan 2018 04:18:51 -0800 Subject: ipv4: Make neigh lookup keys for loopback/point-to-point devices be INADDR_ANY Map all lookup neigh keys to INADDR_ANY for loopback/point-to-point devices to avoid making an entry for every remote ip the device needs to talk to. This used the be the old behavior but became broken in a263b3093641f (ipv4: Make neigh lookups directly in output packet path) and later removed in 0bb4087cbec0 (ipv4: Fix neigh lookup keying over loopback/point-to-point devices) because it was broken. Signed-off-by: Jim Westfall Signed-off-by: David S. Miller --- include/net/arp.h | 3 +++ net/ipv4/arp.c | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/arp.h b/include/net/arp.h index dc8cd47f883b..977aabfcdc03 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -20,6 +20,9 @@ static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + key = INADDR_ANY; + return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index a8d7c5a9fb05..6c231b43974d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -223,11 +223,16 @@ static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) static int arp_constructor(struct neighbour *neigh) { - __be32 addr = *(__be32 *)neigh->primary_key; + __be32 addr; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; + u32 inaddr_any = INADDR_ANY; + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len); + + addr = *(__be32 *)neigh->primary_key; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { -- cgit v1.2.3 From 625637bf4afa45204bd87e4218645182a919485a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Jan 2018 17:01:19 +0800 Subject: sctp: reinit stream if stream outcnt has been change by sinit in sendmsg After introducing sctp_stream structure, sctp uses stream->outcnt as the out stream nums instead of c.sinit_num_ostreams. However when users use sinit in cmsg, it only updates c.sinit_num_ostreams in sctp_sendmsg. At that moment, stream->outcnt is still using previous value. If it's value is not updated, the sinit_num_ostreams of sinit could not really work. This patch is to fix it by updating stream->outcnt and reiniting stream if stream outcnt has been change by sinit in sendmsg. Fixes: a83863174a61 ("sctp: prepare asoc stream for stream reconf") Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9b01e994f661..15ae018b386f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1883,8 +1883,14 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) */ if (sinit) { if (sinit->sinit_num_ostreams) { - asoc->c.sinit_num_ostreams = - sinit->sinit_num_ostreams; + __u16 outcnt = sinit->sinit_num_ostreams; + + asoc->c.sinit_num_ostreams = outcnt; + /* outcnt has been changed, so re-init stream */ + err = sctp_stream_init(&asoc->stream, outcnt, 0, + GFP_KERNEL); + if (err) + goto out_free; } if (sinit->sinit_max_instreams) { asoc->c.sinit_max_instreams = -- cgit v1.2.3 From a0ff660058b88d12625a783ce9e5c1371c87951f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Jan 2018 17:01:36 +0800 Subject: sctp: return error if the asoc has been peeled off in sctp_wait_for_sndbuf After commit cea0cc80a677 ("sctp: use the right sk after waking up from wait_buf sleep"), it may change to lock another sk if the asoc has been peeled off in sctp_wait_for_sndbuf. However, the asoc's new sk could be already closed elsewhere, as it's in the sendmsg context of the old sk that can't avoid the new sk's closing. If the sk's last one refcnt is held by this asoc, later on after putting this asoc, the new sk will be freed, while under it's own lock. This patch is to revert that commit, but fix the old issue by returning error under the old sk's lock. Fixes: cea0cc80a677 ("sctp: use the right sk after waking up from wait_buf sleep") Reported-by: syzbot+ac6ea7baa4432811eb50@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 15ae018b386f..feb2ca69827a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -85,7 +85,7 @@ static int sctp_writeable(struct sock *sk); static void sctp_wfree(struct sk_buff *skb); static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len, struct sock **orig_sk); + size_t msg_len); static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); @@ -1977,7 +1977,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (!sctp_wspace(asoc)) { /* sk can be changed by peel off when waiting for buf. */ - err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len, &sk); + err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); if (err) { if (err == -ESRCH) { /* asoc is already dead. */ @@ -8022,12 +8022,12 @@ void sctp_sock_rfree(struct sk_buff *skb) /* Helper function to wait for space in the sndbuf. */ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len, struct sock **orig_sk) + size_t msg_len) { struct sock *sk = asoc->base.sk; - int err = 0; long current_timeo = *timeo_p; DEFINE_WAIT(wait); + int err = 0; pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc, *timeo_p, msg_len); @@ -8056,17 +8056,13 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); - if (sk != asoc->base.sk) { - release_sock(sk); - sk = asoc->base.sk; - lock_sock(sk); - } + if (sk != asoc->base.sk) + goto do_error; *timeo_p = current_timeo; } out: - *orig_sk = sk; finish_wait(&asoc->wait, &wait); /* Release the association's refcnt. */ -- cgit v1.2.3 From c5006b8aa74599ce19104b31d322d2ea9ff887cc Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Jan 2018 17:02:00 +0800 Subject: sctp: do not allow the v4 socket to bind a v4mapped v6 address The check in sctp_sockaddr_af is not robust enough to forbid binding a v4mapped v6 addr on a v4 socket. The worse thing is that v4 socket's bind_verify would not convert this v4mapped v6 addr to a v4 addr. syzbot even reported a crash as the v4 socket bound a v6 addr. This patch is to fix it by doing the common sa.sa_family check first, then AF_INET check for v4mapped v6 addrs. Fixes: 7dab83de50c7 ("sctp: Support ipv6only AF_INET6 sockets.") Reported-by: syzbot+7b7b518b1228d2743963@syzkaller.appspotmail.com Acked-by: Neil Horman Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index feb2ca69827a..039fcb618c34 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -335,16 +335,14 @@ static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, if (len < sizeof (struct sockaddr)) return NULL; + if (!opt->pf->af_supported(addr->sa.sa_family, opt)) + return NULL; + /* V4 mapped address are really of AF_INET family */ if (addr->sa.sa_family == AF_INET6 && - ipv6_addr_v4mapped(&addr->v6.sin6_addr)) { - if (!opt->pf->af_supported(AF_INET, opt)) - return NULL; - } else { - /* Does this PF support this AF? */ - if (!opt->pf->af_supported(addr->sa.sa_family, opt)) - return NULL; - } + ipv6_addr_v4mapped(&addr->v6.sin6_addr) && + !opt->pf->af_supported(AF_INET, opt)) + return NULL; /* If we get this far, af is valid. */ af = sctp_get_af_specific(addr->sa.sa_family); -- cgit v1.2.3 From 81d947e2b8dd2394586c3eaffdd2357797d3bf59 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 15 Jan 2018 23:12:09 +0100 Subject: net, sched: fix panic when updating miniq {b,q}stats While working on fixing another bug, I ran into the following panic on arm64 by simply attaching clsact qdisc, adding a filter and running traffic on ingress to it: [...] [ 178.188591] Unable to handle kernel read from unreadable memory at virtual address 810fb501f000 [ 178.197314] Mem abort info: [ 178.200121] ESR = 0x96000004 [ 178.203168] Exception class = DABT (current EL), IL = 32 bits [ 178.209095] SET = 0, FnV = 0 [ 178.212157] EA = 0, S1PTW = 0 [ 178.215288] Data abort info: [ 178.218175] ISV = 0, ISS = 0x00000004 [ 178.222019] CM = 0, WnR = 0 [ 178.224997] user pgtable: 4k pages, 48-bit VAs, pgd = 0000000023cb3f33 [ 178.231531] [0000810fb501f000] *pgd=0000000000000000 [ 178.236508] Internal error: Oops: 96000004 [#1] SMP [...] [ 178.311855] CPU: 73 PID: 2497 Comm: ping Tainted: G W 4.15.0-rc7+ #5 [ 178.319413] Hardware name: FOXCONN R2-1221R-A4/C2U4N_MB, BIOS G31FB18A 03/31/2017 [ 178.326887] pstate: 60400005 (nZCv daif +PAN -UAO) [ 178.331685] pc : __netif_receive_skb_core+0x49c/0xac8 [ 178.336728] lr : __netif_receive_skb+0x28/0x78 [ 178.341161] sp : ffff00002344b750 [ 178.344465] x29: ffff00002344b750 x28: ffff810fbdfd0580 [ 178.349769] x27: 0000000000000000 x26: ffff000009378000 [...] [ 178.418715] x1 : 0000000000000054 x0 : 0000000000000000 [ 178.424020] Process ping (pid: 2497, stack limit = 0x000000009f0a3ff4) [ 178.430537] Call trace: [ 178.432976] __netif_receive_skb_core+0x49c/0xac8 [ 178.437670] __netif_receive_skb+0x28/0x78 [ 178.441757] process_backlog+0x9c/0x160 [ 178.445584] net_rx_action+0x2f8/0x3f0 [...] Reason is that sch_ingress and sch_clsact are doing mini_qdisc_pair_init() which sets up miniq pointers to cpu_{b,q}stats from the underlying qdisc. Problem is that this cannot work since they are actually set up right after the qdisc ->init() callback in qdisc_create(), so first packet going into sch_handle_ingress() tries to call mini_qdisc_bstats_cpu_update() and we therefore panic. In order to fix this, allocation of {b,q}stats needs to happen before we call into ->init(). In net-next, there's already such option through commit d59f5ffa59d8 ("net: sched: a dflt qdisc may be used with per cpu stats"). However, the bug needs to be fixed in net still for 4.15. Thus, include these bits to reduce any merge churn and reuse the static_flags field to set TCQ_F_CPUSTATS, and remove the allocation from qdisc_create() since there is no other user left. Prashant Bhole ran into the same issue but for net-next, thus adding him below as well as co-author. Same issue was also reported by Sandipan Das when using bcc. Fixes: 46209401f8f6 ("net: core: introduce mini_Qdisc and eliminate usage of tp->q for clsact fastpath") Reference: https://lists.iovisor.org/pipermail/iovisor-dev/2018-January/001190.html Reported-by: Sandipan Das Co-authored-by: Prashant Bhole Co-authored-by: John Fastabend Signed-off-by: Daniel Borkmann Cc: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 ++ net/sched/sch_api.c | 15 +-------------- net/sched/sch_generic.c | 18 +++++++++++++++++- net/sched/sch_ingress.c | 19 ++++--------------- 4 files changed, 24 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 83a3e47d5845..becf86aa4ac6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -179,6 +179,7 @@ struct Qdisc_ops { const struct Qdisc_class_ops *cl_ops; char id[IFNAMSIZ]; int priv_size; + unsigned int static_flags; int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, @@ -444,6 +445,7 @@ void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, unsigned int n, unsigned int len); struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops); +void qdisc_free(struct Qdisc *qdisc); struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, u32 parentid); void __qdisc_calculate_pkt_len(struct sk_buff *skb, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 0f1eab99ff4e..52529b7f8d96 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1063,17 +1063,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev, } if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) { - if (qdisc_is_percpu_stats(sch)) { - sch->cpu_bstats = - netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); - if (!sch->cpu_bstats) - goto err_out4; - - sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); - if (!sch->cpu_qstats) - goto err_out4; - } - if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB]); if (IS_ERR(stab)) { @@ -1115,7 +1104,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, ops->destroy(sch); err_out3: dev_put(dev); - kfree((char *) sch - sch->padded); + qdisc_free(sch); err_out2: module_put(ops->owner); err_out: @@ -1123,8 +1112,6 @@ err_out: return NULL; err_out4: - free_percpu(sch->cpu_bstats); - free_percpu(sch->cpu_qstats); /* * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 661c7144b53a..cac003fddf3e 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -633,6 +633,19 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, qdisc_skb_head_init(&sch->q); spin_lock_init(&sch->q.lock); + if (ops->static_flags & TCQ_F_CPUSTATS) { + sch->cpu_bstats = + netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!sch->cpu_bstats) + goto errout1; + + sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!sch->cpu_qstats) { + free_percpu(sch->cpu_bstats); + goto errout1; + } + } + spin_lock_init(&sch->busylock); lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); @@ -642,6 +655,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, dev->qdisc_running_key ?: &qdisc_running_key); sch->ops = ops; + sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; @@ -649,6 +663,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, refcount_set(&sch->refcnt, 1); return sch; +errout1: + kfree(p); errout: return ERR_PTR(err); } @@ -698,7 +714,7 @@ void qdisc_reset(struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_reset); -static void qdisc_free(struct Qdisc *qdisc) +void qdisc_free(struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index fc1286f499c1..003e1b063447 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -66,7 +66,6 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { struct ingress_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - int err; net_inc_ingress_queue(); @@ -76,13 +75,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) q->block_info.chain_head_change = clsact_chain_head_change; q->block_info.chain_head_change_priv = &q->miniqp; - err = tcf_block_get_ext(&q->block, sch, &q->block_info); - if (err) - return err; - - sch->flags |= TCQ_F_CPUSTATS; - - return 0; + return tcf_block_get_ext(&q->block, sch, &q->block_info); } static void ingress_destroy(struct Qdisc *sch) @@ -121,6 +114,7 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", .priv_size = sizeof(struct ingress_sched_data), + .static_flags = TCQ_F_CPUSTATS, .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, @@ -192,13 +186,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) q->egress_block_info.chain_head_change = clsact_chain_head_change; q->egress_block_info.chain_head_change_priv = &q->miniqp_egress; - err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); - if (err) - return err; - - sch->flags |= TCQ_F_CPUSTATS; - - return 0; + return tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); } static void clsact_destroy(struct Qdisc *sch) @@ -225,6 +213,7 @@ static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { .cl_ops = &clsact_class_ops, .id = "clsact", .priv_size = sizeof(struct clsact_sched_data), + .static_flags = TCQ_F_CPUSTATS, .init = clsact_init, .destroy = clsact_destroy, .dump = ingress_dump, -- cgit v1.2.3 From d91c3e17f75f218022140dee18cf515292184a8f Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Tue, 16 Jan 2018 15:31:52 +0200 Subject: net/tls: Only attach to sockets in ESTABLISHED state Calling accept on a TCP socket with a TLS ulp attached results in two sockets that share the same ulp context. The ulp context is freed while a socket is destroyed, so after one of the sockets is released, the second second will trigger a use after free when it tries to access the ulp context attached to it. We restrict the TLS ulp to sockets in ESTABLISHED state to prevent the scenario above. Fixes: 3c4d7559159b ("tls: kernel TLS support") Reported-by: syzbot+904e7cd6c5c741609228@syzkaller.appspotmail.com Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- net/tls/tls_main.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index e07ee3ae0023..7b7a70e22d90 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -454,6 +454,15 @@ static int tls_init(struct sock *sk) struct tls_context *ctx; int rc = 0; + /* The TLS ulp is currently supported only for TCP sockets + * in ESTABLISHED state. + * Supporting sockets in LISTEN state will require us + * to modify the accept implementation to clone rather then + * share the ulp context. + */ + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTSUPP; + /* allocate tls context */ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) { -- cgit v1.2.3 From cf6d43ef66f416282121f436ce1bee9a25199d52 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 16 Jan 2018 16:04:26 +0100 Subject: tls: fix sw_ctx leak During setsockopt(SOL_TCP, TLS_TX), if initialization of the software context fails in tls_set_sw_offload(), we leak sw_ctx. We also don't reassign ctx->priv_ctx to NULL, so we can't even do another attempt to set it up on the same socket, as it will fail with -EEXIST. Fixes: 3c4d7559159b ('tls: kernel TLS support') Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9773571b6a34..61f394d369bf 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -681,18 +681,17 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) } default: rc = -EINVAL; - goto out; + goto free_priv; } ctx->prepend_size = TLS_HEADER_SIZE + nonce_size; ctx->tag_size = tag_size; ctx->overhead_size = ctx->prepend_size + ctx->tag_size; ctx->iv_size = iv_size; - ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - GFP_KERNEL); + ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, GFP_KERNEL); if (!ctx->iv) { rc = -ENOMEM; - goto out; + goto free_priv; } memcpy(ctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); memcpy(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); @@ -740,7 +739,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tag_size); if (!rc) - goto out; + return 0; free_aead: crypto_free_aead(sw_ctx->aead_send); @@ -751,6 +750,9 @@ free_rec_seq: free_iv: kfree(ctx->iv); ctx->iv = NULL; +free_priv: + kfree(ctx->priv_ctx); + ctx->priv_ctx = NULL; out: return rc; } -- cgit v1.2.3 From 877d17c79b66466942a836403773276e34fe3614 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 16 Jan 2018 16:04:27 +0100 Subject: tls: return -EBUSY if crypto_info is already set do_tls_setsockopt_tx returns 0 without doing anything when crypto_info is already set. Silent failure is confusing for users. Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/tls/tls_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 7b7a70e22d90..8e9cbfd21423 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -367,8 +367,10 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, crypto_info = &ctx->crypto_send; /* Currently we don't support set crypto info more than one time */ - if (TLS_CRYPTO_INFO_READY(crypto_info)) + if (TLS_CRYPTO_INFO_READY(crypto_info)) { + rc = -EBUSY; goto out; + } rc = copy_from_user(crypto_info, optval, sizeof(*crypto_info)); if (rc) { -- cgit v1.2.3 From 6db959c82eb039a151d95a0f8b7dea643657327a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 16 Jan 2018 16:04:28 +0100 Subject: tls: reset crypto_info when do_tls_setsockopt_tx fails The current code copies directly from userspace to ctx->crypto_send, but doesn't always reinitialize it to 0 on failure. This causes any subsequent attempt to use this setsockopt to fail because of the TLS_CRYPTO_INFO_READY check, eventhough crypto_info is not actually ready. This should result in a correctly set up socket after the 3rd call, but currently it does not: size_t s = sizeof(struct tls12_crypto_info_aes_gcm_128); struct tls12_crypto_info_aes_gcm_128 crypto_good = { .info.version = TLS_1_2_VERSION, .info.cipher_type = TLS_CIPHER_AES_GCM_128, }; struct tls12_crypto_info_aes_gcm_128 crypto_bad_type = crypto_good; crypto_bad_type.info.cipher_type = 42; setsockopt(sock, SOL_TLS, TLS_TX, &crypto_bad_type, s); setsockopt(sock, SOL_TLS, TLS_TX, &crypto_good, s - 1); setsockopt(sock, SOL_TLS, TLS_TX, &crypto_good, s); Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/tls/tls_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 8e9cbfd21423..736719c8314e 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -388,7 +388,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, case TLS_CIPHER_AES_GCM_128: { if (optlen != sizeof(struct tls12_crypto_info_aes_gcm_128)) { rc = -EINVAL; - goto out; + goto err_crypto_info; } rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), optlen - sizeof(*crypto_info)); @@ -400,7 +400,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, } default: rc = -EINVAL; - goto out; + goto err_crypto_info; } /* currently SW is default, we will have ethtool in future */ -- cgit v1.2.3 From ad9294dbc227cbc8e173b3b963e7dd9af5314f77 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 17 Jan 2018 22:36:49 +0100 Subject: bpf: fix cls_bpf on filter replace Running the following sequence is currently broken: # tc qdisc add dev foo clsact # tc filter replace dev foo ingress prio 1 handle 1 bpf da obj bar.o # tc filter replace dev foo ingress prio 1 handle 1 bpf da obj bar.o RTNETLINK answers: Invalid argument The normal expectation on kernel side is that the second command succeeds replacing the existing program. However, what happens is in cls_bpf_change(), we bail out with err in the second run in cls_bpf_offload(). The EINVAL comes directly in cls_bpf_offload() when comparing prog vs oldprog's gen_flags. In case of above replace the new prog's gen_flags are 0, but the old ones are 8, which means TCA_CLS_FLAGS_NOT_IN_HW is set (e.g. drivers not having cls_bpf offload). Fix 102740bd9436 ("cls_bpf: fix offload assumptions after callback conversion") in the following way: gen_flags from user space passed down via netlink cannot include status flags like TCA_CLS_FLAGS_IN_HW or TCA_CLS_FLAGS_NOT_IN_HW as opposed to oldprog that we previously loaded. Therefore, it doesn't make any sense to include them in the gen_flags comparison with the new prog before we even attempt to offload. Thus, lets fix this before 4.15 goes out. Fixes: 102740bd9436 ("cls_bpf: fix offload assumptions after callback conversion") Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 8d78e7f4ecc3..a62586e2dbdb 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -183,10 +183,17 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, return 0; } +static u32 cls_bpf_flags(u32 flags) +{ + return flags & CLS_BPF_SUPPORTED_GEN_FLAGS; +} + static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct cls_bpf_prog *oldprog) { - if (prog && oldprog && prog->gen_flags != oldprog->gen_flags) + if (prog && oldprog && + cls_bpf_flags(prog->gen_flags) != + cls_bpf_flags(oldprog->gen_flags)) return -EINVAL; if (prog && tc_skip_hw(prog->gen_flags)) -- cgit v1.2.3 From 8cb68751c115d176ec851ca56ecfbb411568c9e8 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 16 Jan 2018 19:30:14 +0100 Subject: can: af_can: can_rcv(): replace WARN_ONCE by pr_warn_once If an invalid CAN frame is received, from a driver or from a tun interface, a Kernel warning is generated. This patch replaces the WARN_ONCE by a simple pr_warn_once, so that a kernel, bootet with panic_on_warn, does not panic. A printk seems to be more appropriate here. Reported-by: syzbot+4386709c0c1284dca827@syzkaller.appspotmail.com Suggested-by: Dmitry Vyukov Acked-by: Oliver Hartkopp Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- net/can/af_can.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index 003b2d6d655f..ae835382e678 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -721,20 +721,16 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (WARN_ONCE(dev->type != ARPHRD_CAN || - skb->len != CAN_MTU || - cfd->len > CAN_MAX_DLEN, - "PF_CAN: dropped non conform CAN skbuf: " - "dev type %d, len %d, datalen %d\n", - dev->type, skb->len, cfd->len)) - goto drop; + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU || + cfd->len > CAN_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN skbuf: dev type %d, len %d, datalen %d\n", + dev->type, skb->len, cfd->len); + kfree_skb(skb); + return NET_RX_DROP; + } can_receive(skb, dev); return NET_RX_SUCCESS; - -drop: - kfree_skb(skb); - return NET_RX_DROP; } static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, -- cgit v1.2.3 From d4689846881d160a4d12a514e991a740bcb5d65a Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 16 Jan 2018 19:30:14 +0100 Subject: can: af_can: canfd_rcv(): replace WARN_ONCE by pr_warn_once If an invalid CANFD frame is received, from a driver or from a tun interface, a Kernel warning is generated. This patch replaces the WARN_ONCE by a simple pr_warn_once, so that a kernel, bootet with panic_on_warn, does not panic. A printk seems to be more appropriate here. Reported-by: syzbot+e3b775f40babeff6e68b@syzkaller.appspotmail.com Suggested-by: Dmitry Vyukov Acked-by: Oliver Hartkopp Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- net/can/af_can.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index ae835382e678..4d7f988a3130 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -738,20 +738,16 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (WARN_ONCE(dev->type != ARPHRD_CAN || - skb->len != CANFD_MTU || - cfd->len > CANFD_MAX_DLEN, - "PF_CAN: dropped non conform CAN FD skbuf: " - "dev type %d, len %d, datalen %d\n", - dev->type, skb->len, cfd->len)) - goto drop; + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU || + cfd->len > CANFD_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN FD skbuf: dev type %d, len %d, datalen %d\n", + dev->type, skb->len, cfd->len); + kfree_skb(skb); + return NET_RX_DROP; + } can_receive(skb, dev); return NET_RX_SUCCESS; - -drop: - kfree_skb(skb); - return NET_RX_DROP; } /* -- cgit v1.2.3 From cc01572e2fb080e279ca625f239aca61f435ebf3 Mon Sep 17 00:00:00 2001 From: Yossi Kuperman Date: Wed, 17 Jan 2018 15:52:41 +0200 Subject: xfrm: Add SA to hardware at the end of xfrm_state_construct() Current code configures the hardware with a new SA before the state has been fully initialized. During this time interval, an incoming ESP packet can cause a crash due to a NULL dereference. More specifically, xfrm_input() considers the packet as valid, and yet, anti-replay mechanism is not initialized. Move hardware configuration to the end of xfrm_state_construct(), and mark the state as valid once the SA is fully initialized. Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Aviad Yehezkel Signed-off-by: Aviv Heller Signed-off-by: Yossi Kuperman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 10 +++++++--- net/xfrm/xfrm_user.c | 18 +++++++++++------- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 429957412633..2d486492acdb 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2272,8 +2272,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) goto error; } - x->km.state = XFRM_STATE_VALID; - error: return err; } @@ -2282,7 +2280,13 @@ EXPORT_SYMBOL(__xfrm_init_state); int xfrm_init_state(struct xfrm_state *x) { - return __xfrm_init_state(x, true, false); + int err; + + err = __xfrm_init_state(x, true, false); + if (!err) + x->km.state = XFRM_STATE_VALID; + + return err; } EXPORT_SYMBOL(xfrm_init_state); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index bdb48e5dba04..7f52b8eb177d 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -598,13 +598,6 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; } - if (attrs[XFRMA_OFFLOAD_DEV]) { - err = xfrm_dev_state_add(net, x, - nla_data(attrs[XFRMA_OFFLOAD_DEV])); - if (err) - goto error; - } - if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, attrs[XFRMA_REPLAY_ESN_VAL]))) goto error; @@ -620,6 +613,14 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, /* override default values from above */ xfrm_update_ae_params(x, attrs, 0); + /* configure the hardware if offload is requested */ + if (attrs[XFRMA_OFFLOAD_DEV]) { + err = xfrm_dev_state_add(net, x, + nla_data(attrs[XFRMA_OFFLOAD_DEV])); + if (err) + goto error; + } + return x; error: @@ -662,6 +663,9 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } + if (x->km.state == XFRM_STATE_VOID) + x->km.state = XFRM_STATE_VALID; + c.seq = nlh->nlmsg_seq; c.portid = nlh->nlmsg_pid; c.event = nlh->nlmsg_type; -- cgit v1.2.3 From cd443f1e91ca600a092e780e8250cd6a2954b763 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 18 Jan 2018 14:48:03 +0800 Subject: netlink: reset extack earlier in netlink_rcv_skb Move up the extack reset/initialization in netlink_rcv_skb, so that those 'goto ack' will not skip it. Otherwise, later on netlink_ack may use the uninitialized extack and cause kernel crash. Fixes: cbbdf8433a5f ("netlink: extack needs to be reset each time through loop") Reported-by: syzbot+03bee3680a37466775e7@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: David Ahern Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 47ef2d8683d6..84a4e4c3be4b 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2391,6 +2391,7 @@ int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, while (skb->len >= nlmsg_total_size(0)) { int msglen; + memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; @@ -2405,7 +2406,6 @@ int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; - memset(&extack, 0, sizeof(extack)); err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; -- cgit v1.2.3 From 5762d7d3eda25c03cc2d9d45227be3f5ab6bec9e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 16 Jan 2018 23:20:22 +0100 Subject: cfg80211: fix station info handling bugs Fix two places where the structure isn't initialized to zero, and thus can't be filled properly by the driver. Fixes: 4a4b8169501b ("cfg80211: Accept multiple RSSI thresholds for CQM") Fixes: 9930380f0bd8 ("cfg80211: implement IWRATE") Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/wireless/nl80211.c | 2 +- net/wireless/wext-compat.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ed87a97fcb0b..542a4fc0a8d7 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9809,7 +9809,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, */ if (!wdev->cqm_config->last_rssi_event_value && wdev->current_bss && rdev->ops->get_station) { - struct station_info sinfo; + struct station_info sinfo = {}; u8 *mac_addr; mac_addr = wdev->current_bss->pub.bssid; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 7ca04a7de85a..05186a47878f 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -1254,8 +1254,7 @@ static int cfg80211_wext_giwrate(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - /* we are under RTNL - globally locked - so can use a static struct */ - static struct station_info sinfo; + struct station_info sinfo = {}; u8 addr[ETH_ALEN]; int err; -- cgit v1.2.3 From d0c081b49137cd3200f2023c0875723be66e7ce5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Jan 2018 14:21:13 -0800 Subject: flow_dissector: properly cap thoff field syzbot reported yet another crash [1] that is caused by insufficient validation of DODGY packets. Two bugs are happening here to trigger the crash. 1) Flow dissection leaves with incorrect thoff field. 2) skb_probe_transport_header() sets transport header to this invalid thoff, even if pointing after skb valid data. 3) qdisc_pkt_len_init() reads out-of-bound data because it trusts tcp_hdrlen(skb) Possible fixes : - Full flow dissector validation before injecting bad DODGY packets in the stack. This approach was attempted here : https://patchwork.ozlabs.org/patch/ 861874/ - Have more robust functions in the core. This might be needed anyway for stable versions. This patch fixes the flow dissection issue. [1] CPU: 1 PID: 3144 Comm: syzkaller271204 Not tainted 4.15.0-rc4-mm1+ #49 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:355 [inline] kasan_report+0x23b/0x360 mm/kasan/report.c:413 __asan_report_load2_noabort+0x14/0x20 mm/kasan/report.c:432 __tcp_hdrlen include/linux/tcp.h:35 [inline] tcp_hdrlen include/linux/tcp.h:40 [inline] qdisc_pkt_len_init net/core/dev.c:3160 [inline] __dev_queue_xmit+0x20d3/0x2200 net/core/dev.c:3465 dev_queue_xmit+0x17/0x20 net/core/dev.c:3554 packet_snd net/packet/af_packet.c:2943 [inline] packet_sendmsg+0x3ad5/0x60a0 net/packet/af_packet.c:2968 sock_sendmsg_nosec net/socket.c:628 [inline] sock_sendmsg+0xca/0x110 net/socket.c:638 sock_write_iter+0x31a/0x5d0 net/socket.c:907 call_write_iter include/linux/fs.h:1776 [inline] new_sync_write fs/read_write.c:469 [inline] __vfs_write+0x684/0x970 fs/read_write.c:482 vfs_write+0x189/0x510 fs/read_write.c:544 SYSC_write fs/read_write.c:589 [inline] SyS_write+0xef/0x220 fs/read_write.c:581 entry_SYSCALL_64_fastpath+0x1f/0x96 Fixes: 34fad54c2537 ("net: __skb_flow_dissect() must cap its return value") Fixes: a6e544b0a88b ("flow_dissector: Jump to exit code in __skb_flow_dissect") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Acked-by: Jason Wang Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 15ce30063765..544bddf08e13 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -976,8 +976,8 @@ ip_proto_again: out_good: ret = true; - key_control->thoff = (u16)nhoff; out: + key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; @@ -985,7 +985,6 @@ out: out_bad: ret = false; - key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); goto out; } EXPORT_SYMBOL(__skb_flow_dissect); -- cgit v1.2.3 From 128bb975dc3c25d00de04e503e2fe0a780d04459 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Thu, 18 Jan 2018 20:51:12 +0300 Subject: ip6_gre: init dev->mtu and dev->hard_header_len correctly Commit b05229f44228 ("gre6: Cleanup GREv6 transmit path, call common GRE functions") moved dev->mtu initialization from ip6gre_tunnel_setup() to ip6gre_tunnel_init(), as a result, the previously set values, before ndo_init(), are reset in the following cases: * rtnl_create_link() can update dev->mtu from IFLA_MTU parameter. * ip6gre_tnl_link_config() is invoked before ndo_init() in netlink and ioctl setup, so ndo_init() can reset MTU adjustments with the lower device MTU as well, dev->mtu and dev->hard_header_len. Not applicable for ip6gretap because it has one more call to ip6gre_tnl_link_config(tunnel, 1) in ip6gre_tap_init(). Fix the first case by updating dev->mtu with 'tb[IFLA_MTU]' parameter if a user sets it manually on a device creation, and fix the second one by moving ip6gre_tnl_link_config() call after register_netdevice(). Fixes: b05229f44228 ("gre6: Cleanup GREv6 transmit path, call common GRE functions") Fixes: db2ec95d1ba4 ("ip6_gre: Fix MTU setting") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 772695960890..873549228ccb 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -337,11 +337,12 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, nt->dev = dev; nt->net = dev_net(dev); - ip6gre_tnl_link_config(nt, 1); if (register_netdevice(dev) < 0) goto failed_free; + ip6gre_tnl_link_config(nt, 1); + /* Can use a lockless transmit, unless we generate output sequences */ if (!(nt->parms.o_flags & TUNNEL_SEQ)) dev->features |= NETIF_F_LLTX; @@ -1303,7 +1304,6 @@ static void ip6gre_netlink_parms(struct nlattr *data[], static int ip6gre_tap_init(struct net_device *dev) { - struct ip6_tnl *tunnel; int ret; ret = ip6gre_tunnel_init_common(dev); @@ -1312,10 +1312,6 @@ static int ip6gre_tap_init(struct net_device *dev) dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - tunnel = netdev_priv(dev); - - ip6gre_tnl_link_config(tunnel, 1); - return 0; } @@ -1408,12 +1404,16 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, nt->dev = dev; nt->net = dev_net(dev); - ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); err = register_netdevice(dev); if (err) goto out; + ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); + + if (tb[IFLA_MTU]) + ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); + dev_hold(dev); ip6gre_tunnel_link(ign, nt); -- cgit v1.2.3 From 591ff9ea51cec683e4cb378a3469228ba1d69010 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 18 Jan 2018 10:40:03 -0800 Subject: ipv6: don't let tb6_root node share routes with other node After commit 4512c43eac7e, if we add a route to the subtree of tb6_root which does not have any route attached to it yet, the current code will let tb6_root and the node in the subtree share the same route. This could cause problem cause tb6_root has RTN_INFO flag marked and the tree repair and clean up code will not work properly. This commit makes sure tb6_root->leaf points back to null_entry instead of sharing route with other node. It fixes the following syzkaller reported issue: BUG: KASAN: use-after-free in ipv6_prefix_equal include/net/ipv6.h:540 [inline] BUG: KASAN: use-after-free in fib6_add_1+0x165f/0x1790 net/ipv6/ip6_fib.c:618 Read of size 8 at addr ffff8801bc043498 by task syz-executor5/19819 CPU: 1 PID: 19819 Comm: syz-executor5 Not tainted 4.15.0-rc7+ #186 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x25b/0x340 mm/kasan/report.c:409 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430 ipv6_prefix_equal include/net/ipv6.h:540 [inline] fib6_add_1+0x165f/0x1790 net/ipv6/ip6_fib.c:618 fib6_add+0x5fa/0x1540 net/ipv6/ip6_fib.c:1214 __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1003 ip6_route_add+0x141/0x190 net/ipv6/route.c:2790 ipv6_route_ioctl+0x4db/0x6b0 net/ipv6/route.c:3299 inet6_ioctl+0xef/0x1e0 net/ipv6/af_inet6.c:520 sock_do_ioctl+0x65/0xb0 net/socket.c:958 sock_ioctl+0x2c2/0x440 net/socket.c:1055 vfs_ioctl fs/ioctl.c:46 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:686 SYSC_ioctl fs/ioctl.c:701 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692 entry_SYSCALL_64_fastpath+0x23/0x9a RIP: 0033:0x452ac9 RSP: 002b:00007fd42b321c58 EFLAGS: 00000212 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 000000000071bea0 RCX: 0000000000452ac9 RDX: 0000000020fd7000 RSI: 000000000000890b RDI: 0000000000000013 RBP: 000000000000049e R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000212 R12: 00000000006f4f70 R13: 00000000ffffffff R14: 00007fd42b3226d4 R15: 0000000000000000 Fixes: 4512c43eac7e ("ipv6: remove null_entry before adding default route") Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9dcc3924a975..217683d40f12 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1226,8 +1226,14 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } if (!rcu_access_pointer(fn->leaf)) { - atomic_inc(&rt->rt6i_ref); - rcu_assign_pointer(fn->leaf, rt); + if (fn->fn_flags & RTN_TL_ROOT) { + /* put back null_entry for root node */ + rcu_assign_pointer(fn->leaf, + info->nl_net->ipv6.ip6_null_entry); + } else { + atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(fn->leaf, rt); + } } fn = sn; } -- cgit v1.2.3 From aa5dd6fa6f5d4bdc82a67e952bba8ad2e98d77e2 Mon Sep 17 00:00:00 2001 From: Aviad Yehezkel Date: Thu, 18 Jan 2018 15:41:51 +0200 Subject: xfrm: fix error flow in case of add state fails If add state fails in case of device offload, netdev refcount will be negative since gc task is attempting to dev_free this state. This is fixed by putting NULL in state dev field. Signed-off-by: Aviad Yehezkel Signed-off-by: Boris Pismeny Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 30e5746085b8..ac9477189d1c 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -102,6 +102,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { + xso->dev = NULL; dev_put(dev); return err; } -- cgit v1.2.3 From b589513e6354a5fd6934823b7fd66bffad41137a Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 18 Jan 2018 13:11:07 -0800 Subject: rds: tcp: compute m_ack_seq as offset from ->write_seq rds-tcp uses m_ack_seq to track the tcp ack# that indicates that the peer has received a rds_message. The m_ack_seq is used in rds_tcp_is_acked() to figure out when it is safe to drop the rds_message from the RDS retransmit queue. The m_ack_seq must be calculated as an offset from the right edge of the in-flight tcp buffer, i.e., it should be based on the ->write_seq, not the ->snd_nxt. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/tcp.c | 5 +++-- net/rds/tcp.h | 2 +- net/rds/tcp_send.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 6b7ee71f40c6..ab7356e0ba83 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -90,9 +90,10 @@ void rds_tcp_nonagle(struct socket *sock) sizeof(val)); } -u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) +u32 rds_tcp_write_seq(struct rds_tcp_connection *tc) { - return tcp_sk(tc->t_sock->sk)->snd_nxt; + /* seq# of the last byte of data in tcp send buffer */ + return tcp_sk(tc->t_sock->sk)->write_seq; } u32 rds_tcp_snd_una(struct rds_tcp_connection *tc) diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 1aafbf7c3011..864ca7d8f019 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -54,7 +54,7 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_restore_callbacks(struct socket *sock, struct rds_tcp_connection *tc); -u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc); +u32 rds_tcp_write_seq(struct rds_tcp_connection *tc); u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq); extern struct rds_transport rds_tcp_transport; diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index dc860d1bb608..9b76e0fa1722 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -86,7 +86,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, * m_ack_seq is set to the sequence number of the last byte of * header and data. see rds_tcp_is_acked(). */ - tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc); + tc->t_last_sent_nxt = rds_tcp_write_seq(tc); rm->m_ack_seq = tc->t_last_sent_nxt + sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1; @@ -98,7 +98,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; rdsdebug("rm %p tcp nxt %u ack_seq %llu\n", - rm, rds_tcp_snd_nxt(tc), + rm, rds_tcp_write_seq(tc), (unsigned long long)rm->m_ack_seq); } -- cgit v1.2.3 From 7c68d1a6b4db9012790af7ac0f0fdc0d2083422a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Jan 2018 19:59:19 -0800 Subject: net: qdisc_pkt_len_init() should be more robust Without proper validation of DODGY packets, we might very well feed qdisc_pkt_len_init() with invalid GSO packets. tcp_hdrlen() might access out-of-bound data, so let's use skb_header_pointer() and proper checks. Whole story is described in commit d0c081b49137 ("flow_dissector: properly cap thoff field") We have the goal of validating DODGY packets earlier in the stack, so we might very well revert this fix in the future. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Jason Wang Reported-by: syzbot+9da69ebac7dddd804552@syzkaller.appspotmail.com Acked-by: Jason Wang Signed-off-by: David S. Miller --- net/core/dev.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 0e0ba36eeac9..613fb4066be7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3151,10 +3151,21 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) hdr_len = skb_transport_header(skb) - skb_mac_header(skb); /* + transport layer */ - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - hdr_len += tcp_hdrlen(skb); - else - hdr_len += sizeof(struct udphdr); + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + const struct tcphdr *th; + struct tcphdr _tcphdr; + + th = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_tcphdr), &_tcphdr); + if (likely(th)) + hdr_len += __tcp_hdrlen(th); + } else { + struct udphdr _udphdr; + + if (skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_udphdr), &_udphdr)) + hdr_len += sizeof(struct udphdr); + } if (shinfo->gso_type & SKB_GSO_DODGY) gso_segs = DIV_ROUND_UP(skb->len - hdr_len, -- cgit v1.2.3 From 121d57af308d0cf943f08f4738d24d3966c38cd9 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 19 Jan 2018 09:29:18 -0500 Subject: gso: validate gso_type in GSO handlers Validate gso_type during segmentation as SKB_GSO_DODGY sources may pass packets where the gso_type does not match the contents. Syzkaller was able to enter the SCTP gso handler with a packet of gso_type SKB_GSO_TCPV4. On entry of transport layer gso handlers, verify that the gso_type matches the transport protocol. Fixes: 90017accff61 ("sctp: Add GSO support") Link: http://lkml.kernel.org/r/<001a1137452496ffc305617e5fe0@google.com> Reported-by: syzbot+fee64147a25aecd48055@syzkaller.appspotmail.com Signed-off-by: Willem de Bruijn Acked-by: Jason Wang Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/ipv4/esp4_offload.c | 3 +++ net/ipv4/tcp_offload.c | 3 +++ net/ipv4/udp_offload.c | 3 +++ net/ipv6/esp6_offload.c | 3 +++ net/ipv6/tcpv6_offload.c | 3 +++ net/ipv6/udp_offload.c | 3 +++ net/sctp/offload.c | 3 +++ 7 files changed, 21 insertions(+) (limited to 'net') diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index b1338e576d00..29b333a62ab0 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -122,6 +122,9 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, if (!xo) goto out; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) + goto out; + seq = xo->seq.low; x = skb->sp->xvec[skb->sp->len - 1]; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index b6a2aa1dcf56..4d58e2ce0b5b 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -32,6 +32,9 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)) + return ERR_PTR(-EINVAL); + if (!pskb_may_pull(skb, sizeof(struct tcphdr))) return ERR_PTR(-EINVAL); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 01801b77bd0d..ea6e6e7df0ee 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -203,6 +203,9 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, goto out; } + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) + goto out; + if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index dd9627490c7c..f52c314d4c97 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -149,6 +149,9 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, if (!xo) goto out; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) + goto out; + seq = xo->seq.low; x = skb->sp->xvec[skb->sp->len - 1]; diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index d883c9204c01..278e49cd67d4 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -46,6 +46,9 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, { struct tcphdr *th; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)) + return ERR_PTR(-EINVAL); + if (!pskb_may_pull(skb, sizeof(*th))) return ERR_PTR(-EINVAL); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index a0f89ad76f9d..2a04dc9c781b 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -42,6 +42,9 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, const struct ipv6hdr *ipv6h; struct udphdr *uh; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) + goto out; + if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 275925b93b29..35bc7106d182 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -45,6 +45,9 @@ static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, struct sk_buff *segs = ERR_PTR(-EINVAL); struct sctphdr *sh; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP)) + goto out; + sh = sctp_hdr(skb); if (!pskb_may_pull(skb, sizeof(*sh))) goto out; -- cgit v1.2.3 From ad23b750933ea7bf962678972a286c78a8fa36aa Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 19 Jan 2018 11:50:46 +0100 Subject: net: igmp: fix source address check for IGMPv3 reports Commit "net: igmp: Use correct source address on IGMPv3 reports" introduced a check to validate the source address of locally generated IGMPv3 packets. Instead of checking the local interface address directly, it uses inet_ifa_match(fl4->saddr, ifa), which checks if the address is on the local subnet (or equal to the point-to-point address if used). This breaks for point-to-point interfaces, so check against ifa->ifa_local directly. Cc: Kevin Cernekee Fixes: a46182b00290 ("net: igmp: Use correct source address on IGMPv3 reports") Reported-by: Sebastian Gottschall Signed-off-by: Felix Fietkau Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 726f6b608274..2d49717a7421 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -332,7 +332,7 @@ static __be32 igmpv3_get_srcaddr(struct net_device *dev, return htonl(INADDR_ANY); for_ifa(in_dev) { - if (inet_ifa_match(fl4->saddr, ifa)) + if (fl4->saddr == ifa->ifa_local) return fl4->saddr; } endfor_ifa(in_dev); -- cgit v1.2.3 From 7a8c4dd9be91a7e8f8f0e0419a560663adc694a3 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Fri, 19 Jan 2018 12:30:13 -0800 Subject: tls: Correct length of scatterlist in tls_sw_sendpage The scatterlist is reused by both sendmsg and sendfile. If a sendmsg of smaller number of pages is followed by a sendfile of larger number of pages, the scatterlist may be too short, resulting in a crash in gcm_encrypt. Add sg_unmark_end to make the list the correct length. tls_sw_sendmsg already calls sg_unmark_end correctly when it allocates memory in alloc_sg, or in zerocopy_from_iter. Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 61f394d369bf..0a9b72fbd761 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -577,6 +577,8 @@ alloc_payload: get_page(page); sg = ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem; sg_set_page(sg, page, copy, offset); + sg_unmark_end(sg); + ctx->sg_plaintext_num_elem++; sk_mem_charge(sk, copy); -- cgit v1.2.3 From 5efec5c655dd31944af440ff2b0a93facc4a7762 Mon Sep 17 00:00:00 2001 From: Yossi Kuperman Date: Tue, 23 Jan 2018 00:16:21 +0200 Subject: xfrm: Fix eth_hdr(skb)->h_proto to reflect inner IP version IPSec tunnel mode supports encapsulation of IPv4 over IPv6 and vice-versa. The outer IP header is stripped and the inner IP inherits the original Ethernet header. Tcpdump fails to properly decode the inner packet in case that h_proto is different than the inner IP version. Fix h_proto to reflect the inner IP version. Signed-off-by: Yossi Kuperman Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_mode_tunnel.c | 1 + net/ipv6/xfrm6_mode_tunnel.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index e6265e2c274e..20ca486b3cad 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -92,6 +92,7 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) skb_reset_network_header(skb); skb_mac_header_rebuild(skb); + eth_hdr(skb)->h_proto = skb->protocol; err = 0; diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 02556e356f87..dc93002ff9d1 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -92,6 +92,7 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) skb_reset_network_header(skb); skb_mac_header_rebuild(skb); + eth_hdr(skb)->h_proto = skb->protocol; err = 0; -- cgit v1.2.3 From 545d8ae7affff7fb4f8bfd327c7c7790056535c4 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 22 Jan 2018 16:34:09 -0600 Subject: xfrm: fix boolean assignment in xfrm_get_type_offload Assign true or false to boolean variables instead of an integer value. This issue was detected with the help of Coccinelle. Fixes: ffdb5211da1c ("xfrm: Auto-load xfrm offload modules") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 2d486492acdb..a3785f538018 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -317,7 +317,7 @@ retry: if (!type && try_load) { request_module("xfrm-offload-%d-%d", family, proto); - try_load = 0; + try_load = false; goto retry; } -- cgit v1.2.3 From e9191ffb65d8e159680ce0ad2224e1acbde6985c Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 22 Jan 2018 20:06:42 +0000 Subject: ipv6: Fix getsockopt() for sockets with default IPV6_AUTOFLOWLABEL Commit 513674b5a2c9 ("net: reevalulate autoflowlabel setting after sysctl setting") removed the initialisation of ipv6_pinfo::autoflowlabel and added a second flag to indicate whether this field or the net namespace default should be used. The getsockopt() handling for this case was not updated, so it currently returns 0 for all sockets for which IPV6_AUTOFLOWLABEL is not explicitly enabled. Fix it to return the effective value, whether that has been set at the socket or net namespace level. Fixes: 513674b5a2c9 ("net: reevalulate autoflowlabel setting after sysctl ...") Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/net/ipv6.h | 1 + net/ipv6/ip6_output.c | 2 +- net/ipv6/ipv6_sockglue.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index f73797e2fa60..221238254eb7 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -331,6 +331,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); +bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 4f7d8de56611..3763dc01e374 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -166,7 +166,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } -static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) +bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) { if (!np->autoflowlabel_set) return ip6_default_np_autolabel(net); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 2d4680e0376f..e8ffb5b5d84e 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1336,7 +1336,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_AUTOFLOWLABEL: - val = np->autoflowlabel; + val = ip6_autoflowlabel(sock_net(sk), np); break; case IPV6_RECVFRAGSIZE: -- cgit v1.2.3 From 560a66075d694e6ec24c60967b4d93d97cbb33d1 Mon Sep 17 00:00:00 2001 From: Wolfgang Bumiller Date: Thu, 18 Jan 2018 11:32:35 +0100 Subject: net: sched: em_nbyte: don't add the data offset twice 'ptr' is shifted by the offset and then validated, the memcmp should not add it a second time. Signed-off-by: Wolfgang Bumiller Signed-off-by: David S. Miller --- net/sched/em_nbyte.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index df3110d69585..07c10bac06a0 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -51,7 +51,7 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em, if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len)) return 0; - return !memcmp(ptr + nbyte->hdr.off, nbyte->pattern, nbyte->hdr.len); + return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len); } static struct tcf_ematch_ops em_nbyte_ops = { -- cgit v1.2.3 From 581e7226a5d43f629eb6399a121f85f6a15f81be Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 24 Jan 2018 12:35:40 -0800 Subject: kcm: Only allow TCP sockets to be attached to a KCM mux TCP sockets for IPv4 and IPv6 that are not listeners or in closed stated are allowed to be attached to a KCM mux. Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Reported-by: syzbot+8865eaff7f9acd593945@syzkaller.appspotmail.com Signed-off-by: Tom Herbert Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index d4e98f20fc2a..7632797fb68e 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1387,8 +1387,13 @@ static int kcm_attach(struct socket *sock, struct socket *csock, if (!csk) return -EINVAL; - /* We must prevent loops or risk deadlock ! */ - if (csk->sk_family == PF_KCM) + /* Only allow TCP sockets to be attached for now */ + if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) || + csk->sk_protocol != IPPROTO_TCP) + return -EOPNOTSUPP; + + /* Don't allow listeners or closed sockets */ + if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) return -EOPNOTSUPP; psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); -- cgit v1.2.3 From e5571240236c5652f3e079b1d5866716a7ad819c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 24 Jan 2018 12:35:41 -0800 Subject: kcm: Check if sk_user_data already set in kcm_attach This is needed to prevent sk_user_data being overwritten. The check is done under the callback lock. This should prevent a socket from being attached twice to a KCM mux. It also prevents a socket from being attached for other use cases of sk_user_data as long as the other cases set sk_user_data under the lock. Followup work is needed to unify all the use cases of sk_user_data to use the same locking. Reported-by: syzbot+114b15f2be420a8886c3@syzkaller.appspotmail.com Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Signed-off-by: Tom Herbert Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 7632797fb68e..4a8d407f8902 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1410,9 +1410,18 @@ static int kcm_attach(struct socket *sock, struct socket *csock, return err; } - sock_hold(csk); - write_lock_bh(&csk->sk_callback_lock); + + /* Check if sk_user_data is aready by KCM or someone else. + * Must be done under lock to prevent race conditions. + */ + if (csk->sk_user_data) { + write_unlock_bh(&csk->sk_callback_lock); + strp_done(&psock->strp); + kmem_cache_free(kcm_psockp, psock); + return -EALREADY; + } + psock->save_data_ready = csk->sk_data_ready; psock->save_write_space = csk->sk_write_space; psock->save_state_change = csk->sk_state_change; @@ -1420,8 +1429,11 @@ static int kcm_attach(struct socket *sock, struct socket *csock, csk->sk_data_ready = psock_data_ready; csk->sk_write_space = psock_write_space; csk->sk_state_change = psock_state_change; + write_unlock_bh(&csk->sk_callback_lock); + sock_hold(csk); + /* Finished initialization, now add the psock to the MUX. */ spin_lock_bh(&mux->lock); head = &mux->psocks; -- cgit v1.2.3 From 4ee806d51176ba7b8ff1efd81f271d7252e03a1d Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 18 Jan 2018 16:14:26 -0500 Subject: net: tcp: close sock if net namespace is exiting When a tcp socket is closed, if it detects that its net namespace is exiting, close immediately and do not wait for FIN sequence. For normal sockets, a reference is taken to their net namespace, so it will never exit while the socket is open. However, kernel sockets do not take a reference to their net namespace, so it may begin exiting while the kernel socket is still open. In this case if the kernel socket is a tcp socket, it will stay open trying to complete its close sequence. The sock's dst(s) hold a reference to their interface, which are all transferred to the namespace's loopback interface when the real interfaces are taken down. When the namespace tries to take down its loopback interface, it hangs waiting for all references to the loopback interface to release, which results in messages like: unregister_netdevice: waiting for lo to become free. Usage count = 1 These messages continue until the socket finally times out and closes. Since the net namespace cleanup holds the net_mutex while calling its registered pernet callbacks, any new net namespace initialization is blocked until the current net namespace finishes exiting. After this change, the tcp socket notices the exiting net namespace, and closes immediately, releasing its dst(s) and their reference to the loopback interface, which lets the net namespace continue exiting. Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811 Signed-off-by: Dan Streetman Signed-off-by: David S. Miller --- include/net/net_namespace.h | 10 ++++++++++ net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_timer.c | 15 +++++++++++++++ 3 files changed, 28 insertions(+) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 10f99dafd5ac..049008493faf 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -223,6 +223,11 @@ int net_eq(const struct net *net1, const struct net *net2) return net1 == net2; } +static inline int check_net(const struct net *net) +{ + return atomic_read(&net->count) != 0; +} + void net_drop_ns(void *); #else @@ -247,6 +252,11 @@ int net_eq(const struct net *net1, const struct net *net2) return 1; } +static inline int check_net(const struct net *net) +{ + return 1; +} + #define net_drop_ns NULL #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f08eebe60446..8e053ad7cae2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2298,6 +2298,9 @@ adjudge_to_death: tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); + } else if (!check_net(sock_net(sk))) { + /* Not possible to send reset; just close */ + tcp_set_state(sk, TCP_CLOSE); } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 968fda198376..388158c9d9f6 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -48,11 +48,19 @@ static void tcp_write_err(struct sock *sk) * to prevent DoS attacks. It is called when a retransmission timeout * or zero probe timeout occurs on orphaned socket. * + * Also close if our net namespace is exiting; in that case there is no + * hope of ever communicating again since all netns interfaces are already + * down (or about to be down), and we need to release our dst references, + * which have been moved to the netns loopback interface, so the namespace + * can finish exiting. This condition is only possible if we are a kernel + * socket, as those do not hold references to the namespace. + * * Criteria is still not confirmed experimentally and may change. * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. * 2. If we have strong memory pressure. + * 3. If our net namespace is exiting. */ static int tcp_out_of_resources(struct sock *sk, bool do_reset) { @@ -81,6 +89,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; } + + if (!check_net(sock_net(sk))) { + /* Not possible to send reset; just close */ + tcp_done(sk); + return 1; + } + return 0; } -- cgit v1.2.3 From f15ca723c1ebe6c1a06bc95fda6b62cd87b44559 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 25 Jan 2018 19:03:03 +0100 Subject: net: don't call update_pmtu unconditionally Some dst_ops (e.g. md_dst_ops)) doesn't set this handler. It may result to: "BUG: unable to handle kernel NULL pointer dereference at (null)" Let's add a helper to check if update_pmtu is available before calling it. Fixes: 52a589d51f10 ("geneve: update skb dst pmtu on tx path") Fixes: a93bf0ff4490 ("vxlan: update skb dst pmtu on tx path") CC: Roman Kapl CC: Xin Long Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 3 +-- drivers/net/geneve.c | 4 ++-- drivers/net/vxlan.c | 6 ++---- include/net/dst.h | 8 ++++++++ net/ipv4/ip_tunnel.c | 3 +-- net/ipv4/ip_vti.c | 2 +- net/ipv6/ip6_tunnel.c | 6 ++---- net/ipv6/ip6_vti.c | 2 +- net/ipv6/sit.c | 4 ++-- 9 files changed, 20 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 2c13123bfd69..71ea9e26666c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1456,8 +1456,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, struct ipoib_dev_priv *priv = ipoib_priv(dev); int e = skb_queue_empty(&priv->cm.skb_queue); - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); skb_queue_tail(&priv->cm.skb_queue, skb); if (e) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 0a48b3073d3d..64fda2e1040e 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -829,7 +829,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, int mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr) - GENEVE_BASE_HLEN - info->options_len - 14; - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); } sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); @@ -875,7 +875,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, int mtu = dst_mtu(dst) - sizeof(struct ipv6hdr) - GENEVE_BASE_HLEN - info->options_len - 14; - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); } sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 31f4b7911ef8..c3e34e3c82a7 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2158,8 +2158,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (skb_dst(skb)) { int mtu = dst_mtu(ndst) - VXLAN_HEADROOM; - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, - skb, mtu); + skb_dst_update_pmtu(skb, mtu); } tos = ip_tunnel_ecn_encap(tos, old_iph, skb); @@ -2200,8 +2199,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (skb_dst(skb)) { int mtu = dst_mtu(ndst) - VXLAN6_HEADROOM; - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, - skb, mtu); + skb_dst_update_pmtu(skb, mtu); } tos = ip_tunnel_ecn_encap(tos, old_iph, skb); diff --git a/include/net/dst.h b/include/net/dst.h index b091fd536098..d49d607dd2b3 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -521,4 +521,12 @@ static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) } #endif +static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu) +{ + struct dst_entry *dst = skb_dst(skb); + + if (dst && dst->ops->update_pmtu) + dst->ops->update_pmtu(dst, NULL, skb, mtu); +} + #endif /* _NET_DST_H */ diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 5ddb1cb52bd4..6d21068f9b55 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -520,8 +520,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, else mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { if (!skb_is_gso(skb) && diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 949f432a5f04..51b1669334fe 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -200,7 +200,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, mtu = dst_mtu(dst); if (skb->len > mtu) { - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 9a7cf355bc8c..1ee5584c3555 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -642,8 +642,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rel_info > dst_mtu(skb_dst(skb2))) goto out; - skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, - rel_info); + skb_dst_update_pmtu(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); @@ -1131,8 +1130,7 @@ route_lookup: mtu = 576; } - if (skb_dst(skb) && !t->parms.collect_md) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index dbb74f3c57a7..8c184f84f353 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -483,7 +483,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) mtu = dst_mtu(dst); if (!skb->ignore_df && skb->len > mtu) { - skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index d7dc23c1b2ca..3873d3877135 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -934,8 +934,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, df = 0; } - if (tunnel->parms.iph.daddr && skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + if (tunnel->parms.iph.daddr) + skb_dst_update_pmtu(skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); -- cgit v1.2.3 From dd5684ecae3bd8e44b644f50e2c12c7e57fdfef5 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 26 Jan 2018 15:14:16 +0300 Subject: dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state ccid2_hc_tx_rto_expire() timer callback always restarts the timer again and can run indefinitely (unless it is stopped outside), and after commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time"), which moved ccid_hc_tx_delete() (also includes sk_stop_timer()) from dccp_destroy_sock() to sk_destruct(), this started to happen quite often. The timer prevents releasing the socket, as a result, sk_destruct() won't be called. Found with LTP/dccp_ipsec tests running on the bonding device, which later couldn't be unloaded after the tests were completed: unregister_netdevice: waiting for bond0 to become free. Usage count = 148 Fixes: 2a91aa396739 ("[DCCP] CCID2: Initial CCID2 (TCP-Like) implementation") Signed-off-by: Alexey Kodanev Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 1c75cd1255f6..92d016e87816 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -140,6 +140,9 @@ static void ccid2_hc_tx_rto_expire(struct timer_list *t) ccid2_pr_debug("RTO_EXPIRE\n"); + if (sk->sk_state == DCCP_CLOSED) + goto out; + /* back-off timer */ hc->tx_rto <<= 1; if (hc->tx_rto > DCCP_RTO_MAX) -- cgit v1.2.3 From ba3169fc7548759be986b168d662e0ba64c2fd88 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 26 Jan 2018 11:48:25 +0000 Subject: VSOCK: set POLLOUT | POLLWRNORM for TCP_CLOSING select(2) with wfds but no rfds must return when the socket is shut down by the peer. This way userspace notices socket activity and gets -EPIPE from the next write(2). Currently select(2) does not return for virtio-vsock when a SEND+RCV shutdown packet is received. This is because vsock_poll() only sets POLLOUT | POLLWRNORM for TCP_CLOSE, not the TCP_CLOSING state that the socket is in when the shutdown is received. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 5d28abf87fbf..c9473d698525 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -951,7 +951,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, * POLLOUT|POLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown. */ - if (sk->sk_state == TCP_CLOSE) { + if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= POLLOUT | POLLWRNORM; -- cgit v1.2.3