From c1ebcdb8c422cd73f54bcd2b9953e443a47667e5 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 20:08:59 -0700 Subject: [NET]: Remove obsolete fastroute stats. Remove last vestiages of fastroute code that is no longer used. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++-------- net/core/sysctl_net_core.c | 1 - 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index ab935778ce81..4f1ae2efe872 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2056,14 +2056,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", s->total, s->dropped, s->time_squeeze, s->throttled, - s->fastroute_hit, s->fastroute_success, s->fastroute_defer, - s->fastroute_deferred_out, -#if 0 - s->fastroute_latency_reduction -#else - s->cpu_collision -#endif - ); + 0, 0, 0, 0, /* was fastroute */ + s->cpu_collision ); return 0; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 880a88815211..76e9987474ca 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -18,7 +18,6 @@ extern int no_cong_thresh; extern int no_cong; extern int lo_cong; extern int mod_cong; -extern int netdev_fastroute; extern int net_msg_cost; extern int net_msg_burst; -- cgit v1.2.3 From 34008d8c631d067caffa136313260525f3ae48a2 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 20:10:00 -0700 Subject: [NET]: Remove obsolete netif_rx congestion sensing mechanism. Remove the congestion sensing mechanism from netif_rx, and always return either full or empty. Almost no driver checks the return value from netif_rx, and those that do only use it for debug messages. The original design of netif_rx was to do flow control based on the receive queue, but NAPI has supplanted this and no driver uses the feedback. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 -- net/core/dev.c | 88 +--------------------------------------------- net/core/sysctl_net_core.c | 36 ------------------- 3 files changed, 1 insertion(+), 125 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c2e15e381a58..718ad579c65c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -558,8 +558,6 @@ static inline int unregister_gifconf(unsigned int family) struct softnet_data { int throttle; - int cng_level; - int avg_blog; struct sk_buff_head input_pkt_queue; struct list_head poll_list; struct net_device *output_queue; diff --git a/net/core/dev.c b/net/core/dev.c index 4f1ae2efe872..3156df699f01 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -115,18 +115,6 @@ #endif /* CONFIG_NET_RADIO */ #include -/* This define, if set, will randomly drop a packet when congestion - * is more than moderate. It helps fairness in the multi-interface - * case when one of them is a hog, but it kills performance for the - * single interface case so it is off now by default. - */ -#undef RAND_LIE - -/* Setting this will sample the queue lengths and thus congestion - * via a timer instead of as each packet is received. - */ -#undef OFFLINE_SAMPLE - /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -159,11 +147,6 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[16]; /* 16 way hashed list */ static struct list_head ptype_all; /* Taps */ -#ifdef OFFLINE_SAMPLE -static void sample_queue(unsigned long dummy); -static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0); -#endif - /* * The @dev_base list is protected by @dev_base_lock and the rtln * semaphore. @@ -1365,69 +1348,10 @@ out: int netdev_max_backlog = 300; int weight_p = 64; /* old backlog weight */ -/* These numbers are selected based on intuition and some - * experimentatiom, if you have more scientific way of doing this - * please go ahead and fix things. - */ -int no_cong_thresh = 10; -int no_cong = 20; -int lo_cong = 100; -int mod_cong = 290; DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; -static void get_sample_stats(int cpu) -{ -#ifdef RAND_LIE - unsigned long rd; - int rq; -#endif - struct softnet_data *sd = &per_cpu(softnet_data, cpu); - int blog = sd->input_pkt_queue.qlen; - int avg_blog = sd->avg_blog; - - avg_blog = (avg_blog >> 1) + (blog >> 1); - - if (avg_blog > mod_cong) { - /* Above moderate congestion levels. */ - sd->cng_level = NET_RX_CN_HIGH; -#ifdef RAND_LIE - rd = net_random(); - rq = rd % netdev_max_backlog; - if (rq < avg_blog) /* unlucky bastard */ - sd->cng_level = NET_RX_DROP; -#endif - } else if (avg_blog > lo_cong) { - sd->cng_level = NET_RX_CN_MOD; -#ifdef RAND_LIE - rd = net_random(); - rq = rd % netdev_max_backlog; - if (rq < avg_blog) /* unlucky bastard */ - sd->cng_level = NET_RX_CN_HIGH; -#endif - } else if (avg_blog > no_cong) - sd->cng_level = NET_RX_CN_LOW; - else /* no congestion */ - sd->cng_level = NET_RX_SUCCESS; - - sd->avg_blog = avg_blog; -} - -#ifdef OFFLINE_SAMPLE -static void sample_queue(unsigned long dummy) -{ -/* 10 ms 0r 1ms -- i don't care -- JHS */ - int next_tick = 1; - int cpu = smp_processor_id(); - - get_sample_stats(cpu); - next_tick += jiffies; - mod_timer(&samp_timer, next_tick); -} -#endif - - /** * netif_rx - post buffer to the network code * @skb: buffer to post @@ -1476,11 +1400,8 @@ int netif_rx(struct sk_buff *skb) enqueue: dev_hold(skb->dev); __skb_queue_tail(&queue->input_pkt_queue, skb); -#ifndef OFFLINE_SAMPLE - get_sample_stats(this_cpu); -#endif local_irq_restore(flags); - return queue->cng_level; + return NET_RX_SUCCESS; } if (queue->throttle) @@ -3300,8 +3221,6 @@ static int __init net_dev_init(void) queue = &per_cpu(softnet_data, i); skb_queue_head_init(&queue->input_pkt_queue); queue->throttle = 0; - queue->cng_level = 0; - queue->avg_blog = 10; /* arbitrary non-zero */ queue->completion_queue = NULL; INIT_LIST_HEAD(&queue->poll_list); set_bit(__LINK_STATE_START, &queue->backlog_dev.state); @@ -3310,11 +3229,6 @@ static int __init net_dev_init(void) atomic_set(&queue->backlog_dev.refcnt, 1); } -#ifdef OFFLINE_SAMPLE - samp_timer.expires = jiffies + (10 * HZ); - add_timer(&samp_timer); -#endif - dev_boot_phase = 0; open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 76e9987474ca..fff63643a35c 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -14,10 +14,6 @@ extern int netdev_max_backlog; extern int weight_p; -extern int no_cong_thresh; -extern int no_cong; -extern int lo_cong; -extern int mod_cong; extern int net_msg_cost; extern int net_msg_burst; @@ -84,38 +80,6 @@ ctl_table core_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, - { - .ctl_name = NET_CORE_NO_CONG_THRESH, - .procname = "no_cong_thresh", - .data = &no_cong_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_CORE_NO_CONG, - .procname = "no_cong", - .data = &no_cong, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_CORE_LO_CONG, - .procname = "lo_cong", - .data = &lo_cong, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_CORE_MOD_CONG, - .procname = "mod_cong", - .data = &mod_cong, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, { .ctl_name = NET_CORE_MSG_COST, .procname = "message_cost", -- cgit v1.2.3 From 31aa02c53c84658f6694f319f09e232ede27be5a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 20:12:48 -0700 Subject: [NET]: Eliminate netif_rx massive packet drops. Eliminate the throttling behaviour when the netif receive queue fills because it behaves badly when using high speed networks under load. The throttling cause multiple packet drops that cause TCP to go into slow start mode. The same effective patch has been part of BIC TCP and H-TCP as well as part of Web100. The existing code drops 100's of packets when the queue fills; this changes it to individual packet drop-tail. Signed-off-by: Stephen Hemmminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +--- net/core/dev.c | 21 ++------------------- 2 files changed, 3 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 718ad579c65c..3a0ed7f9e801 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -164,7 +164,6 @@ struct netif_rx_stats unsigned total; unsigned dropped; unsigned time_squeeze; - unsigned throttled; unsigned cpu_collision; }; @@ -557,10 +556,9 @@ static inline int unregister_gifconf(unsigned int family) struct softnet_data { - int throttle; + struct net_device *output_queue; struct sk_buff_head input_pkt_queue; struct list_head poll_list; - struct net_device *output_queue; struct sk_buff *completion_queue; struct net_device backlog_dev; /* Sorry. 8) */ diff --git a/net/core/dev.c b/net/core/dev.c index 3156df699f01..1a64508e527f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -198,7 +198,7 @@ static struct notifier_block *netdev_chain; * Device drivers call our routines to queue packets here. We empty the * queue in the local softnet handler. */ -DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, }; +DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL }; #ifdef CONFIG_SYSFS extern int netdev_sysfs_init(void); @@ -1372,7 +1372,6 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; int netif_rx(struct sk_buff *skb) { - int this_cpu; struct softnet_data *queue; unsigned long flags; @@ -1388,15 +1387,11 @@ int netif_rx(struct sk_buff *skb) * short when CPU is congested, but is still operating. */ local_irq_save(flags); - this_cpu = smp_processor_id(); queue = &__get_cpu_var(softnet_data); __get_cpu_var(netdev_rx_stat).total++; if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { if (queue->input_pkt_queue.qlen) { - if (queue->throttle) - goto drop; - enqueue: dev_hold(skb->dev); __skb_queue_tail(&queue->input_pkt_queue, skb); @@ -1404,19 +1399,10 @@ enqueue: return NET_RX_SUCCESS; } - if (queue->throttle) - queue->throttle = 0; - netif_rx_schedule(&queue->backlog_dev); goto enqueue; } - if (!queue->throttle) { - queue->throttle = 1; - __get_cpu_var(netdev_rx_stat).throttled++; - } - -drop: __get_cpu_var(netdev_rx_stat).dropped++; local_irq_restore(flags); @@ -1701,8 +1687,6 @@ job_done: smp_mb__before_clear_bit(); netif_poll_enable(backlog_dev); - if (queue->throttle) - queue->throttle = 0; local_irq_enable(); return 0; } @@ -1976,7 +1960,7 @@ static int softnet_seq_show(struct seq_file *seq, void *v) struct netif_rx_stats *s = v; seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - s->total, s->dropped, s->time_squeeze, s->throttled, + s->total, s->dropped, s->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ s->cpu_collision ); return 0; @@ -3220,7 +3204,6 @@ static int __init net_dev_init(void) queue = &per_cpu(softnet_data, i); skb_queue_head_init(&queue->input_pkt_queue); - queue->throttle = 0; queue->completion_queue = NULL; INIT_LIST_HEAD(&queue->poll_list); set_bit(__LINK_STATE_START, &queue->backlog_dev.state); -- cgit v1.2.3 From 51b0bdedb8e784d0d969a6b77151911130812400 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 20:14:40 -0700 Subject: [NET]: Separate two usages of netdev_max_backlog. Separate out the two uses of netdev_max_backlog. One controls the upper bound on packets processed per softirq, the new name for this is netdev_budget; the other controls the limit on packets queued via netif_rx. Increase the max_backlog default to account for faster processors. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/sysctl.h | 1 + net/core/dev.c | 6 +++--- net/core/sysctl_net_core.c | 9 +++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 72965bfe6cfb..ebfe1250f0a4 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -243,6 +243,7 @@ enum NET_CORE_MOD_CONG=16, NET_CORE_DEV_WEIGHT=17, NET_CORE_SOMAXCONN=18, + NET_CORE_BUDGET=19, }; /* /proc/sys/net/ethernet */ diff --git a/net/core/dev.c b/net/core/dev.c index 1a64508e527f..7016e0c36b3d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1346,7 +1346,8 @@ out: Receiver routines =======================================================================*/ -int netdev_max_backlog = 300; +int netdev_max_backlog = 1000; +int netdev_budget = 300; int weight_p = 64; /* old backlog weight */ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; @@ -1695,8 +1696,7 @@ static void net_rx_action(struct softirq_action *h) { struct softnet_data *queue = &__get_cpu_var(softnet_data); unsigned long start_time = jiffies; - int budget = netdev_max_backlog; - + int budget = netdev_budget; local_irq_disable(); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index fff63643a35c..8f817ad9f546 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -13,6 +13,7 @@ #ifdef CONFIG_SYSCTL extern int netdev_max_backlog; +extern int netdev_budget; extern int weight_p; extern int net_msg_cost; extern int net_msg_burst; @@ -124,6 +125,14 @@ ctl_table core_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, + { + .ctl_name = NET_CORE_BUDGET, + .procname = "netdev_budget", + .data = &netdev_budget, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = 0 } }; -- cgit v1.2.3 From 5f8ef48d240963093451bcf83df89f1a1364f51d Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 20:37:36 -0700 Subject: [TCP]: Allow choosing TCP congestion control via sockopt. Allow using setsockopt to set TCP congestion control to use on a per socket basis. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/net/tcp.h | 3 ++- net/ipv4/tcp.c | 31 ++++++++++++++++++++++++++++++- net/ipv4/tcp_cong.c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 6 files changed, 79 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3ea75dd6640a..dfd93d03f5d2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -127,6 +127,7 @@ enum { #define TCP_WINDOW_CLAMP 10 /* Bound advertised window */ #define TCP_INFO 11 /* Information about this connection. */ #define TCP_QUICKACK 12 /* Block/reenable quick acks */ +#define TCP_CONGESTION 13 /* Congestion control algorithm */ #define TCPI_OPT_TIMESTAMPS 1 #define TCPI_OPT_SACK 2 diff --git a/include/net/tcp.h b/include/net/tcp.h index e427cf35915c..d04b21188ccb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1162,8 +1162,9 @@ extern void tcp_init_congestion_control(struct tcp_sock *tp); extern void tcp_cleanup_congestion_control(struct tcp_sock *tp); extern int tcp_set_default_congestion_control(const char *name); extern void tcp_get_default_congestion_control(char *name); +extern int tcp_set_congestion_control(struct tcp_sock *tp, const char *name); -extern struct tcp_congestion_ops tcp_reno; +extern struct tcp_congestion_ops tcp_init_congestion_ops; extern u32 tcp_reno_ssthresh(struct tcp_sock *tp); extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, int flag); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f3dbc8dc1263..882436da9a3a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1927,6 +1927,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, return tp->af_specific->setsockopt(sk, level, optname, optval, optlen); + /* This is a string value all the others are int's */ + if (optname == TCP_CONGESTION) { + char name[TCP_CA_NAME_MAX]; + + if (optlen < 1) + return -EINVAL; + + val = strncpy_from_user(name, optval, + min(TCP_CA_NAME_MAX-1, optlen)); + if (val < 0) + return -EFAULT; + name[val] = 0; + + lock_sock(sk); + err = tcp_set_congestion_control(tp, name); + release_sock(sk); + return err; + } + if (optlen < sizeof(int)) return -EINVAL; @@ -2211,6 +2230,16 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, case TCP_QUICKACK: val = !tp->ack.pingpong; break; + + case TCP_CONGESTION: + if (get_user(len, optlen)) + return -EFAULT; + len = min_t(unsigned int, len, TCP_CA_NAME_MAX); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, tp->ca_ops->name, len)) + return -EFAULT; + return 0; default: return -ENOPROTOOPT; }; @@ -2224,7 +2253,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, extern void __skb_cb_too_small_for_tcp(int, int); -extern void tcpdiag_init(void); +extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; static int __init set_thash_entries(char *str) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 665394a63ae4..4970d10a7785 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -21,7 +21,7 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; - list_for_each_entry(e, &tcp_cong_list, list) { + list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (strcmp(e->name, name) == 0) return e; } @@ -77,6 +77,9 @@ void tcp_init_congestion_control(struct tcp_sock *tp) { struct tcp_congestion_ops *ca; + if (tp->ca_ops != &tcp_init_congestion_ops) + return; + rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { if (try_module_get(ca->owner)) { @@ -139,6 +142,34 @@ void tcp_get_default_congestion_control(char *name) rcu_read_unlock(); } +/* Change congestion control for socket */ +int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) +{ + struct tcp_congestion_ops *ca; + int err = 0; + + rcu_read_lock(); + ca = tcp_ca_find(name); + if (ca == tp->ca_ops) + goto out; + + if (!ca) + err = -ENOENT; + + else if (!try_module_get(ca->owner)) + err = -EBUSY; + + else { + tcp_cleanup_congestion_control(tp); + tp->ca_ops = ca; + if (tp->ca_ops->init) + tp->ca_ops->init(tp); + } + out: + rcu_read_unlock(); + return err; +} + /* * TCP Reno congestion control * This is special case used for fallback as well. @@ -192,4 +223,15 @@ struct tcp_congestion_ops tcp_reno = { .min_cwnd = tcp_reno_min_cwnd, }; -EXPORT_SYMBOL_GPL(tcp_reno); +/* Initial congestion control used (until SYN) + * really reno under another name so we can tell difference + * during tcp_set_default_congestion_control + */ +struct tcp_congestion_ops tcp_init_congestion_ops = { + .name = "", + .owner = THIS_MODULE, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, +}; +EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9122814c13ad..ebf112347a97 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2048,7 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk) tp->mss_cache_std = tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; - tp->ca_ops = &tcp_reno; + tp->ca_ops = &tcp_init_congestion_ops; sk->sk_state = TCP_CLOSE; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index fce56039b0e9..9dac7fdf4726 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_state = TCP_CLOSE; tp->af_specific = &ipv6_specific; - tp->ca_ops = &tcp_reno; + tp->ca_ops = &tcp_init_congestion_ops; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); -- cgit v1.2.3 From 677e90eda3bd8cfde0b748daaa46476162a03950 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 23 Jun 2005 20:59:51 -0700 Subject: [NET]: Zerocopy sequential reading of skb data Implements sequential reading for both linear and non-linear skb data at zerocopy cost. The data is returned in chunks of arbitary length, therefore random access is not possible. Usage: from := 0 to := 128 state := undef data := undef len := undef consumed := 0 skb_prepare_seq_read(skb, from, to, &state) while (len = skb_seq_read(consumed, &data, &state)) != 0 do /* do something with 'data' of length 'len' */ if abort then /* abort read if we don't wait for * skb_seq_read() to return 0 */ skb_abort_seq_read(&state) return endif /* not necessary to consume all of 'len' */ consumed += len done Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/skbuff.h | 18 ++++++++ net/core/skbuff.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d7c839a21842..171a37dff83a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -321,6 +321,24 @@ extern void skb_over_panic(struct sk_buff *skb, int len, extern void skb_under_panic(struct sk_buff *skb, int len, void *here); +struct skb_seq_state +{ + __u32 lower_offset; + __u32 upper_offset; + __u32 frag_idx; + __u32 stepped_offset; + struct sk_buff *root_skb; + struct sk_buff *cur_skb; + __u8 *frag_data; +}; + +extern void skb_prepare_seq_read(struct sk_buff *skb, + unsigned int from, unsigned int to, + struct skb_seq_state *st); +extern unsigned int skb_seq_read(unsigned int consumed, const u8 **data, + struct skb_seq_state *st); +extern void skb_abort_seq_read(struct skb_seq_state *st); + /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)((SKB)->end)) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6d68c03bc051..d285f2f7e812 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1500,6 +1500,120 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) skb_split_no_header(skb, skb1, len, pos); } +/** + * skb_prepare_seq_read - Prepare a sequential read of skb data + * @skb: the buffer to read + * @from: lower offset of data to be read + * @to: upper offset of data to be read + * @st: state variable + * + * Initializes the specified state variable. Must be called before + * invoking skb_seq_read() for the first time. + */ +void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, + unsigned int to, struct skb_seq_state *st) +{ + st->lower_offset = from; + st->upper_offset = to; + st->root_skb = st->cur_skb = skb; + st->frag_idx = st->stepped_offset = 0; + st->frag_data = NULL; +} + +/** + * skb_seq_read - Sequentially read skb data + * @consumed: number of bytes consumed by the caller so far + * @data: destination pointer for data to be returned + * @st: state variable + * + * Reads a block of skb data at &consumed relative to the + * lower offset specified to skb_prepare_seq_read(). Assigns + * the head of the data block to &data and returns the length + * of the block or 0 if the end of the skb data or the upper + * offset has been reached. + * + * The caller is not required to consume all of the data + * returned, i.e. &consumed is typically set to the number + * of bytes already consumed and the next call to + * skb_seq_read() will return the remaining part of the block. + * + * Note: The size of each block of data returned can be arbitary, + * this limitation is the cost for zerocopy seqeuental + * reads of potentially non linear data. + * + * Note: Fragment lists within fragments are not implemented + * at the moment, state->root_skb could be replaced with + * a stack for this purpose. + */ +unsigned int skb_seq_read(unsigned int consumed, const u8 **data, + struct skb_seq_state *st) +{ + unsigned int block_limit, abs_offset = consumed + st->lower_offset; + skb_frag_t *frag; + + if (unlikely(abs_offset >= st->upper_offset)) + return 0; + +next_skb: + block_limit = skb_headlen(st->cur_skb); + + if (abs_offset < block_limit) { + *data = st->cur_skb->data + abs_offset; + return block_limit - abs_offset; + } + + if (st->frag_idx == 0 && !st->frag_data) + st->stepped_offset += skb_headlen(st->cur_skb); + + while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; + block_limit = frag->size + st->stepped_offset; + + if (abs_offset < block_limit) { + if (!st->frag_data) + st->frag_data = kmap_skb_frag(frag); + + *data = (u8 *) st->frag_data + frag->page_offset + + (abs_offset - st->stepped_offset); + + return block_limit - abs_offset; + } + + if (st->frag_data) { + kunmap_skb_frag(st->frag_data); + st->frag_data = NULL; + } + + st->frag_idx++; + st->stepped_offset += frag->size; + } + + if (st->cur_skb->next) { + st->cur_skb = st->cur_skb->next; + st->frag_idx = 0; + goto next_skb; + } else if (st->root_skb == st->cur_skb && + skb_shinfo(st->root_skb)->frag_list) { + st->cur_skb = skb_shinfo(st->root_skb)->frag_list; + goto next_skb; + } + + return 0; +} + +/** + * skb_abort_seq_read - Abort a sequential read of skb data + * @st: state variable + * + * Must be called if skb_seq_read() was not called until it + * returned 0. + */ +void skb_abort_seq_read(struct skb_seq_state *st) +{ + if (st->frag_data) + kunmap_skb_frag(st->frag_data); +} + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create("skbuff_head_cache", @@ -1538,3 +1652,6 @@ EXPORT_SYMBOL(skb_queue_tail); EXPORT_SYMBOL(skb_unlink); EXPORT_SYMBOL(skb_append); EXPORT_SYMBOL(skb_split); +EXPORT_SYMBOL(skb_prepare_seq_read); +EXPORT_SYMBOL(skb_seq_read); +EXPORT_SYMBOL(skb_abort_seq_read); -- cgit v1.2.3 From 3fc7e8a6d842f72d16d2623b1022814a635ab961 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 23 Jun 2005 21:00:17 -0700 Subject: [NET]: skb_find_text() - Find a text pattern in skb data Finds a pattern in the skb data according to the specified textsearch configuration. Use textsearch_next() to retrieve subsequent occurrences of the pattern. Returns the offset to the first occurrence or UINT_MAX if no match was found. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ net/core/skbuff.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 171a37dff83a..416a2e4024b2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #define HAVE_ALLOC_SKB /* For the drivers to know */ @@ -339,6 +340,10 @@ extern unsigned int skb_seq_read(unsigned int consumed, const u8 **data, struct skb_seq_state *st); extern void skb_abort_seq_read(struct skb_seq_state *st); +extern unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, + unsigned int to, struct ts_config *config, + struct ts_state *state); + /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)((SKB)->end)) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d285f2f7e812..bb73b2190ec7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1614,6 +1614,45 @@ void skb_abort_seq_read(struct skb_seq_state *st) kunmap_skb_frag(st->frag_data); } +#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) + +static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, + struct ts_config *conf, + struct ts_state *state) +{ + return skb_seq_read(offset, text, TS_SKB_CB(state)); +} + +static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) +{ + skb_abort_seq_read(TS_SKB_CB(state)); +} + +/** + * skb_find_text - Find a text pattern in skb data + * @skb: the buffer to look in + * @from: search offset + * @to: search limit + * @config: textsearch configuration + * @state: uninitialized textsearch state variable + * + * Finds a pattern in the skb data according to the specified + * textsearch configuration. Use textsearch_next() to retrieve + * subsequent occurrences of the pattern. Returns the offset + * to the first occurrence or UINT_MAX if no match was found. + */ +unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, + unsigned int to, struct ts_config *config, + struct ts_state *state) +{ + config->get_next_block = skb_ts_get_next_block; + config->finish = skb_ts_finish; + + skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state)); + + return textsearch_find(config, state); +} + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create("skbuff_head_cache", @@ -1655,3 +1694,4 @@ EXPORT_SYMBOL(skb_split); EXPORT_SYMBOL(skb_prepare_seq_read); EXPORT_SYMBOL(skb_seq_read); EXPORT_SYMBOL(skb_abort_seq_read); +EXPORT_SYMBOL(skb_find_text); -- cgit v1.2.3 From d675c989ed2d4ba23dff615330b04371aea83534 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 23 Jun 2005 21:00:58 -0700 Subject: [PKT_SCHED]: Packet classification based on textsearch (ematch) Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/pkt_cls.h | 1 + include/linux/rtnetlink.h | 7 +- include/linux/tc_ematch/tc_em_text.h | 19 +++++ net/sched/Kconfig | 11 +++ net/sched/Makefile | 1 + net/sched/em_text.c | 157 +++++++++++++++++++++++++++++++++++ 6 files changed, 194 insertions(+), 2 deletions(-) create mode 100644 include/linux/tc_ematch/tc_em_text.h create mode 100644 net/sched/em_text.c (limited to 'net') diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h index d2aa214d6803..25d2d67c1faf 100644 --- a/include/linux/pkt_cls.h +++ b/include/linux/pkt_cls.h @@ -408,6 +408,7 @@ enum TCF_EM_NBYTE, TCF_EM_U32, TCF_EM_META, + TCF_EM_TEXT, __TCF_EM_MAX }; diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index e68dbf0bf579..d021888b58f1 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -892,10 +892,13 @@ extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const voi goto rtattr_failure; \ __rta_fill(skb, attrtype, attrlen, data); }) -#define RTA_PUT_NOHDR(skb, attrlen, data) \ +#define RTA_APPEND(skb, attrlen, data) \ ({ if (unlikely(skb_tailroom(skb) < (int)(attrlen))) \ goto rtattr_failure; \ - memcpy(skb_put(skb, RTA_ALIGN(attrlen)), data, attrlen); }) + memcpy(skb_put(skb, attrlen), data, attrlen); }) + +#define RTA_PUT_NOHDR(skb, attrlen, data) \ + RTA_APPEND(skb, RTA_ALIGN(attrlen), data) #define RTA_PUT_U8(skb, attrtype, value) \ ({ u8 _tmp = (value); \ diff --git a/include/linux/tc_ematch/tc_em_text.h b/include/linux/tc_ematch/tc_em_text.h new file mode 100644 index 000000000000..7cd43e99c7f5 --- /dev/null +++ b/include/linux/tc_ematch/tc_em_text.h @@ -0,0 +1,19 @@ +#ifndef __LINUX_TC_EM_TEXT_H +#define __LINUX_TC_EM_TEXT_H + +#include + +#define TC_EM_TEXT_ALGOSIZ 16 + +struct tcf_em_text +{ + char algo[TC_EM_TEXT_ALGOSIZ]; + __u16 from_offset; + __u16 to_offset; + __u16 pattern_len; + __u8 from_layer:4; + __u8 to_layer:4; + __u8 pad; +}; + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index b22c9beb604d..95d9bc5d8621 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -449,6 +449,17 @@ config NET_EMATCH_META To compile this code as a module, choose M here: the module will be called em_meta. +config NET_EMATCH_TEXT + tristate "Textsearch" + depends on NET_EMATCH + ---help--- + Say Y here if you want to be ablt to classify packets based on + textsearch comparisons. Please select the appropriate textsearch + algorithms in the Library section. + + To compile this code as a module, choose M here: the + module will be called em_text. + config NET_CLS_ACT bool "Packet ACTION" depends on EXPERIMENTAL && NET_CLS && NET_QOS diff --git a/net/sched/Makefile b/net/sched/Makefile index eb3fe583eba8..8f58cecd6266 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -40,3 +40,4 @@ obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o obj-$(CONFIG_NET_EMATCH_META) += em_meta.o +obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o diff --git a/net/sched/em_text.c b/net/sched/em_text.c new file mode 100644 index 000000000000..873840d8d072 --- /dev/null +++ b/net/sched/em_text.c @@ -0,0 +1,157 @@ +/* + * net/sched/em_text.c Textsearch ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct text_match +{ + u16 from_offset; + u16 to_offset; + u8 from_layer; + u8 to_layer; + struct ts_config *config; +}; + +#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data) + +static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m, + struct tcf_pkt_info *info) +{ + struct text_match *tm = EM_TEXT_PRIV(m); + int from, to; + struct ts_state state; + + from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data; + from += tm->from_offset; + + to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data; + to += tm->to_offset; + + return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX; +} + +static int em_text_change(struct tcf_proto *tp, void *data, int len, + struct tcf_ematch *m) +{ + struct text_match *tm; + struct tcf_em_text *conf = data; + struct ts_config *ts_conf; + int flags = 0; + + printk("Configuring text: %s from %d:%d to %d:%d len %d\n", conf->algo, conf->from_offset, + conf->from_layer, conf->to_offset, conf->to_layer, conf->pattern_len); + + if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len)) + return -EINVAL; + + if (conf->from_layer > conf->to_layer) + return -EINVAL; + + if (conf->from_layer == conf->to_layer && + conf->from_offset > conf->to_offset) + return -EINVAL; + +retry: + ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf), + conf->pattern_len, GFP_KERNEL, flags); + + if (flags & TS_AUTOLOAD) + rtnl_lock(); + + if (IS_ERR(ts_conf)) { + if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) { + rtnl_unlock(); + flags |= TS_AUTOLOAD; + goto retry; + } else + return PTR_ERR(ts_conf); + } else if (flags & TS_AUTOLOAD) { + textsearch_destroy(ts_conf); + return -EAGAIN; + } + + tm = kmalloc(sizeof(*tm), GFP_KERNEL); + if (tm == NULL) { + textsearch_destroy(ts_conf); + return -ENOBUFS; + } + + tm->from_offset = conf->from_offset; + tm->to_offset = conf->to_offset; + tm->from_layer = conf->from_layer; + tm->to_layer = conf->to_layer; + tm->config = ts_conf; + + m->datalen = sizeof(*tm); + m->data = (unsigned long) tm; + + return 0; +} + +static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +{ + textsearch_destroy(EM_TEXT_PRIV(m)->config); +} + +static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) +{ + struct text_match *tm = EM_TEXT_PRIV(m); + struct tcf_em_text conf; + + strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1); + conf.from_offset = tm->from_offset; + conf.to_offset = tm->to_offset; + conf.from_layer = tm->from_layer; + conf.to_layer = tm->to_layer; + conf.pattern_len = textsearch_get_pattern_len(tm->config); + conf.pad = 0; + + RTA_PUT_NOHDR(skb, sizeof(conf), &conf); + RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config)); + return 0; + +rtattr_failure: + return -1; +} + +static struct tcf_ematch_ops em_text_ops = { + .kind = TCF_EM_TEXT, + .change = em_text_change, + .match = em_text_match, + .destroy = em_text_destroy, + .dump = em_text_dump, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_text_ops.link) +}; + +static int __init init_em_text(void) +{ + return tcf_em_register(&em_text_ops); +} + +static void __exit exit_em_text(void) +{ + tcf_em_unregister(&em_text_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_text); +module_exit(exit_em_text); -- cgit v1.2.3 From f2d368fa3ef90f2159d9e542303901ebf68144dd Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 23 Jun 2005 23:55:41 -0700 Subject: [PKT_SCHED]: Make NET_EMATCH_TEXT select TEXTSEARCh Signed-off-by: David S. Miller --- net/sched/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 95d9bc5d8621..447b89e556b1 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -452,6 +452,7 @@ config NET_EMATCH_META config NET_EMATCH_TEXT tristate "Textsearch" depends on NET_EMATCH + select TEXTSEARCH ---help--- Say Y here if you want to be ablt to classify packets based on textsearch comparisons. Please select the appropriate textsearch -- cgit v1.2.3