From a3b00f10da808bd4a354f890b551cba471082d0e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 24 Apr 2024 14:52:13 -0700 Subject: objpool: enable inlining objpool_push() and objpool_pop() operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit objpool_push() and objpool_pop() are very performance-critical functions and can be called very frequently in kretprobe triggering path. As such, it makes sense to allow compiler to inline them completely to eliminate function calls overhead. Luckily, their logic is quite well isolated and doesn't have any sprawling dependencies. This patch moves both objpool_push() and objpool_pop() into include/linux/objpool.h and marks them as static inline functions, enabling inlining. To avoid anyone using internal helpers (objpool_try_get_slot, objpool_try_add_slot), rename them to use leading underscores. We used kretprobe microbenchmark from BPF selftests (bench trig-kprobe and trig-kprobe-multi benchmarks) running no-op BPF kretprobe/kretprobe.multi programs in a tight loop to evaluate the effect. BPF own overhead in this case is minimal and it mostly stresses the rest of in-kernel kretprobe infrastructure overhead. Results are in millions of calls per second. This is not super scientific, but shows the trend nevertheless. BEFORE ====== kretprobe : 9.794 ± 0.086M/s kretprobe-multi: 10.219 ± 0.032M/s AFTER ===== kretprobe : 9.937 ± 0.174M/s (+1.5%) kretprobe-multi: 10.440 ± 0.108M/s (+2.2%) Link: https://lore.kernel.org/all/20240424215214.3956041-2-andrii@kernel.org/ Cc: Matt (Qiang) Wu Signed-off-by: Andrii Nakryiko Signed-off-by: Masami Hiramatsu (Google) --- lib/objpool.c | 100 ---------------------------------------------------------- 1 file changed, 100 deletions(-) (limited to 'lib') diff --git a/lib/objpool.c b/lib/objpool.c index cfdc02420884..f696308fc026 100644 --- a/lib/objpool.c +++ b/lib/objpool.c @@ -152,106 +152,6 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size, } EXPORT_SYMBOL_GPL(objpool_init); -/* adding object to slot, abort if the slot was already full */ -static inline int -objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu) -{ - struct objpool_slot *slot = pool->cpu_slots[cpu]; - uint32_t head, tail; - - /* loading tail and head as a local snapshot, tail first */ - tail = READ_ONCE(slot->tail); - - do { - head = READ_ONCE(slot->head); - /* fault caught: something must be wrong */ - WARN_ON_ONCE(tail - head > pool->nr_objs); - } while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1)); - - /* now the tail position is reserved for the given obj */ - WRITE_ONCE(slot->entries[tail & slot->mask], obj); - /* update sequence to make this obj available for pop() */ - smp_store_release(&slot->last, tail + 1); - - return 0; -} - -/* reclaim an object to object pool */ -int objpool_push(void *obj, struct objpool_head *pool) -{ - unsigned long flags; - int rc; - - /* disable local irq to avoid preemption & interruption */ - raw_local_irq_save(flags); - rc = objpool_try_add_slot(obj, pool, raw_smp_processor_id()); - raw_local_irq_restore(flags); - - return rc; -} -EXPORT_SYMBOL_GPL(objpool_push); - -/* try to retrieve object from slot */ -static inline void *objpool_try_get_slot(struct objpool_head *pool, int cpu) -{ - struct objpool_slot *slot = pool->cpu_slots[cpu]; - /* load head snapshot, other cpus may change it */ - uint32_t head = smp_load_acquire(&slot->head); - - while (head != READ_ONCE(slot->last)) { - void *obj; - - /* - * data visibility of 'last' and 'head' could be out of - * order since memory updating of 'last' and 'head' are - * performed in push() and pop() independently - * - * before any retrieving attempts, pop() must guarantee - * 'last' is behind 'head', that is to say, there must - * be available objects in slot, which could be ensured - * by condition 'last != head && last - head <= nr_objs' - * that is equivalent to 'last - head - 1 < nr_objs' as - * 'last' and 'head' are both unsigned int32 - */ - if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) { - head = READ_ONCE(slot->head); - continue; - } - - /* obj must be retrieved before moving forward head */ - obj = READ_ONCE(slot->entries[head & slot->mask]); - - /* move head forward to mark it's consumption */ - if (try_cmpxchg_release(&slot->head, &head, head + 1)) - return obj; - } - - return NULL; -} - -/* allocate an object from object pool */ -void *objpool_pop(struct objpool_head *pool) -{ - void *obj = NULL; - unsigned long flags; - int i, cpu; - - /* disable local irq to avoid preemption & interruption */ - raw_local_irq_save(flags); - - cpu = raw_smp_processor_id(); - for (i = 0; i < num_possible_cpus(); i++) { - obj = objpool_try_get_slot(pool, cpu); - if (obj) - break; - cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1); - } - raw_local_irq_restore(flags); - - return obj; -} -EXPORT_SYMBOL_GPL(objpool_pop); - /* release whole objpool forcely */ void objpool_free(struct objpool_head *pool) { -- cgit v1.2.3 From 78d0b16127daa26d016c215a089ae330878291f7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 24 Apr 2024 14:52:14 -0700 Subject: objpool: cache nr_possible_cpus() and avoid caching nr_cpu_ids MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Profiling shows that calling nr_possible_cpus() in objpool_pop() takes a noticeable amount of CPU (when profiled on 80-core machine), as we need to recalculate number of set bits in a CPU bit mask. This number can't change, so there is no point in paying the price for recalculating it. As such, cache this value in struct objpool_head and use it in objpool_pop(). On the other hand, cached pool->nr_cpus isn't necessary, as it's not used in hot path and is also a pretty trivial value to retrieve. So drop pool->nr_cpus in favor of using nr_cpu_ids everywhere. This way the size of struct objpool_head remains the same, which is a nice bonus. Same BPF selftests benchmarks were used to evaluate the effect. Using changes in previous patch (inlining of objpool_pop/objpool_push) as baseline, here are the differences: BASELINE ======== kretprobe : 9.937 ± 0.174M/s kretprobe-multi: 10.440 ± 0.108M/s AFTER ===== kretprobe : 10.106 ± 0.120M/s (+1.7%) kretprobe-multi: 10.515 ± 0.180M/s (+0.7%) Link: https://lore.kernel.org/all/20240424215214.3956041-3-andrii@kernel.org/ Cc: Matt (Qiang) Wu Signed-off-by: Andrii Nakryiko Signed-off-by: Masami Hiramatsu (Google) --- include/linux/objpool.h | 6 +++--- lib/objpool.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/include/linux/objpool.h b/include/linux/objpool.h index d8b1f7b91128..cb1758eaa2d3 100644 --- a/include/linux/objpool.h +++ b/include/linux/objpool.h @@ -73,7 +73,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context); * struct objpool_head - object pooling metadata * @obj_size: object size, aligned to sizeof(void *) * @nr_objs: total objs (to be pre-allocated with objpool) - * @nr_cpus: local copy of nr_cpu_ids + * @nr_possible_cpus: cached value of num_possible_cpus() * @capacity: max objs can be managed by one objpool_slot * @gfp: gfp flags for kmalloc & vmalloc * @ref: refcount of objpool @@ -85,7 +85,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context); struct objpool_head { int obj_size; int nr_objs; - int nr_cpus; + int nr_possible_cpus; int capacity; gfp_t gfp; refcount_t ref; @@ -176,7 +176,7 @@ static inline void *objpool_pop(struct objpool_head *pool) raw_local_irq_save(flags); cpu = raw_smp_processor_id(); - for (i = 0; i < num_possible_cpus(); i++) { + for (i = 0; i < pool->nr_possible_cpus; i++) { obj = __objpool_try_get_slot(pool, cpu); if (obj) break; diff --git a/lib/objpool.c b/lib/objpool.c index f696308fc026..234f9d0bd081 100644 --- a/lib/objpool.c +++ b/lib/objpool.c @@ -50,7 +50,7 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs, { int i, cpu_count = 0; - for (i = 0; i < pool->nr_cpus; i++) { + for (i = 0; i < nr_cpu_ids; i++) { struct objpool_slot *slot; int nodes, size, rc; @@ -60,8 +60,8 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs, continue; /* compute how many objects to be allocated with this slot */ - nodes = nr_objs / num_possible_cpus(); - if (cpu_count < (nr_objs % num_possible_cpus())) + nodes = nr_objs / pool->nr_possible_cpus; + if (cpu_count < (nr_objs % pool->nr_possible_cpus)) nodes++; cpu_count++; @@ -103,7 +103,7 @@ static void objpool_fini_percpu_slots(struct objpool_head *pool) if (!pool->cpu_slots) return; - for (i = 0; i < pool->nr_cpus; i++) + for (i = 0; i < nr_cpu_ids; i++) kvfree(pool->cpu_slots[i]); kfree(pool->cpu_slots); } @@ -130,13 +130,13 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size, /* initialize objpool pool */ memset(pool, 0, sizeof(struct objpool_head)); - pool->nr_cpus = nr_cpu_ids; + pool->nr_possible_cpus = num_possible_cpus(); pool->obj_size = object_size; pool->capacity = capacity; pool->gfp = gfp & ~__GFP_ZERO; pool->context = context; pool->release = release; - slot_size = pool->nr_cpus * sizeof(struct objpool_slot); + slot_size = nr_cpu_ids * sizeof(struct objpool_slot); pool->cpu_slots = kzalloc(slot_size, pool->gfp); if (!pool->cpu_slots) return -ENOMEM; -- cgit v1.2.3