author:    Thomas Gleixner <tglx@linutronix.de>  2023-03-23 23:55:31 +0300
committer: Peter Zijlstra <peterz@infradead.org>  2023-03-28 11:39:29 +0300
commit:    ee1ee6db07795d9637bc5e8993a8ddcf886541ef
tree:      ad190ded0b3bfa64df031302c1b8f79bf065652b  /include/linux/rcuref.h
parent:    e5ab9eff46b04c5a04778e40d7092fed3fda52ca
atomics: Provide rcuref - scalable reference counting
atomic_t based reference counting, including refcount_t, uses
atomic_inc_not_zero() for acquiring a reference. atomic_inc_not_zero()
is implemented with an atomic_try_cmpxchg() loop. High contention of
the reference count leads to retry loops and scales badly. There is
nothing to improve on this implementation as the semantics have to be
preserved.

Provide rcuref as a scalable alternative solution which is suitable
for RCU managed objects. Similar to refcount_t it comes with overflow
and underflow detection and mitigation.

rcuref treats the underlying atomic_t as an unsigned integer and
partitions this space into zones:

  0x00000000 - 0x7FFFFFFF  valid zone (1 .. (INT_MAX + 1) references)
  0x80000000 - 0xBFFFFFFF  saturation zone
  0xC0000000 - 0xFFFFFFFE  dead zone
  0xFFFFFFFF               no reference

rcuref_get() unconditionally increments the reference count with
atomic_add_negative_relaxed(). rcuref_put() unconditionally decrements
the reference count with atomic_add_negative_release().

This unconditional increment avoids the inc_not_zero() problem, but
requires a more complex implementation on the put() side when the
count drops from 0 to -1.

When this transition is detected, the put() side attempts to mark the
reference count dead by setting it to the midpoint of the dead zone
with a single atomic_cmpxchg_release() operation. This operation can
fail due to a concurrent rcuref_get() elevating the reference count
from -1 to 0 again.

If the unconditional increment in rcuref_get() hits a reference count
which is marked dead (or saturated), it detects this after the fact
and brings the reference count back to the midpoint of the respective
zone. The zones provide enough tolerance to make it practically
impossible to escape from a zone.

The racy implementation of rcuref_put() requires protecting
rcuref_put() against a grace period ending, in order to prevent a
subtle use after free. As RCU is the only mechanism which allows
protecting against that, it is not possible to fully replace the
atomic_inc_not_zero() based implementation of refcount_t with this
scheme.

The final drop is slightly more expensive than the atomic_dec_return()
counterpart, but that is not the case this is optimized for. The
optimization targets the high frequency get()/put() pairs and their
scalability.

The performance of an uncontended rcuref_get()/put() pair, where the
put() is not dropping the last reference, is still on par with the
plain atomic operations, while at the same time providing overflow and
underflow detection and mitigation.

The performance of rcuref compared to plain atomic_inc_not_zero() and
atomic_dec_return() based reference counting under contention:

 - Micro benchmark: All CPUs running an increment/decrement loop on an
   elevated reference count, which means the 0 to -1 transition never
   happens.

   The performance gain depends on microarchitecture and the number of
   CPUs and has been observed in the range of 1.3X to 4.7X.

 - Conversion of dst_entry::__refcnt to rcuref and testing with the
   localhost memtier/memcached benchmark. That benchmark shows the
   reference count contention prominently.

   The performance gain depends on microarchitecture and the number of
   CPUs and has been observed in the range of 1.1X to 2.6X over the
   previous fix for the false sharing issue vs. struct
   dst_entry::__refcnt.

   When memtier is run over a real 1Gb network connection, there is a
   small gain on top of the false sharing fix. The two changes
   combined result in a 2%-5% total gain for that networked test.
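To make the zone arithmetic concrete, here is a minimal user-space
sketch (illustration only, not part of this patch; it uses C11 atomics
instead of the kernel's atomic_t). The counter stores "number of
references - 1", so a single reference is 0x00000000, and dropping the
last reference wraps to 0xFFFFFFFF, setting the sign bit that the
atomic_add_negative() fastpaths test:

  #include <stdatomic.h>
  #include <stdio.h>

  /* User-space model of the biased counter. */
  static atomic_uint refcnt;

  int main(void)
  {
          /* rcuref_init(ref, cnt): 'cnt' references stored as cnt - 1 */
          atomic_store(&refcnt, 1 - 1);
          printf("one reference:  0x%08X\n", atomic_load(&refcnt)); /* 0x00000000 */

          /* get() fastpath: increment; a set sign bit in the result
           * means the counter is outside the valid zone and the
           * slowpath has to run.
           */
          unsigned int val = atomic_fetch_add(&refcnt, 1) + 1;
          printf("two references: 0x%08X slowpath=%d\n",
                 val, (val & 0x80000000U) != 0); /* 0x00000001 slowpath=0 */

          /* put() twice (modeled as one subtraction of 2): the step
           * from 0x00000000 to 0xFFFFFFFF sets the sign bit, which is
           * how the drop of the last reference is detected.
           */
          atomic_fetch_sub(&refcnt, 2);
          printf("no reference:   0x%08X\n", atomic_load(&refcnt)); /* 0xFFFFFFFF */
          return 0;
  }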
Reported-by: Wangyang Guo <wangyang.guo@intel.com>
Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230323102800.158429195@linutronix.de
Diffstat (limited to 'include/linux/rcuref.h')
 -rw-r--r--  include/linux/rcuref.h | 155
 1 file changed, 155 insertions(+), 0 deletions(-)
diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h
new file mode 100644
index 000000000000..2c8bfd0f1b6b
--- /dev/null
+++ b/include/linux/rcuref.h
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _LINUX_RCUREF_H
+#define _LINUX_RCUREF_H
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/limits.h>
+#include <linux/lockdep.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+
+#define RCUREF_ONEREF 0x00000000U
+#define RCUREF_MAXREF 0x7FFFFFFFU
+#define RCUREF_SATURATED 0xA0000000U
+#define RCUREF_RELEASED 0xC0000000U
+#define RCUREF_DEAD 0xE0000000U
+#define RCUREF_NOREF 0xFFFFFFFFU
+
+/**
+ * rcuref_init - Initialize a rcuref reference count with the given count
+ * @ref: Pointer to the reference count
+ * @cnt: The initial reference count, typically '1'
+ */
+static inline void rcuref_init(rcuref_t *ref, unsigned int cnt)
+{
+ atomic_set(&ref->refcnt, cnt - 1);
+}
+
+/**
+ * rcuref_read - Read the number of held reference counts of a rcuref
+ * @ref: Pointer to the reference count
+ *
+ * Return: The number of held references (0 ... N)
+ */
+static inline unsigned int rcuref_read(rcuref_t *ref)
+{
+ unsigned int c = atomic_read(&ref->refcnt);
+
+ /* Return 0 if within the DEAD zone. */
+ return c >= RCUREF_RELEASED ? 0 : c + 1;
+}
+
+extern __must_check bool rcuref_get_slowpath(rcuref_t *ref);
+
+/**
+ * rcuref_get - Acquire one reference on a rcuref reference count
+ * @ref: Pointer to the reference count
+ *
+ * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See documentation in lib/rcuref.c.
+ *
+ * Return:
+ * False if the attempt to acquire a reference failed. This happens
+ * when the last reference has been put already
+ *
+ * True if a reference was successfully acquired
+ */
+static inline __must_check bool rcuref_get(rcuref_t *ref)
+{
+ /*
+ * Unconditionally increase the reference count. The saturation and
+ * dead zones provide enough tolerance for this.
+ */
+ if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt)))
+ return true;
+
+ /* Handle the cases inside the saturation and dead zones */
+ return rcuref_get_slowpath(ref);
+}
+
+extern __must_check bool rcuref_put_slowpath(rcuref_t *ref);
+
+/*
+ * Internal helper. Do not invoke directly.
+ */
+static __always_inline __must_check bool __rcuref_put(rcuref_t *ref)
+{
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(),
+ "suspicious rcuref_put_rcusafe() usage");
+ /*
+ * Unconditionally decrease the reference count. The saturation and
+ * dead zones provide enough tolerance for this.
+ */
+ if (likely(!atomic_add_negative_release(-1, &ref->refcnt)))
+ return false;
+
+ /*
+ * Handle the last reference drop and cases inside the saturation
+ * and dead zones.
+ */
+ return rcuref_put_slowpath(ref);
+}
+
+/**
+ * rcuref_put_rcusafe - Release one reference for a rcuref reference count RCU safe
+ * @ref: Pointer to the reference count
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides an acquire ordering on success such that free()
+ * must come after.
+ *
+ * Can be invoked from contexts which guarantee that no grace period can
+ * happen which would free the object concurrently if the decrement drops
+ * the last reference and the slowpath races against a concurrent get() and
+ * put() pair. rcu_read_lock()'ed and atomic contexts qualify.
+ *
+ * Return:
+ * True if this was the last reference with no future references
+ * possible. This signals the caller that it can safely release the
+ * object which is protected by the reference counter.
+ *
+ * False if there are still active references or the put() raced
+ * with a concurrent get()/put() pair. Caller is not allowed to
+ * release the protected object.
+ */
+static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref)
+{
+ return __rcuref_put(ref);
+}
+
+/**
+ * rcuref_put - Release one reference for a rcuref reference count
+ * @ref: Pointer to the reference count
+ *
+ * Can be invoked from any context.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides an acquire ordering on success such that free()
+ * must come after.
+ *
+ * Return:
+ *
+ * True if this was the last reference with no future references
+ * possible. This signals the caller that it can safely schedule the
+ * object, which is protected by the reference counter, for
+ * deconstruction.
+ *
+ * False if there are still active references or the put() raced
+ * with a concurrent get()/put() pair. Caller is not allowed to
+ * deconstruct the protected object.
+ */
+static inline __must_check bool rcuref_put(rcuref_t *ref)
+{
+ bool released;
+
+ preempt_disable();
+ released = __rcuref_put(ref);
+ preempt_enable();
+ return released;
+}
+
+#endif
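
For context, a hedged usage sketch of the API above (not from this
patch): a typical pairing with an RCU managed object, where the lookup
runs under rcu_read_lock() and the final put() schedules the object
for destruction after a grace period. The names struct gadget,
gadget_alloc(), gadget_get(), gadget_put() and the cur_gadget pointer
are illustrative only:

  #include <linux/container_of.h>
  #include <linux/rcuref.h>
  #include <linux/rcupdate.h>
  #include <linux/slab.h>

  struct gadget {
          rcuref_t        ref;
          struct rcu_head rcu;
          /* payload ... */
  };

  static struct gadget __rcu *cur_gadget;

  static struct gadget *gadget_alloc(void)
  {
          struct gadget *g = kzalloc(sizeof(*g), GFP_KERNEL);

          if (g)
                  rcuref_init(&g->ref, 1);        /* One initial reference */
          return g;
  }

  /* Lookup side: the RCU read side keeps the object memory stable,
   * which is the guarantee rcuref_get() requires from its caller.
   */
  static struct gadget *gadget_get(void)
  {
          struct gadget *g;

          rcu_read_lock();
          g = rcu_dereference(cur_gadget);
          if (g && !rcuref_get(&g->ref))
                  g = NULL;       /* Raced with the final reference drop */
          rcu_read_unlock();
          return g;
  }

  static void gadget_free_rcu(struct rcu_head *head)
  {
          kfree(container_of(head, struct gadget, rcu));
  }

  static void gadget_put(struct gadget *g)
  {
          /* True only on the final drop; the memory must still go
           * through a grace period before reuse, hence call_rcu().
           */
          if (rcuref_put(&g->ref))
                  call_rcu(&g->rcu, gadget_free_rcu);
  }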