author     Brian Foster <bfoster@redhat.com>            2023-09-19 10:26:11 -0400
committer  Kent Overstreet <kent.overstreet@linux.dev>  2023-09-19 17:26:45 -0400
commit     f76c449ec0cb96ebeb4a110ba30ecb695b5a1823 (patch)
tree       a745f4ac2febf45d2d579d68efcaa18af65bf80f
parent     f68e0ed16e93d3bb2515168aae3432351110e993 (diff)
download   bcachefs-tools-f76c449ec0cb96ebeb4a110ba30ecb695b5a1823.tar.gz
bcachefs-tools: fix endian problems between bit spinlocks and futexes
bcachefs format on a big-endian (s390x) machine crashes down in the rhashtable code imported from the kernel. The reason this occurs lies within the rht_lock() -> bit_spin_lock() code, the latter of which casts bitmaps down to 32-bits to satisfy the requirements of the futex interface. The specific problem here is that a 64 -> 32 bit cast doesn't refer to the lower 4 bytes on a big endian machine, which means setting bit number 0 in the 32-bit map actually corresponds to bit 32 in the 64-bit map.

The rhashtable code specifically uses bit zero of the bucket pointer for exclusion and uses native bitops elsewhere (i.e. __rht_ptr()) to identify NULL pointers. If bit 32 of the pointer is set by the locking code instead of bit 0, an otherwise NULL pointer looks like a non-NULL value and results in a segfault.

The bit spinlock code imported from the kernel was originally intended to work with unsigned long. The kernel code is able to throttle the CPU directly when under lock contention, while the userspace implementation relies on the futex primitives to emulate reliable blocking. Since the futex interface introduces the 32-bit requirement, isolate the associated userspace hack to that particular code. Restore the native bitmap behavior of the bit spinlock code to address the rhashtable problem described above. Since this is not compatible with the futex interface, create a futex wrapper specifically to convert the native bitmap type to a 32-bit virtual address and mask value for the purposes of waiting/waking when under lock contention.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
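[Editor's note: a minimal standalone sketch of the failure mode described above, not part of this patch. It sets bit 0 through a 32-bit view of a 64-bit word: on little endian that lands on bit 0 of the full word, on big endian on bit 32, which is exactly the value the rhashtable NULL check then misreads as a non-NULL pointer.]

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t word = 0;
            uint32_t *half = (uint32_t *) &word;    /* the old 64 -> 32 bit cast */

            *half |= 1U << 0;       /* take "lock" bit 0 via the 32-bit view */

            /* little endian prints 0x1 (bit 0); big endian prints
             * 0x100000000 (bit 32) */
            printf("0x%llx\n", (unsigned long long) word);
            return 0;
    }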
-rw-r--r--  include/linux/bit_spinlock.h  62
1 file changed, 51 insertions(+), 11 deletions(-)
diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h
index 62b91afa..873f08c2 100644
--- a/include/linux/bit_spinlock.h
+++ b/include/linux/bit_spinlock.h
@@ -6,38 +6,78 @@
 #include <linux/futex.h>
 #include <urcu/futex.h>
 
+/*
+ * The futex wait op wants an explicit 32-bit address and value. If the bitmap
+ * used for the spinlock is 64-bit, cast down and pass the right 32-bit region
+ * for the in-kernel checks. The value is the copy that has already been read
+ * from the atomic op.
+ *
+ * The futex wake op interprets the value as the number of waiters to wake (up
+ * to INT_MAX), so pass that along directly.
+ */
+static inline void do_futex(int nr, unsigned long *addr, unsigned long v, int futex_flags)
+{
+	u32 *addr32 = (u32 *) addr;
+	u32 *v32 = (u32 *) &v;
+	int shift = 0;
+
+	futex_flags |= FUTEX_PRIVATE_FLAG;
+
+#if BITS_PER_LONG == 64
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	shift = (nr >= 32) ? 1 : 0;
+#else
+	shift = (nr < 32) ? 1 : 0;
+#endif
+#endif
+	if (shift) {
+		addr32 += shift;
+		v32 += shift;
+	}
+	/*
+	 * The shift to determine the futex address may have cast away a
+	 * literal wake count value. The value is capped to INT_MAX and thus
+	 * always in the low bytes of v regardless of bit nr. Copy in the wake
+	 * count to whatever 32-bit range was selected.
+	 */
+	if (futex_flags == FUTEX_WAKE_PRIVATE)
+		*v32 = (u32) v;
+	futex(addr32, futex_flags, *v32, NULL, NULL, 0);
+}
+
 static inline void bit_spin_lock(int nr, unsigned long *_addr)
 {
-	u32 mask, *addr = ((u32 *) _addr) + (nr / 32), v;
+	unsigned long mask;
+	unsigned long *addr = _addr + (nr / BITS_PER_LONG);
+	unsigned long v;
 
-	nr &= 31;
-	mask = 1U << nr;
+	nr &= BITS_PER_LONG - 1;
+	mask = 1UL << nr;
 
 	while (1) {
 		v = __atomic_fetch_or(addr, mask, __ATOMIC_ACQUIRE);
 		if (!(v & mask))
 			break;
 
-		futex(addr, FUTEX_WAIT|FUTEX_PRIVATE_FLAG, v, NULL, NULL, 0);
+		do_futex(nr, addr, v, FUTEX_WAIT);
 	}
 }
 
 static inline void bit_spin_wake(int nr, unsigned long *_addr)
 {
-	u32 *addr = ((u32 *) _addr) + (nr / 32);
-
-	futex(addr, FUTEX_WAKE|FUTEX_PRIVATE_FLAG, INT_MAX, NULL, NULL, 0);
+	do_futex(nr, _addr, INT_MAX, FUTEX_WAKE);
 }
 
 static inline void bit_spin_unlock(int nr, unsigned long *_addr)
 {
-	u32 mask, *addr = ((u32 *) _addr) + (nr / 32);
+	unsigned long mask;
+	unsigned long *addr = _addr + (nr / BITS_PER_LONG);
 
-	nr &= 31;
-	mask = 1U << nr;
+	nr &= BITS_PER_LONG - 1;
+	mask = 1UL << nr;
 
 	__atomic_and_fetch(addr, ~mask, __ATOMIC_RELEASE);
-	futex(addr, FUTEX_WAKE|FUTEX_PRIVATE_FLAG, INT_MAX, NULL, NULL, 0);
+	do_futex(nr, addr, INT_MAX, FUTEX_WAKE);
 }
 
 #endif /* __LINUX_BIT_SPINLOCK_H */
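[Editor's note: a hypothetical caller in the style of rhashtable's bucket locks, to show why the fix matters; the names below are illustrative and not from this patch, and the include path assumes the tools' -Iinclude setup. The same word serves as both a pointer (NULL-checked with native loads) and a lock bitmap, so the lock must land on bit 0 of the native unsigned long, which the fixed code above now guarantees on both byte orders.]

    #include <linux/bit_spinlock.h>         /* the header patched above */

    static unsigned long bucket;            /* pointer value and lock bit share one word */

    static void bucket_op(void)
    {
            bit_spin_lock(0, &bucket);      /* sets bit 0 of the native word */
            /* bucket == 1UL here on both endiannesses, so a NULL check
             * that masks off bit 0 still sees a NULL pointer */
            bit_spin_unlock(0, &bucket);    /* clears bit 0 and wakes waiters */
    }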