aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorQu Wenruo <wqu@suse.com>2023-08-22 07:01:29 +0800
committerZorro Lang <zlang@kernel.org>2023-08-25 22:22:28 +0800
commit79d409ce9e394ea9e5e1954052426f8334d3d5f6 (patch)
treeb563574beed6a0f24bc34414dd9ce1202cb8a364
parenta0c6958c0e36a1481cd751c828ab1aa15de6b934 (diff)
downloadxfstests-dev-79d409ce9e394ea9e5e1954052426f8334d3d5f6.tar.gz
fstests: fsstress: wait interrupted aio to finish
[BUG] There is a very low chance to hit data csum mismatch (caught by scrub) during test case btrfs/06[234567]. After some extra digging, it turns out that plain fsstress itself is enough to cause the problem: ``` workload() { mkfs.btrfs -f -m single -d single --csum sha256 $dev1 > /dev/null mount $dev1 $mnt #$fsstress -p 10 -n 1000 -w -d $mnt umount $mnt btrfs check --check-data-csum $dev1 || fail } runtime=1024 for (( i = 0; i < $runtime; i++ )); do echo "=== $i / $runtime ===" workload done ``` Inside a VM which has only 6 cores, above script can trigger with 1/20 possibility. [CAUSE] Locally I got a much smaller workload to reproduce: $fsstress -p 7 -n 50 -s 1691396493 -w -d $mnt -v > /tmp/fsstress With extra kernel trace_prinkt() on the buffered/direct writes. It turns out that the following direct write is always the cause: btrfs_do_write_iter: r/i=5/283 buffered fileoff=708608(709121) len=12288(7712) btrfs_do_write_iter: r/i=5/283 direct fileoff=8192(8192) len=73728(73728) <<<<< btrfs_do_write_iter: r/i=5/283 direct fileoff=589824(589824) len=16384(16384) With the involved byte number, it's easy to pin down the fsstress opeartion: 0/31: writev d0/f3[285 2 0 0 296 1457078] [709121,8,964] 0 0/32: chown d0/f2 308134/1763236 0 0/33: do_aio_rw - xfsctl(XFS_IOC_DIOINFO) d0/f2[285 2 308134 1763236 320 1457078] return 25, fallback to stat() 0/33: awrite - io_getevents failed -4 <<<< 0/34: dwrite - xfsctl(XFS_IOC_DIOINFO) d0/f3[285 2 308134 1763236 320 1457078] return 25, fallback to stat() Note the 0/33, when the data csum mismatch triggered, it always fail with -4 (-EINTR). It looks like with lucky enough concurrency, we can get to the following situation inside fsstress: Process A | Process B -----------------------------------+--------------------------------------- do_aio_rw() | |- io_sumit(); | |- io_get_events(); | | Returned -EINTR, but IO hasn't | | finished. | `- free(buf); | malloc(); | Got the same memory of @buf from | thread A. | Modify the memory | Now the buffer is changed while | still under IO This is the typical buffer modification during direct IO, which is going to cause csum mismatch for btrfs, and btrfs properly detects it. This is the direct cause of the problem. The root cause is that, io_uring would use signals to handle submission/completion of IOs. Thus io_uring operations would interrupt AIO operations, thus causing the above problem. [FIX] To fix the problem, we can just retry io_getevents() so that we can properly wait for the IO. This prevents us to modify the IO buffer before writeback really finishes. With this fixes, I can no longer reproduce the data corruption. Signed-off-by: Qu Wenruo <wqu@suse.com> Reviewed-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: Jens Axboe <axboe@kernel.dk> Signed-off-by: Zorro Lang <zlang@kernel.org>
-rw-r--r--ltp/fsstress.c20
1 files changed, 18 insertions, 2 deletions
diff --git a/ltp/fsstress.c b/ltp/fsstress.c
index 6641a525fe..abe2874253 100644
--- a/ltp/fsstress.c
+++ b/ltp/fsstress.c
@@ -2072,6 +2072,22 @@ void inode_info(char *str, size_t sz, struct stat64 *s, int verbose)
(long long) s->st_blocks, (long long) s->st_size);
}
+#ifdef AIO
+static int io_get_single_event(struct io_event *event)
+{
+ int ret;
+
+ /*
+ * We can get -EINTR if competing with io_uring using signal
+ * based notifications. For that case, just retry the wait.
+ */
+ do {
+ ret = io_getevents(io_ctx, 1, 1, event, NULL);
+ } while (ret == -EINTR);
+ return ret;
+}
+#endif
+
void
afsync_f(opnum_t opno, long r)
{
@@ -2111,7 +2127,7 @@ afsync_f(opnum_t opno, long r)
close(fd);
return;
}
- if ((e = io_getevents(io_ctx, 1, 1, &event, NULL)) != 1) {
+ if ((e = io_get_single_event(&event)) != 1) {
if (v)
printf("%d/%lld: afsync - io_getevents failed %d\n",
procid, opno, e);
@@ -2223,7 +2239,7 @@ do_aio_rw(opnum_t opno, long r, int flags)
procid, opno, iswrite ? "awrite" : "aread", e);
goto aio_out;
}
- if ((e = io_getevents(io_ctx, 1, 1, &event, NULL)) != 1) {
+ if ((e = io_get_single_event(&event)) != 1) {
if (v)
printf("%d/%lld: %s - io_getevents failed %d\n",
procid, opno, iswrite ? "awrite" : "aread", e);