diff options
author | Coly Li <colyli@suse.de> | 2018-01-11 00:29:07 +0800 |
---|---|---|
committer | Coly Li <colyli@suse.de> | 2018-01-11 00:29:07 +0800 |
commit | d07c2c86cd49f0dff1028d53479339531a8aba87 (patch) | |
tree | ed34de86cb5294ee1cfde0a3bad066c218600d97 | |
parent | 3ba7b8393a7477aa4e3d7d93f749584a844ce42c (diff) | |
download | bcache-patches-d07c2c86cd49f0dff1028d53479339531a8aba87.tar.gz |
for-next: update v2 patch for bcache device failure.
17 files changed, 888 insertions, 848 deletions
diff --git a/for-next/v2-0001-bcache-exit-bch_writeback_thread-with-proper-task.patch b/for-next/v2-0001-bcache-exit-bch_writeback_thread-with-proper-task.patch deleted file mode 100644 index 16c789d..0000000 --- a/for-next/v2-0001-bcache-exit-bch_writeback_thread-with-proper-task.patch +++ /dev/null @@ -1,58 +0,0 @@ -From 02cd6111e6e305665b9b734b41d9e66735eefba5 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 20 Dec 2017 20:32:58 +0800 -Subject: [PATCH v2 01/13] bcache: exit bch_writeback_thread() with proper task - state - -Kernel thread routine bch_writeback_thread() has the following code block, - -452 set_current_state(TASK_INTERRUPTIBLE); -453 -454 if (kthread_should_stop()) -455 return 0; -456 -457 schedule(); -458 continue; - -At line 452, its status is set to TASK_INTERRUPTIBLE, and at line 454 if -kthread_should_stop() is true, a "return 0" at line 455 will to function -kernel/kthread.c:kthread() and call do_exit(). - -It is not good to enter do_exit() with task state TASK_INTERRUPTIBLE, in -following code path might_sleep() is called and a warning message is -reported by __might_sleep(): "WARNING: do not call blocking ops when -!TASK_RUNNING; state=1 set at [xxxx]". - -Indeed it does not hurt when kernel thread exits with TASK_INTERRUPTIBLE -state, but this warning message scares users, makes them feel there might -be something risky with bcache and hurt their data. - -In this patch, TASK_INTERRUPTIBLE is set after kthread_should_stop(), -so writeback kernel thread can exist and enter do_exit() with -TASK_RUNNING state. Warning message from might_sleep() is removed. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/writeback.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 56a37884ca8b..a57149803df6 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -449,11 +449,11 @@ static int bch_writeback_thread(void *arg) - (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && - !dc->writeback_running)) { - up_write(&dc->writeback_lock); -- set_current_state(TASK_INTERRUPTIBLE); - - if (kthread_should_stop()) - return 0; - -+ set_current_state(TASK_INTERRUPTIBLE); - schedule(); - continue; - } --- -2.15.1 - diff --git a/for-next/v2-0001-bcache-properly-set-task-state-in-bch_writeback_t.patch b/for-next/v2-0001-bcache-properly-set-task-state-in-bch_writeback_t.patch new file mode 100644 index 0000000..9defd10 --- /dev/null +++ b/for-next/v2-0001-bcache-properly-set-task-state-in-bch_writeback_t.patch @@ -0,0 +1,87 @@ +From 603a4e1966e353f4709d1352842c0b42aa1a74aa Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 8 Jan 2018 22:11:01 +0800 +Subject: [PATCH v2 01/11] bcache: properly set task state in + bch_writeback_thread() + +Kernel thread routine bch_writeback_thread() has the following code block, + +447 down_write(&dc->writeback_lock); +448~450 if (check conditions) { +451 up_write(&dc->writeback_lock); +452 set_current_state(TASK_INTERRUPTIBLE); +453 +454 if (kthread_should_stop()) +455 return 0; +456 +457 schedule(); +458 continue; +459 } + +If condition check is true, its task state is set to TASK_INTERRUPTIBLE +and call schedule() to wait for others to wake up it. + +There are 2 issues in current code, +1, Task state is set to TASK_INTERRUPTIBLE after the condition checks, if + another process changes the condition and call wake_up_process(dc-> + writeback_thread), then at line 452 task state is set back to + TASK_INTERRUPTIBLE, the writeback kernel thread will lose a chance to be + waken up. +2, At line 454 if kthread_should_stop() is true, writeback kernel thread + will return to kernel/kthread.c:kthread() with TASK_INTERRUPTIBLE and + call do_exit(). It is not good to enter do_exit() with task state + TASK_INTERRUPTIBLE, in following code path might_sleep() is called and a + warning message is reported by __might_sleep(): "WARNING: do not call + blocking ops when !TASK_RUNNING; state=1 set at [xxxx]". + +For the first issue, task state should be set before condition checks. +Ineed because dc->writeback_lock is required when modifying all the +conditions, calling set_current_state() inside code block where dc-> +writeback_lock is hold is safe. But this is quite implicit, so I still move +set_current_state() before all the condition checks. + +For the second issue, frankley speaking it does not hurt when kernel thread +exits with TASK_INTERRUPTIBLE state, but this warning message scares users, +makes them feel there might be something risky with bcache and hurt their +data. Setting task state to TASK_RUNNING before returning fixes this +problem. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Huijun Tang <tang.junhui@zte.com.cn> +--- + drivers/md/bcache/writeback.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 61f24c04cebd..f2924cbb8cdc 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -541,18 +541,21 @@ static int bch_writeback_thread(void *arg) + + while (!kthread_should_stop()) { + down_write(&dc->writeback_lock); ++ set_current_state(TASK_INTERRUPTIBLE); + if (!atomic_read(&dc->has_dirty) || + (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && + !dc->writeback_running)) { + up_write(&dc->writeback_lock); +- set_current_state(TASK_INTERRUPTIBLE); + +- if (kthread_should_stop()) ++ if (kthread_should_stop()) { ++ set_current_state(TASK_RUNNING); + return 0; ++ } + + schedule(); + continue; + } ++ set_current_state(TASK_RUNNING); + + searched_full_index = refill_dirty(dc); + +-- +2.15.1 + diff --git a/for-next/v2-0002-bcache-set-task-properly-in-allocator_wait.patch b/for-next/v2-0002-bcache-set-task-properly-in-allocator_wait.patch index 0ae9c81..5800840 100644 --- a/for-next/v2-0002-bcache-set-task-properly-in-allocator_wait.patch +++ b/for-next/v2-0002-bcache-set-task-properly-in-allocator_wait.patch @@ -1,14 +1,11 @@ -From 9eb34cfed6f7cf086a31d0e01f79548aaa82eab9 Mon Sep 17 00:00:00 2001 +From 2723ee397093520bd24c33e54e241ea754bcc5ef Mon Sep 17 00:00:00 2001 From: Coly Li <colyli@suse.de> -Date: Wed, 20 Dec 2017 22:37:11 +0800 -Subject: [PATCH v2 02/13] bcache: set task properly in allocator_wait() +Date: Mon, 8 Jan 2018 22:45:51 +0800 +Subject: [PATCH v2 02/11] bcache: set task properly in allocator_wait() Kernel thread routine bch_allocator_thread() references macro allocator_wait() to wait for a condition or quit to do_exit() -when kthread_should_stop() is true. - -Macro allocator_wait() has 2 issues in setting task state, let's -see its code piece, +when kthread_should_stop() is true. Here is the code block, 284 while (1) { \ 285 set_current_state(TASK_INTERRUPTIBLE); \ @@ -24,56 +21,41 @@ see its code piece, 295 } \ 296 __set_current_state(TASK_RUNNING); \ -1) At line 285, task state is set to TASK_INTERRUPTIBLE, if at line 290 +At line 285, task state is set to TASK_INTERRUPTIBLE, if at line 290 kthread_should_stop() is true, the kernel thread will terminate and return to kernel/kthread.s:kthread(), then calls do_exit() with TASK_INTERRUPTIBLE state. This is not a suggested behavior and a warning message will be reported by might_sleep() in do_exit() code path: "WARNING: do not call blocking ops when !TASK_RUNNING; state=1 set at [xxxx]". -2) Because task state is set to TASK_INTERRUPTIBLE at line 285, when break -while-loop the task state has to be set back to TASK_RUNNING at line 296. -Indeed it is unncessary, if task state is set to TASK_INTERRUPTIBLE before -calling schedule() at line 293, we don't need to set the state back to -TASK_RUNNING at line 296 anymore. The reason is, allocator kthread is only -woken up by wake_up_process(), this routine makes sure the task state of -allocator kthread will be TASK_RUNNING after it returns from schedule() at -line 294 (see kernel/sched/core.c:try_to_wake_up() for more detailed -information). - -This patch fixes the above 2 issues by, -1) Setting TASK_INTERRUPTIBLE state just before calling schedule(). -2) Then setting TASK_RUNNING at line 296 is unnecessary, remove it. +This patch fixes this problem by setting task state to TASK_RUNNING if +kthread_should_stop() is true and before kernel thread returns back to +kernel/kthread.s:kthread(). Signed-off-by: Coly Li <colyli@suse.de> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Huijun Tang <tang.junhui@zte.com.cn> --- - drivers/md/bcache/alloc.c | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) + drivers/md/bcache/alloc.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c -index a0cc1bc6d884..48c002faf08d 100644 +index 6cc6c0f9c3a9..458e1d38577d 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c -@@ -282,7 +282,6 @@ static void invalidate_buckets(struct cache *ca) - #define allocator_wait(ca, cond) \ - do { \ - while (1) { \ -- set_current_state(TASK_INTERRUPTIBLE); \ - if (cond) \ +@@ -287,8 +287,10 @@ do { \ break; \ \ -@@ -290,10 +289,10 @@ do { \ - if (kthread_should_stop()) \ + mutex_unlock(&(ca)->set->bucket_lock); \ +- if (kthread_should_stop()) \ ++ if (kthread_should_stop()) { \ ++ set_current_state(TASK_RUNNING); \ return 0; \ ++ } \ \ -+ set_current_state(TASK_INTERRUPTIBLE); \ schedule(); \ mutex_lock(&(ca)->set->bucket_lock); \ - } \ -- __set_current_state(TASK_RUNNING); \ - } while (0) - - static int bch_allocator_push(struct cache *ca, long bucket) -- 2.15.1 diff --git a/for-next/v2-0004-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch b/for-next/v2-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch index 93f1461..b99c8e7 100644 --- a/for-next/v2-0004-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch +++ b/for-next/v2-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch @@ -1,7 +1,7 @@ -From d697858f6f515b4bacee984c82535cf2b896ace9 Mon Sep 17 00:00:00 2001 +From df0607cb405ac954d3cc6c4de3c904904a71ba7b Mon Sep 17 00:00:00 2001 From: Coly Li <colyli@suse.de> -Date: Fri, 22 Dec 2017 16:37:17 +0800 -Subject: [PATCH v2 04/13] bcache: fix cached_dev->count usage for +Date: Mon, 8 Jan 2018 23:05:58 +0800 +Subject: [PATCH v2 03/11] bcache: fix cached_dev->count usage for bch_cache_set_error() When bcache metadata I/O fails, bcache will call bch_cache_set_error() @@ -17,17 +17,17 @@ entry /sys/fs/bcache/<uuid> still exits and the backing device also references it. This is not expected behavior. When metadata I/O failes, the call senquence to retire whole cache set is, - bch_cache_set_error() - bch_cache_set_unregister() - bch_cache_set_stop() - __cache_set_unregister() <- called as callback by calling - clousre_queue(&c->caching) - cache_set_flush() <- called as a callback when refcount - of cache_set->caching is 0 - cache_set_free() <- called as a callback when refcount - of catch_set->cl is 0 - bch_cache_set_release() <- called as a callback when refcount - of catch_set->kobj is 0 + bch_cache_set_error() + bch_cache_set_unregister() + bch_cache_set_stop() + __cache_set_unregister() <- called as callback by calling + clousre_queue(&c->caching) + cache_set_flush() <- called as a callback when refcount + of cache_set->caching is 0 + cache_set_free() <- called as a callback when refcount + of catch_set->cl is 0 + bch_cache_set_release() <- called as a callback when refcount + of catch_set->kobj is 0 I find if kernel thread bch_writeback_thread() quits while-loop when kthread_should_stop() is true and searched_full_index is false, clousre @@ -73,7 +73,7 @@ Adding following code in bch_writeback_thread() does not work, return 0; } because writeback kernel thread can be waken up and start via sysfs entry: - echo 1 > /sys/block/bcache<N>/bcache/writeback_running + echo 1 > /sys/block/bcache<N>/bcache/writeback_running It is difficult to check whether backing device is dirty without race and extra lock. So the above modification will introduce potential refcount underflow in some conditions. @@ -90,17 +90,20 @@ errors or system reboot, cached_dev->count can always be used correctly. The patch is simple, but understanding how it works is quite complicated. Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Huijun Tang <tang.junhui@zte.com.cn> --- drivers/md/bcache/super.c | 1 - - drivers/md/bcache/writeback.c | 10 +++++++--- + drivers/md/bcache/writeback.c | 11 ++++++++--- drivers/md/bcache/writeback.h | 2 -- - 3 files changed, 7 insertions(+), 6 deletions(-) + 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 064efd869017..5401d2356aa3 100644 +index 78fd04bf0534..805b6c37c063 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c -@@ -1044,7 +1044,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) +@@ -1052,7 +1052,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { bch_sectors_dirty_init(&dc->disk); atomic_set(&dc->has_dirty, 1); @@ -109,19 +112,19 @@ index 064efd869017..5401d2356aa3 100644 } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index a57149803df6..0789a9e18337 100644 +index f2924cbb8cdc..228271fe622b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c -@@ -451,7 +451,7 @@ static int bch_writeback_thread(void *arg) - up_write(&dc->writeback_lock); +@@ -549,7 +549,7 @@ static int bch_writeback_thread(void *arg) - if (kthread_should_stop()) + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); - return 0; + break; + } - set_current_state(TASK_INTERRUPTIBLE); schedule(); -@@ -463,7 +463,6 @@ static int bch_writeback_thread(void *arg) +@@ -562,7 +562,6 @@ static int bch_writeback_thread(void *arg) if (searched_full_index && RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { atomic_set(&dc->has_dirty, 0); @@ -129,16 +132,17 @@ index a57149803df6..0789a9e18337 100644 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); bch_write_bdev_super(dc, NULL); } -@@ -484,6 +483,8 @@ static int bch_writeback_thread(void *arg) +@@ -583,6 +582,9 @@ static int bch_writeback_thread(void *arg) } } ++ dc->writeback_thread = NULL; + cached_dev_put(dc); + return 0; } -@@ -547,10 +548,13 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) +@@ -646,10 +648,13 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) if (!dc->writeback_write_wq) return -ENOMEM; @@ -154,10 +158,10 @@ index a57149803df6..0789a9e18337 100644 schedule_delayed_work(&dc->writeback_rate_update, dc->writeback_rate_update_seconds * HZ); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h -index 1d284f3d0363..aab21afe49cf 100644 +index f102b1f9bc51..665dd3184ab3 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h -@@ -92,8 +92,6 @@ static inline void bch_writeback_add(struct cached_dev *dc) +@@ -95,8 +95,6 @@ static inline void bch_writeback_add(struct cached_dev *dc) { if (!atomic_read(&dc->has_dirty) && !atomic_xchg(&dc->has_dirty, 1)) { diff --git a/for-next/v2-0003-bcache-reduce-cache_set-devices-iteration-by-devi.patch b/for-next/v2-0003-bcache-reduce-cache_set-devices-iteration-by-devi.patch deleted file mode 100644 index 002212e..0000000 --- a/for-next/v2-0003-bcache-reduce-cache_set-devices-iteration-by-devi.patch +++ /dev/null @@ -1,119 +0,0 @@ -From fd33195d255d0f152d9e2b36032b1cc816ededb3 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 20 Dec 2017 23:27:41 +0800 -Subject: [PATCH v2 03/13] bcache: reduce cache_set devices iteration by - devices_max_used - -Member devices of struct cache_set is used to reference all attached -bcache devices to this cache set. If it is treated as array of pointers, -size of devices[] is indicated by member nr_uuids of struct cache_set. - -nr_uuids is calculated in drivers/md/super.c:bch_cache_set_alloc(), - bucket_bytes(c) / sizeof(struct uuid_entry) -Bucket size is determined by user space tool "make-bcache", by default it -is 1024 sectors (defined in bcache-tools/make-bcache.c:main()). So default -nr_uuids value is 4096 from the above calculation. - -Every time when bcache code iterates bcache devices of a cache set, all -the 4096 pointers are checked even only 1 bcache device is attached to the -cache set, that's a wast of time and unncessary. - -This patch adds a member devices_max_used to struct cache_set. Its value -is 1 + the maximum used index of devices[] in a cache set. When iterating -all valid bcache devices of a cache set, use c->devices_max_used in -for-loop may reduce a lot of useless checking. - -Personally, my motivation of this patch is not for performance, I use it -in bcache debugging, which helps me to narrow down the scape to check -valid bcached devices of a cache set. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bcache.h | 1 + - drivers/md/bcache/btree.c | 2 +- - drivers/md/bcache/super.c | 9 ++++++--- - drivers/md/bcache/writeback.h | 2 +- - 4 files changed, 9 insertions(+), 5 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 843877e017e1..83c569942bd0 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -488,6 +488,7 @@ struct cache_set { - int caches_loaded; - - struct bcache_device **devices; -+ unsigned devices_max_used; - struct list_head cached_devs; - uint64_t cached_dev_sectors; - struct closure caching; -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index 81e8dc3dbe5e..bf0d7978bc3d 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ -1678,7 +1678,7 @@ static void bch_btree_gc_finish(struct cache_set *c) - - /* don't reclaim buckets to which writeback keys point */ - rcu_read_lock(); -- for (i = 0; i < c->nr_uuids; i++) { -+ for (i = 0; i < c->devices_max_used; i++) { - struct bcache_device *d = c->devices[i]; - struct cached_dev *dc; - struct keybuf_key *w, *n; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index b4d28928dec5..064efd869017 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -721,6 +721,9 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, - d->c = c; - c->devices[id] = d; - -+ if (id >= c->devices_max_used) -+ c->devices_max_used = id + 1; -+ - closure_get(&c->caching); - } - -@@ -1261,7 +1264,7 @@ static int flash_devs_run(struct cache_set *c) - struct uuid_entry *u; - - for (u = c->uuids; -- u < c->uuids + c->nr_uuids && !ret; -+ u < c->uuids + c->devices_max_used && !ret; - u++) - if (UUID_FLASH_ONLY(u)) - ret = flash_dev_run(c, u); -@@ -1427,7 +1430,7 @@ static void __cache_set_unregister(struct closure *cl) - - mutex_lock(&bch_register_lock); - -- for (i = 0; i < c->nr_uuids; i++) -+ for (i = 0; i < c->devices_max_used; i++) - if (c->devices[i]) { - if (!UUID_FLASH_ONLY(&c->uuids[i]) && - test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { -@@ -1490,7 +1493,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) - c->bucket_bits = ilog2(sb->bucket_size); - c->block_bits = ilog2(sb->block_size); - c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); -- -+ c->devices_max_used = 0; - c->btree_pages = bucket_pages(c); - if (c->btree_pages > BTREE_MAX_PAGES) - c->btree_pages = max_t(int, c->btree_pages / 4, -diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h -index a9e3ffb4b03c..1d284f3d0363 100644 ---- a/drivers/md/bcache/writeback.h -+++ b/drivers/md/bcache/writeback.h -@@ -21,7 +21,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) - - mutex_lock(&bch_register_lock); - -- for (i = 0; i < c->nr_uuids; i++) { -+ for (i = 0; i < c->devices_max_used; i++) { - struct bcache_device *d = c->devices[i]; - - if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) --- -2.15.1 - diff --git a/for-next/v2-0004-bcache-stop-dc-writeback_rate_update-properly.patch b/for-next/v2-0004-bcache-stop-dc-writeback_rate_update-properly.patch new file mode 100644 index 0000000..2b28dc3 --- /dev/null +++ b/for-next/v2-0004-bcache-stop-dc-writeback_rate_update-properly.patch @@ -0,0 +1,224 @@ +From 41e9ba093c2394bb2ecfac36ff1ca0cb38050f8d Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 9 Jan 2018 21:46:21 +0800 +Subject: [PATCH v2 04/11] bcache: stop dc->writeback_rate_update properly + +struct delayed_work writeback_rate_update in struct cache_dev is a delayed +worker to call function update_writeback_rate() in period (the interval is +defined by dc->writeback_rate_update_seconds). + +When a metadate I/O error happens on cache device, bcache error handling +routine bch_cache_set_error() will call bch_cache_set_unregister() to +retire whole cache set. On the unregister code path, this delayed work is +stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update). + +dc->writeback_rate_update is a special delayed work from others in bcache. +In its routine update_writeback_rate(), this delayed work is re-armed +itself. That means when cancel_delayed_work_sync() returns, this delayed +work can still be executed after several seconds defined by +dc->writeback_rate_update_seconds. + +The problem is, after cancel_delayed_work_sync() returns, the cache set +unregister code path will continue and release memory of struct cache set. +Then the delayed work is scheduled to run, __update_writeback_rate() +will reference the already released cache_set memory, and trigger a NULL +pointer deference fault. + +This patch moves cancel_delayed_work_sync() from cached_dev_detach_finish() +to bch_cached_dev_detach(), the latter one is called before +cache_set_flush() gets called, we don't need to worry that cache_set memory +is freed during __update_writeback_rate() is executing. + +Here is how this patch solves the access-after-freed problem. +- Hold a cached_dev refcount. + When first time to schedule dc->writeback_rate_update, cached_dev_get() is +called to hold a refcount of cached_dev. Then if this delayed work re-arms +itself, the holding refcount will make sure cached_dev_detach_finish() +won't be called and all objects referenced by __update_writeback_rate() +won't be freed. +- Moves cancel_delayed_work_sync() to bch_cached_dev_detach() + Currently cancel_delayed_work_sync() is called in +cached_dev_detach_finish(), since a refcount is taken when the first time +dc->writeback_rate_update is schedule to run, if this refcount is not +dropped by stopping the delayed work, cached_dev_detach_finish() will not +be executed, this is a deadlock. Therefore cancel_delayed_work_sync() is +moved to bch_cached_dev_detach(), and everything will be OK. +- Add BCACHE_DEV_WB_RUNNING to bcache device flags + Set the flag before first time scheduling dc->writeback_rate_update, and +clear the bit before calling cancel_delayed_work_sync(). Then inside +update_writeback_rate() if BCACHE_DEV_WB_RUNNING is clear, it means this +delayed work should call cached_dev_put() and quit. + If BCACHE_DEV_WB_RUNNING is set and cancel_delayed_work() is called +outside after test_bit() and before schedule_delayed_work() in +update_writeback_rate(), the delayed work will be re-armed while holding +a refcount of cached_dev. So when next time it is scheduled to run, +all referenced memory objects are still there and safe to access. And in +next run update_writeback_rate() will find BCACHE_DEV_WB_RUNNING is set, +it will call cached_dev_put() and quit in safe. + +If dc->writeback_thread quits before dc->writeback_rate_update, calling +cached_dev_put() in update_writeback_rate() will drop last refcount of +cached_dev, then cached_dev_detach_finish() will be called, then in +bcache_device_detach() last refcount of closure cl->caching will be dropped +and cache_set_flush() will be called. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Huijun Tang <tang.junhui@zte.com.cn> +--- + drivers/md/bcache/bcache.h | 2 +- + drivers/md/bcache/super.c | 22 +++++++++++++++------- + drivers/md/bcache/sysfs.c | 10 +++++++++- + drivers/md/bcache/writeback.c | 26 ++++++++++++++++++++++++++ + 4 files changed, 51 insertions(+), 9 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 5e2d4e80198e..147413ae5890 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -261,7 +261,7 @@ struct bcache_device { + #define BCACHE_DEV_CLOSING 0 + #define BCACHE_DEV_DETACHING 1 + #define BCACHE_DEV_UNLINK_DONE 2 +- ++#define BCACHE_DEV_WB_RUNNING 4 + unsigned nr_stripes; + unsigned stripe_size; + atomic_t *stripe_sectors_dirty; +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 805b6c37c063..7aa5786fcd9e 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -911,12 +911,6 @@ static void cached_dev_detach_finish(struct work_struct *w) + + mutex_lock(&bch_register_lock); + +- cancel_delayed_work_sync(&dc->writeback_rate_update); +- if (!IS_ERR_OR_NULL(dc->writeback_thread)) { +- kthread_stop(dc->writeback_thread); +- dc->writeback_thread = NULL; +- } +- + memset(&dc->sb.set_uuid, 0, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); + +@@ -954,6 +948,19 @@ void bch_cached_dev_detach(struct cached_dev *dc) + closure_get(&dc->disk.cl); + + bch_writeback_queue(dc); ++ ++ /* ++ * Stop dc->writeback_rate_update before cache_set_flush gets called, ++ * to make sure cache set memory won't be freed during routine ++ * update_writeback_rate() is executing. ++ */ ++ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) ++ cancel_delayed_work_sync(&dc->writeback_rate_update); ++ if (!IS_ERR_OR_NULL(dc->writeback_thread)) { ++ kthread_stop(dc->writeback_thread); ++ dc->writeback_thread = NULL; ++ } ++ + cached_dev_put(dc); + } + +@@ -1079,7 +1086,8 @@ static void cached_dev_free(struct closure *cl) + { + struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + +- cancel_delayed_work_sync(&dc->writeback_rate_update); ++ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) ++ cancel_delayed_work_sync(&dc->writeback_rate_update); + if (!IS_ERR_OR_NULL(dc->writeback_thread)) + kthread_stop(dc->writeback_thread); + if (dc->writeback_write_wq) +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index b4184092c727..dcbeda994063 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -300,9 +300,17 @@ STORE(bch_cached_dev) + if (attr == &sysfs_writeback_running) + bch_writeback_queue(dc); + +- if (attr == &sysfs_writeback_percent) ++ if (attr == &sysfs_writeback_percent) { ++ /* ++ * If this is the first time to run dc->writeback_rate_update, ++ * get a cached_dev refcount. This refcount will be dropped inside ++ * update_writeback_rate(). ++ */ ++ if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) ++ cached_dev_get(dc); + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); ++ } + + mutex_unlock(&bch_register_lock); + return size; +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 228271fe622b..0ec23987f098 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -86,11 +86,24 @@ static void __update_writeback_rate(struct cached_dev *dc) + dc->writeback_rate_target = target; + } + ++ ++/* ++ * BCACHE_DEV_WB_RUNNING is set before schedule dc->writeback_rate_update ++ * to run, and cached_dev_get() is also called where this bit is set. ++ * If BCACHE_DEV_WB_RUNNING is clear here, it means cancel_delayed_work_sync() ++ * is called to stop this delayed work. So it won't re-arm itself, and drop ++ * cached_dev refcount by cache_dev_put() and quit immediately. ++ */ ++ + static void update_writeback_rate(struct work_struct *work) + { + struct cached_dev *dc = container_of(to_delayed_work(work), + struct cached_dev, + writeback_rate_update); ++ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { ++ cached_dev_put(dc); ++ return; ++ } + + down_read(&dc->writeback_lock); + +@@ -100,6 +113,11 @@ static void update_writeback_rate(struct work_struct *work) + + up_read(&dc->writeback_lock); + ++ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { ++ cached_dev_put(dc); ++ return; ++ } ++ + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + } +@@ -638,6 +656,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) + dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_i_term_inverse = 10000; + ++ BUG_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); + } + +@@ -656,6 +675,13 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) + return PTR_ERR(dc->writeback_thread); + } + ++ /* ++ * If dc->writeback_rate_update is first time to run, get a ++ * cached_dev refcount. This refcount will be dropped inside ++ * update_writeback_rate(). ++ */ ++ if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) ++ cached_dev_get(dc); + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + +-- +2.15.1 + diff --git a/for-next/v2-0007-bcache-set-error_limit-correctly.patch b/for-next/v2-0005-bcache-set-error_limit-correctly.patch index 10f0217..4153421 100644 --- a/for-next/v2-0007-bcache-set-error_limit-correctly.patch +++ b/for-next/v2-0005-bcache-set-error_limit-correctly.patch @@ -1,7 +1,7 @@ -From 93e9a82ee54b8fb1e50c4df95a74ab2953aec9ff Mon Sep 17 00:00:00 2001 +From 1cd65ce62d9e25bbad99c48afcca28229bca680d Mon Sep 17 00:00:00 2001 From: Coly Li <colyli@suse.de> -Date: Wed, 3 Jan 2018 20:37:27 +0800 -Subject: [PATCH v2 07/13] bcache: set error_limit correctly +Date: Tue, 9 Jan 2018 22:46:25 +0800 +Subject: [PATCH v2 05/11] bcache: set error_limit correctly Struct cache uses io_errors for two purposes, - Error decay: when cache set error_decay is set, io_errors is used to @@ -56,6 +56,9 @@ the default io error limit value makes sense, it is enough for most of cache devices. Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Huijun Tang <tang.junhui@zte.com.cn> --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/super.c | 2 +- @@ -63,10 +66,10 @@ Signed-off-by: Coly Li <colyli@suse.de> 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 395b87942a2f..a31dc3737dae 100644 +index 147413ae5890..9d18a02db723 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h -@@ -654,6 +654,7 @@ struct cache_set { +@@ -662,6 +662,7 @@ struct cache_set { ON_ERROR_UNREGISTER, ON_ERROR_PANIC, } on_error; @@ -75,10 +78,10 @@ index 395b87942a2f..a31dc3737dae 100644 unsigned error_decay; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 8912be4165c5..02d9d7110769 100644 +index 7aa5786fcd9e..858d3dc4dbc8 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c -@@ -1561,7 +1561,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) +@@ -1560,7 +1560,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; @@ -88,10 +91,10 @@ index 8912be4165c5..02d9d7110769 100644 return c; err: diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index b4184092c727..d7ce9a05b304 100644 +index dcbeda994063..8a89c3c4bdab 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c -@@ -556,7 +556,7 @@ SHOW(__bch_cache_set) +@@ -564,7 +564,7 @@ SHOW(__bch_cache_set) /* See count_io_errors for why 88 */ sysfs_print(io_error_halflife, c->error_decay * 88); @@ -100,7 +103,7 @@ index b4184092c727..d7ce9a05b304 100644 sysfs_hprint(congested, ((uint64_t) bch_get_congested(c)) << 9); -@@ -656,7 +656,7 @@ STORE(__bch_cache_set) +@@ -664,7 +664,7 @@ STORE(__bch_cache_set) } if (attr == &sysfs_io_error_limit) diff --git a/for-next/v2-0005-bcache-stop-dc-writeback_rate_update-if-cache-set.patch b/for-next/v2-0005-bcache-stop-dc-writeback_rate_update-if-cache-set.patch deleted file mode 100644 index 489dee1..0000000 --- a/for-next/v2-0005-bcache-stop-dc-writeback_rate_update-if-cache-set.patch +++ /dev/null @@ -1,68 +0,0 @@ -From 1a9aae02c180b47b2ae2ef9c61915b2b694d1fc2 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 23 Dec 2017 01:50:19 +0800 -Subject: [PATCH v2 05/13] bcache: stop dc->writeback_rate_update if cache set - is stopping - -struct delayed_work writeback_rate_update in struct cache_dev is a delayed -worker to call function update_writeback_rate() in period (the interval is -defined by dc->writeback_rate_update_seconds). - -When a metadate I/O error happens on cache device, bcache error handling -routine bch_cache_set_error() will call bch_cache_set_unregister() to -retire whole cache set. On the unregister code path, cached_dev_free() -calls cancel_delayed_work_sync(&dc->writeback_rate_update) to stop this -delayed work. - -dc->writeback_rate_update is a special delayed work from others in bcache. -In its routine update_writeback_rate(), this delayed work is re-armed -after a piece of time. That means when cancel_delayed_work_sync() returns, -this delayed work can still be executed after several seconds defined by -dc->writeback_rate_update_seconds. - -The problem is, after cancel_delayed_work_sync() returns, the cache set -unregister code path will eventually release memory of struct cache set. -Then the delayed work is scheduled to run, and inside its rountine -update_writeback_rate() that already released cache set NULL pointer will -be accessed. Now a NULL pointer deference panic is triggered. - -In order to avoid the above problem, this patch checks cache set flags in -delayed work routine update_writeback_rate(). If flag CACHE_SET_STOPPING -is set, this routine will quit without re-arm the delayed work. Then the -NULL pointer deference panic won't happen after cache set is released. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/writeback.c | 9 +++++++++ - 1 file changed, 9 insertions(+) - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 0789a9e18337..745d9b2a326f 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -91,6 +91,11 @@ static void update_writeback_rate(struct work_struct *work) - struct cached_dev *dc = container_of(to_delayed_work(work), - struct cached_dev, - writeback_rate_update); -+ struct cache_set *c = dc->disk.c; -+ -+ /* quit directly if cache set is stopping */ -+ if (test_bit(CACHE_SET_STOPPING, &c->flags)) -+ return; - - down_read(&dc->writeback_lock); - -@@ -100,6 +105,10 @@ static void update_writeback_rate(struct work_struct *work) - - up_read(&dc->writeback_lock); - -+ /* do not schedule delayed work if cache set is stopping */ -+ if (test_bit(CACHE_SET_STOPPING, &c->flags)) -+ return; -+ - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - } --- -2.15.1 - diff --git a/for-next/v2-0009-bcache-add-io_disable-to-struct-cache_set.patch b/for-next/v2-0006-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch index eaaeb51..00f826d 100644 --- a/for-next/v2-0009-bcache-add-io_disable-to-struct-cache_set.patch +++ b/for-next/v2-0006-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch @@ -1,7 +1,8 @@ -From 5996e95d633ad28ebbd113004efc488162cd22b7 Mon Sep 17 00:00:00 2001 +From 374b72e439ddac280759ea97ba934272d2978698 Mon Sep 17 00:00:00 2001 From: Coly Li <colyli@suse.de> -Date: Tue, 2 Jan 2018 17:31:07 +0800 -Subject: [PATCH v2 09/13] bcache: add io_disable to struct cache_set +Date: Wed, 10 Jan 2018 00:18:01 +0800 +Subject: [PATCH v2 06/11] bcache: add CACHE_SET_IO_DISABLE to struct cache_set + flags When too many I/Os failed on cache device, bch_cache_set_error() is called in the error handling code path to retire whole problematic cache set. If @@ -16,60 +17,76 @@ prevent the cache set from being retired. The solution in this patch is, to add per cache set flag to disable I/O request on this cache and all attached backing devices. Then new coming I/O requests can be rejected in *_make_request() before taking refcount, kernel -threads and self-armed kernel worker can stop very fast when io_disable is -true. +threads and self-armed kernel worker can stop very fast when flags bit +CACHE_SET_IO_DISABLE is set. Because bcache also do internal I/Os for writeback, garbage collection, bucket allocation, journaling, this kind of I/O should be disabled after bch_cache_set_error() is called. So closure_bio_submit() is modified to -check whether cache_set->io_disable is true. If cache_set->io_disable is -true, closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and +check whether CACHE_SET_IO_DISABLE is set on cache_set->flags. If set, +closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and return, generic_make_request() won't be called. -A sysfs interface is also added for cache_set->io_disable, to read and set -io_disable value for debugging. It is helpful to trigger more corner case -issues for failed cache device. +A sysfs interface is also added to set or clear CACHE_SET_IO_DISABLE bit +from cache_set->flags, to disable or enable cache set I/O for debugging. It +is helpful to trigger more corner case issues for failed cache device. + +Changelog +v2, use cache_set->flags to set io disable bit, suggested by Junhui Tang. +v1, initial version. Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Huijun Tang <tang.junhui@zte.com.cn> +Cc: Michael Lyle <mlyle@lyle.org> --- - drivers/md/bcache/alloc.c | 2 +- - drivers/md/bcache/bcache.h | 14 ++++++++++++++ - drivers/md/bcache/btree.c | 6 ++++-- + drivers/md/bcache/alloc.c | 3 ++- + drivers/md/bcache/bcache.h | 18 ++++++++++++++++++ + drivers/md/bcache/btree.c | 7 +++++-- drivers/md/bcache/io.c | 2 +- drivers/md/bcache/journal.c | 4 ++-- drivers/md/bcache/request.c | 26 +++++++++++++++++++------- - drivers/md/bcache/super.c | 7 ++++++- - drivers/md/bcache/sysfs.c | 4 ++++ + drivers/md/bcache/super.c | 6 +++++- + drivers/md/bcache/sysfs.c | 19 +++++++++++++++++++ drivers/md/bcache/util.h | 6 ------ - drivers/md/bcache/writeback.c | 34 ++++++++++++++++++++++------------ - 10 files changed, 73 insertions(+), 32 deletions(-) + drivers/md/bcache/writeback.c | 36 ++++++++++++++++++++++++++++-------- + 10 files changed, 99 insertions(+), 28 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c -index 48c002faf08d..3be737582f27 100644 +index 458e1d38577d..004cc3cc6123 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c -@@ -286,7 +286,7 @@ do { \ +@@ -287,7 +287,8 @@ do { \ break; \ \ mutex_unlock(&(ca)->set->bucket_lock); \ -- if (kthread_should_stop()) \ -+ if (kthread_should_stop() || ca->set->io_disable) \ +- if (kthread_should_stop()) { \ ++ if (kthread_should_stop() || \ ++ test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) { \ + set_current_state(TASK_RUNNING); \ return 0; \ - \ - set_current_state(TASK_INTERRUPTIBLE); \ + } \ diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index c53f312b2216..9c7f9b1cb791 100644 +index 9d18a02db723..ab51984d12f6 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h -@@ -481,6 +481,7 @@ struct cache_set { - struct cache_accounting accounting; - - unsigned long flags; -+ bool io_disable; - - struct cache_sb sb; - -@@ -853,6 +854,19 @@ static inline void wake_up_allocators(struct cache_set *c) +@@ -474,10 +474,15 @@ struct gc_stat { + * + * CACHE_SET_RUNNING means all cache devices have been registered and journal + * replay is complete. ++ * ++ * CACHE_SET_IO_DISABLE is set when bcache is stopping the whold cache set, all ++ * external and internal I/O should be denied when this flag is set. ++ * + */ + #define CACHE_SET_UNREGISTERING 0 + #define CACHE_SET_STOPPING 1 + #define CACHE_SET_RUNNING 2 ++#define CACHE_SET_IO_DISABLE 4 + + struct cache_set { + struct closure cl; +@@ -861,6 +866,19 @@ static inline void wake_up_allocators(struct cache_set *c) wake_up_process(ca->alloc_thread); } @@ -78,7 +95,7 @@ index c53f312b2216..9c7f9b1cb791 100644 + struct closure *cl) +{ + closure_get(cl); -+ if (unlikely(c->io_disable)) { ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return; @@ -90,20 +107,21 @@ index c53f312b2216..9c7f9b1cb791 100644 void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index bf0d7978bc3d..75470cce1177 100644 +index 208fd3b61add..7011fb75258c 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c -@@ -1788,9 +1788,11 @@ static int bch_gc_thread(void *arg) +@@ -1788,9 +1788,12 @@ static int bch_gc_thread(void *arg) while (1) { wait_event_interruptible(c->gc_wait, - kthread_should_stop() || gc_should_run(c)); -+ kthread_should_stop() || -+ c->io_disable || -+ gc_should_run(c)); ++ kthread_should_stop() || ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags) || ++ gc_should_run(c)); - if (kthread_should_stop()) -+ if (kthread_should_stop() || c->io_disable) ++ if (kthread_should_stop() || ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) break; set_gc_sectors(c); @@ -143,10 +161,10 @@ index a87165c1d8e5..979873641030 100644 continue_at(cl, journal_write_done, NULL); } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 643c3021624f..a85d6a605a8e 100644 +index dd8191cfdb14..12f5b95b3734 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c -@@ -725,7 +725,7 @@ static void cached_dev_read_error(struct closure *cl) +@@ -747,7 +747,7 @@ static void cached_dev_read_error(struct closure *cl) /* XXX: invalidate cache */ @@ -155,7 +173,7 @@ index 643c3021624f..a85d6a605a8e 100644 } continue_at(cl, cached_dev_cache_miss_done, NULL); -@@ -850,7 +850,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, +@@ -872,7 +872,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->cache_miss = miss; s->iop.bio = cache_bio; bio_get(cache_bio); @@ -164,7 +182,7 @@ index 643c3021624f..a85d6a605a8e 100644 return ret; out_put: -@@ -858,7 +858,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, +@@ -880,7 +880,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, out_submit: miss->bi_end_io = request_endio; miss->bi_private = &s->cl; @@ -173,7 +191,7 @@ index 643c3021624f..a85d6a605a8e 100644 return ret; } -@@ -923,7 +923,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) +@@ -945,7 +945,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) if ((bio_op(bio) != REQ_OP_DISCARD) || blk_queue_discard(bdev_get_queue(dc->bdev))) @@ -182,7 +200,7 @@ index 643c3021624f..a85d6a605a8e 100644 } else if (s->iop.writeback) { bch_writeback_add(dc); s->iop.bio = bio; -@@ -938,12 +938,12 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) +@@ -960,12 +960,12 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) flush->bi_private = cl; flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; @@ -197,7 +215,7 @@ index 643c3021624f..a85d6a605a8e 100644 } closure_call(&s->iop.cl, bch_data_insert, NULL, cl); -@@ -959,7 +959,7 @@ static void cached_dev_nodata(struct closure *cl) +@@ -981,7 +981,7 @@ static void cached_dev_nodata(struct closure *cl) bch_journal_meta(s->iop.c, cl); /* If it's a flush, we send the flush to the backing device too */ @@ -206,24 +224,24 @@ index 643c3021624f..a85d6a605a8e 100644 continue_at(cl, cached_dev_bio_complete, NULL); } -@@ -974,6 +974,12 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, +@@ -996,6 +996,12 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, struct cached_dev *dc = container_of(d, struct cached_dev, disk); int rw = bio_data_dir(bio); -+ if (unlikely(d->c && d->c->io_disable)) { ++ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; + } + + atomic_set(&dc->backing_idle, 0); generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - bio_set_dev(bio, dc->bdev); -@@ -1089,6 +1095,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, +@@ -1112,6 +1118,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, struct bcache_device *d = bio->bi_disk->private_data; int rw = bio_data_dir(bio); -+ if (unlikely(d->c->io_disable)) { ++ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; @@ -233,7 +251,7 @@ index 643c3021624f..a85d6a605a8e 100644 s = search_alloc(bio, d); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index bbe911847eea..7aa76c3e3556 100644 +index 858d3dc4dbc8..32b64090a056 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -521,7 +521,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, @@ -245,27 +263,26 @@ index bbe911847eea..7aa76c3e3556 100644 closure_sync(cl); } -@@ -1333,6 +1333,10 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) - acquire_console_sem(); - */ +@@ -1326,6 +1326,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) + test_bit(CACHE_SET_STOPPING, &c->flags)) + return false; -+ c->io_disable = true; -+ /* make others know io_disable is true earlier */ -+ smp_mb(); ++ if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) ++ pr_warn("bcache: CACHE_SET_IO_DISABLE already set"); + - printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid); - - va_start(args, fmt); -@@ -1564,6 +1568,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) + /* XXX: we can be called from atomic context + acquire_console_sem(); + */ +@@ -1561,6 +1564,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; c->error_limit = DEFAULT_IO_ERROR_LIMIT; -+ c->io_disable = false; ++ BUG_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); return c; err: diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index d7ce9a05b304..acce7c82e111 100644 +index 8a89c3c4bdab..fd04450b5f13 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -92,6 +92,7 @@ read_attribute(partial_stripes_expensive); @@ -276,23 +293,38 @@ index d7ce9a05b304..acce7c82e111 100644 rw_attribute(discard); rw_attribute(running); rw_attribute(label); -@@ -573,6 +574,7 @@ SHOW(__bch_cache_set) +@@ -581,6 +582,8 @@ SHOW(__bch_cache_set) sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); -+ sysfs_printf(io_disable, "%i", c->io_disable); ++ sysfs_printf(io_disable, "%i", ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)); if (attr == &sysfs_bset_tree_stats) return bch_bset_print_stats(c, buf); -@@ -663,6 +665,7 @@ STORE(__bch_cache_set) +@@ -670,6 +673,21 @@ STORE(__bch_cache_set) + if (attr == &sysfs_io_error_halflife) c->error_decay = strtoul_or_return(buf) / 88; ++ if (attr == &sysfs_io_disable) { ++ int v = strtoul_or_return(buf); ++ if (v) { ++ if (test_and_set_bit(CACHE_SET_IO_DISABLE, ++ &c->flags)) ++ pr_warn("bcache: CACHE_SET_IO_DISABLE" ++ " already set"); ++ } else { ++ if (!test_and_clear_bit(CACHE_SET_IO_DISABLE, ++ &c->flags)) ++ pr_warn("bcache: CACHE_SET_IO_DISABLE" ++ " already cleared"); ++ } ++ } ++ sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); -+ sysfs_strtoul_clamp(io_disable, c->io_disable, 0, 1); sysfs_strtoul(verify, c->verify); sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); - sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks); -@@ -744,6 +747,7 @@ static struct attribute *bch_cache_set_internal_files[] = { +@@ -752,6 +770,7 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, &sysfs_copy_gc_enabled, @@ -318,38 +350,40 @@ index ed5e8a412eb8..03e533631798 100644 uint64_t bch_crc64(const void *, size_t); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index e58f9be5ae43..54add41d2569 100644 +index 0ec23987f098..d806fc6d2a6e 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c -@@ -93,8 +93,11 @@ static void update_writeback_rate(struct work_struct *work) +@@ -100,7 +100,14 @@ static void update_writeback_rate(struct work_struct *work) + struct cached_dev *dc = container_of(to_delayed_work(work), + struct cached_dev, writeback_rate_update); - struct cache_set *c = dc->disk.c; - -- /* quit directly if cache set is stopping */ -- if (test_bit(CACHE_SET_STOPPING, &c->flags)) +- if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { ++ struct cache_set *c = dc->disk.c; ++ + /* -+ * quit directly if cache set is stopping. c->io_disable -+ * can be set via sysfs, check it here too. ++ * CACHE_SET_IO_DISABLE might be set via sysfs interface, ++ * check it here too. + */ -+ if (test_bit(CACHE_SET_STOPPING, &c->flags) || c->io_disable) ++ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) || ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + cached_dev_put(dc); return; - - down_read(&dc->writeback_lock); -@@ -105,8 +108,11 @@ static void update_writeback_rate(struct work_struct *work) + } +@@ -113,7 +120,12 @@ static void update_writeback_rate(struct work_struct *work) up_read(&dc->writeback_lock); -- /* do not schedule delayed work if cache set is stopping */ -- if (test_bit(CACHE_SET_STOPPING, &c->flags)) +- if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { + /* -+ * do not schedule delayed work if cache set is stopping, -+ * c->io_disable can be set via sysfs, check it here too. ++ * CACHE_SET_IO_DISABLE might be set via sysfs interface, ++ * check it here too. + */ -+ if (test_bit(CACHE_SET_STOPPING, &c->flags) || c->io_disable) ++ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) || ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + cached_dev_put(dc); return; - - schedule_delayed_work(&dc->writeback_rate_update, -@@ -217,7 +223,7 @@ static void write_dirty(struct closure *cl) + } +@@ -248,7 +260,7 @@ static void write_dirty(struct closure *cl) bio_set_dev(&io->bio, io->dc->bdev); io->bio.bi_end_io = dirty_endio; @@ -357,8 +391,8 @@ index e58f9be5ae43..54add41d2569 100644 + closure_bio_submit(io->dc->disk.c, &io->bio, cl); } - continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); -@@ -240,7 +246,7 @@ static void read_dirty_submit(struct closure *cl) + atomic_set(&dc->writeback_sequence_next, next_sequence); +@@ -274,7 +286,7 @@ static void read_dirty_submit(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); @@ -367,27 +401,29 @@ index e58f9be5ae43..54add41d2569 100644 continue_at(cl, write_dirty, io->dc->writeback_write_wq); } -@@ -259,7 +265,7 @@ static void read_dirty(struct cached_dev *dc) - * mempools. - */ +@@ -300,7 +312,9 @@ static void read_dirty(struct cached_dev *dc) -- while (!kthread_should_stop()) { -+ while (!(kthread_should_stop() || dc->disk.c->io_disable)) { - - w = bch_keybuf_next(&dc->writeback_keys); - if (!w) -@@ -269,7 +275,9 @@ static void read_dirty(struct cached_dev *dc) - - if (KEY_START(&w->key) != dc->last_read || - jiffies_to_msecs(delay) > 50) -- while (!kthread_should_stop() && delay) -+ while (!kthread_should_stop() && -+ !dc->disk.c->io_disable && -+ delay) - delay = schedule_timeout_interruptible(delay); + next = bch_keybuf_next(&dc->writeback_keys); - dc->last_read = KEY_OFFSET(&w->key); -@@ -450,18 +458,19 @@ static bool refill_dirty(struct cached_dev *dc) +- while (!kthread_should_stop() && next) { ++ while (!kthread_should_stop() && ++ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && ++ next) { + size = 0; + nk = 0; + +@@ -397,7 +411,9 @@ static void read_dirty(struct cached_dev *dc) + } + } + +- while (!kthread_should_stop() && delay) { ++ while (!kthread_should_stop() && ++ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && ++ delay) { + schedule_timeout_interruptible(delay); + delay = writeback_delay(dc, 0); + } +@@ -553,11 +569,13 @@ static bool refill_dirty(struct cached_dev *dc) static int bch_writeback_thread(void *arg) { struct cached_dev *dc = arg; @@ -397,37 +433,29 @@ index e58f9be5ae43..54add41d2569 100644 bch_ratelimit_reset(&dc->writeback_rate); - while (!kthread_should_stop()) { -+ while (!(kthread_should_stop() || c->io_disable)) { ++ while (!kthread_should_stop() && ++ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { down_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); if (!atomic_read(&dc->has_dirty) || - (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && +@@ -565,7 +583,8 @@ static int bch_writeback_thread(void *arg) !dc->writeback_running)) { up_write(&dc->writeback_lock); -- if (kthread_should_stop()) -+ if (kthread_should_stop() || c->io_disable) +- if (kthread_should_stop()) { ++ if (kthread_should_stop() || ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + set_current_state(TASK_RUNNING); break; + } +@@ -593,6 +612,7 @@ static int bch_writeback_thread(void *arg) - set_current_state(TASK_INTERRUPTIBLE); -@@ -485,8 +494,8 @@ static int bch_writeback_thread(void *arg) - if (searched_full_index) { - unsigned delay = dc->writeback_delay * HZ; - -- while (delay && -- !kthread_should_stop() && -+ while (delay && !kthread_should_stop() && -+ !c->io_disable && + while (delay && + !kthread_should_stop() && ++ !test_bit(CACHE_SET_IO_DISABLE, &c->flags) && !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) delay = schedule_timeout_interruptible(delay); -@@ -494,6 +503,7 @@ static int bch_writeback_thread(void *arg) - } - } - -+ dc->writeback_thread = NULL; - cached_dev_put(dc); - - return 0; -- 2.15.1 diff --git a/for-next/v2-0006-bcache-stop-dc-writeback_rate_update-dc-writeback.patch b/for-next/v2-0006-bcache-stop-dc-writeback_rate_update-dc-writeback.patch deleted file mode 100644 index d7438ca..0000000 --- a/for-next/v2-0006-bcache-stop-dc-writeback_rate_update-dc-writeback.patch +++ /dev/null @@ -1,122 +0,0 @@ -From 2da5b83720460c83d0f20d0771a0c955e60028e8 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 3 Jan 2018 00:03:45 +0800 -Subject: [PATCH v2 06/13] bcache: stop dc->writeback_rate_update, - dc->writeback_thread earlier - -Delayed worker dc->writeback_rate_update and kernel thread -dc->writeback_thread reference cache set data structure in their routine, -Therefor, before they are stopped, cache set should not be release. Other- -wise, NULL pointer deference will be triggered. - -Currenly delayed worker dc->writeback_rate_update and kernel thread -dc->writeback_thread are stopped in cached_dev_free(). When cache set is -retiring by too many I/O errors, cached_dev_free() is called when refcount -of bcache device's closure (disk.cl) reaches 0. In most of cases, last -refcount of disk.cl is dropped in last line of cached_dev_detach_finish(). -But in cached_dev_detach_finish() before calling closure_put(&dc->disk.cl), -bcache_device_detach() is called, and inside bcache_device_detach() -refcount of cache_set->caching is dropped by closure_put(&d->c->caching). - -It is very probably this is the last refcount of this closure, so routine -cache_set_flush() will be called (it is set in __cache_set_unregister()), -and its parent closure cache_set->cl may also drop its last refcount and -cache_set_free() is called too. In cache_set_free() the last refcount of -cache_set->kobj is dropped and then bch_cache_set_release() is called. Now -in bch_cache_set_release(), the memory of struct cache_set is freeed. - -bch_cache_set_release() is called before cached_dev_free(), then there is a -time window after cache set memory freed and before dc->writeback_thread -and dc->writeback_rate_update stopped, if one of them is scheduled to run, -a NULL pointer deference will be triggered. - -This patch fixes the above problem by stopping dc->writeback_thread and -dc->writeback_rate_update earlier in bcache_device_detach() before calling -closure_put(&d->c->caching). Because cancel_delayed_work_sync() and -kthread_stop() are synchronized operations, we can make sure cache set -is available when the delayed work and kthread are stopping. - -Because cached_dev_free() can also be called by writing 1 to sysfs file -/sys/block/bcache<N>/bcache/stop, this code path may not call -bcache_device_detach() if d-c is NULL. So stopping dc->writeback_thread -and dc->writeback_rate_update in cached_dev_free() is still necessary. In -order to avoid stop them twice, dc->rate_update_canceled is added to -indicate dc->writeback_rate_update is canceled, and dc->writeback_thread -is set to NULL to indicate it is stopped. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bcache.h | 1 + - drivers/md/bcache/super.c | 21 +++++++++++++++++++-- - drivers/md/bcache/writeback.c | 1 + - 3 files changed, 21 insertions(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 83c569942bd0..395b87942a2f 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -322,6 +322,7 @@ struct cached_dev { - - struct bch_ratelimit writeback_rate; - struct delayed_work writeback_rate_update; -+ bool rate_update_canceled; - - /* - * Internal to the writeback code, so read_dirty() can keep track of -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 5401d2356aa3..8912be4165c5 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -696,8 +696,20 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, - - static void bcache_device_detach(struct bcache_device *d) - { -+ struct cached_dev *dc; -+ - lockdep_assert_held(&bch_register_lock); - -+ dc = container_of(d, struct cached_dev, disk); -+ if (!IS_ERR_OR_NULL(dc->writeback_thread)) { -+ kthread_stop(dc->writeback_thread); -+ dc->writeback_thread = NULL; -+ } -+ if (!dc->rate_update_canceled) { -+ cancel_delayed_work_sync(&dc->writeback_rate_update); -+ dc->rate_update_canceled = true; -+ } -+ - if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { - struct uuid_entry *u = d->c->uuids + d->id; - -@@ -1071,9 +1083,14 @@ static void cached_dev_free(struct closure *cl) - { - struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); - -- cancel_delayed_work_sync(&dc->writeback_rate_update); -- if (!IS_ERR_OR_NULL(dc->writeback_thread)) -+ if (!dc->rate_update_canceled) { -+ cancel_delayed_work_sync(&dc->writeback_rate_update); -+ dc->rate_update_canceled = true; -+ } -+ if (!IS_ERR_OR_NULL(dc->writeback_thread)) { - kthread_stop(dc->writeback_thread); -+ dc->writeback_thread = NULL; -+ } - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 745d9b2a326f..ab2ac3d72393 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -548,6 +548,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) - dc->writeback_rate_i_term_inverse = 10000; - - INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); -+ dc->rate_update_canceled = false; - } - - int bch_cached_dev_writeback_start(struct cached_dev *dc) --- -2.15.1 - diff --git a/for-next/v2-0010-bcache-stop-all-attached-bcache-devices-for-a-ret.patch b/for-next/v2-0007-bcache-stop-all-attached-bcache-devices-for-a-ret.patch index 3bd6ae7..1973080 100644 --- a/for-next/v2-0010-bcache-stop-all-attached-bcache-devices-for-a-ret.patch +++ b/for-next/v2-0007-bcache-stop-all-attached-bcache-devices-for-a-ret.patch @@ -1,7 +1,7 @@ -From 830c365da083a0ecc03bd9d8ae2a664b506bb1e8 Mon Sep 17 00:00:00 2001 +From c4baae1be516264b2d8c2991af67800126273080 Mon Sep 17 00:00:00 2001 From: Coly Li <colyli@suse.de> -Date: Wed, 3 Jan 2018 18:24:55 +0800 -Subject: [PATCH v2 10/13] bcache: stop all attached bcache devices for a +Date: Wed, 10 Jan 2018 00:26:32 +0800 +Subject: [PATCH v2 07/11] bcache: stop all attached bcache devices for a retired cache set When there are too many I/O errors on cache device, current bcache code @@ -32,15 +32,18 @@ application will report I/O errors due to disappeared bcache device, and operation people will know the cache device is broken or disconnected. Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Huijun Tang <tang.junhui@zte.com.cn> +Cc: Michael Lyle <mlyle@lyle.org> --- drivers/md/bcache/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 7aa76c3e3556..3ba98ca9db79 100644 +index 32b64090a056..539569419780 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c -@@ -1459,6 +1459,14 @@ static void __cache_set_unregister(struct closure *cl) +@@ -1455,6 +1455,14 @@ static void __cache_set_unregister(struct closure *cl) dc = container_of(c->devices[i], struct cached_dev, disk); bch_cached_dev_detach(dc); @@ -50,7 +53,7 @@ index 7aa76c3e3556..3ba98ca9db79 100644 + * keep data consistency on cache and + * backing devices. + */ -+ if (c->io_disable) ++ if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) + bcache_device_stop(c->devices[i]); } else { bcache_device_stop(c->devices[i]); diff --git a/for-next/v2-0008-bcache-fix-inaccurate-io-state-for-detached-bcach.patch b/for-next/v2-0008-bcache-fix-inaccurate-io-state-for-detached-bcach.patch new file mode 100644 index 0000000..6494836 --- /dev/null +++ b/for-next/v2-0008-bcache-fix-inaccurate-io-state-for-detached-bcach.patch @@ -0,0 +1,118 @@ +From 501d81f7f2abd840b7bf03248f82ee550b6a784e Mon Sep 17 00:00:00 2001 +From: Tang Junhui <tang.junhui@zte.com.cn> +Date: Tue, 9 Jan 2018 10:27:11 +0800 +Subject: [PATCH v2 08/11] bcache: fix inaccurate io state for detached bcache + devices + +When we run IO in a detached device, and run iostat to shows IO status, +normally it will show like bellow (Omitted some fields): +Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util +sdd ... 15.89 0.53 1.82 0.20 2.23 1.81 52.30 +bcache0 ... 15.89 115.42 0.00 0.00 0.00 2.40 69.60 +but after IO stopped, there are still very big avgqu-sz and %util +values as bellow: +Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util +bcache0 ... 0 5326.32 0.00 0.00 0.00 0.00 100.10 + +The reason for this issue is that, only generic_start_io_acct() called +and no generic_end_io_acct() called for detached device in +cached_dev_make_request(). See the code: +//start generic_start_io_acct() +generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); +if (cached_dev_get(dc)) { + //will callback generic_end_io_acct() +} +else { + //will not call generic_end_io_acct() +} + +This patch calls generic_end_io_acct() in the end of IO for detached +devices, so we can show IO state correctly. + +(Modified to use GFP_NOIO in kzalloc() by Coly Li) + +Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> +Reviewed-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/request.c | 58 +++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 51 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index 12f5b95b3734..ae50b1a0fd46 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -986,6 +986,55 @@ static void cached_dev_nodata(struct closure *cl) + continue_at(cl, cached_dev_bio_complete, NULL); + } + ++struct detached_dev_io_private { ++ struct bcache_device *d; ++ unsigned long start_time; ++ bio_end_io_t *bi_end_io; ++ void *bi_private; ++}; ++ ++static void detatched_dev_end_io(struct bio *bio) ++{ ++ struct detached_dev_io_private *ddip; ++ ++ ddip = bio->bi_private; ++ bio->bi_end_io = ddip->bi_end_io; ++ bio->bi_private = ddip->bi_private; ++ ++ generic_end_io_acct(ddip->d->disk->queue, ++ bio_data_dir(bio), ++ &ddip->d->disk->part0, ddip->start_time); ++ ++ kfree(ddip); ++ ++ bio->bi_end_io(bio); ++} ++ ++static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) ++{ ++ struct detached_dev_io_private *ddip; ++ struct cached_dev *dc = container_of(d, struct cached_dev, disk); ++ ++ /* ++ * no need to call closure_get(&dc->disk.cl), ++ * because upper layer had already opened bcache device, ++ * which would call closure_get(&dc->disk.cl) ++ */ ++ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); ++ ddip->d = d; ++ ddip->start_time = jiffies; ++ ddip->bi_end_io = bio->bi_end_io; ++ ddip->bi_private = bio->bi_private; ++ bio->bi_end_io = detatched_dev_end_io; ++ bio->bi_private = ddip; ++ ++ if ((bio_op(bio) == REQ_OP_DISCARD) && ++ !blk_queue_discard(bdev_get_queue(dc->bdev))) ++ bio->bi_end_io(bio); ++ else ++ generic_make_request(bio); ++} ++ + /* Cached devices - read & write stuff */ + + static blk_qc_t cached_dev_make_request(struct request_queue *q, +@@ -1028,13 +1077,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, + else + cached_dev_read(dc, s); + } +- } else { +- if ((bio_op(bio) == REQ_OP_DISCARD) && +- !blk_queue_discard(bdev_get_queue(dc->bdev))) +- bio_endio(bio); +- else +- generic_make_request(bio); +- } ++ } else ++ detached_dev_do_request(d, bio); + + return BLK_QC_T_NONE; + } +-- +2.15.1 + diff --git a/for-next/v2-0008-bcache-fix-misleading-error-message-in-bch_count_.patch b/for-next/v2-0008-bcache-fix-misleading-error-message-in-bch_count_.patch deleted file mode 100644 index a186b5c..0000000 --- a/for-next/v2-0008-bcache-fix-misleading-error-message-in-bch_count_.patch +++ /dev/null @@ -1,118 +0,0 @@ -From 80d7abeee0b81a7ee0e3789bac9580f540437d0e Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 3 Jan 2018 15:59:33 +0800 -Subject: [PATCH v2 08/13] bcache: fix misleading error message in - bch_count_io_errors() - -Bcache only does recoverable I/O for read operations by calling -cached_dev_read_error(). For write opertions there is no I/O recovery for -failed requests. - -But in bch_count_io_errors() no matter read or write I/Os, before errors -counter reaches io error limit, pr_err() always prints "IO error on %, -recoverying". For write requests this information is misleading, because -there is no I/O recovery at all. - -This patch adds a parameter 'is_read' to bch_count_io_errors(), and only -prints "recovering" by pr_err() when the bio direction is READ. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bcache.h | 2 +- - drivers/md/bcache/io.c | 13 +++++++++---- - drivers/md/bcache/super.c | 4 +++- - drivers/md/bcache/writeback.c | 4 +++- - 4 files changed, 16 insertions(+), 7 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index a31dc3737dae..c53f312b2216 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -855,7 +855,7 @@ static inline void wake_up_allocators(struct cache_set *c) - - /* Forward declarations */ - --void bch_count_io_errors(struct cache *, blk_status_t, const char *); -+void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); - void bch_bbio_count_io_errors(struct cache_set *, struct bio *, - blk_status_t, const char *); - void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t, -diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c -index fac97ec2d0e2..a783c5a41ff1 100644 ---- a/drivers/md/bcache/io.c -+++ b/drivers/md/bcache/io.c -@@ -51,7 +51,10 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, - - /* IO errors */ - --void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) -+void bch_count_io_errors(struct cache *ca, -+ blk_status_t error, -+ int is_read, -+ const char *m) - { - /* - * The halflife of an error is: -@@ -94,8 +97,9 @@ void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) - errors >>= IO_ERROR_SHIFT; - - if (errors < ca->set->error_limit) -- pr_err("%s: IO error on %s, recovering", -- bdevname(ca->bdev, buf), m); -+ pr_err("%s: IO error on %s%s", -+ bdevname(ca->bdev, buf), m, -+ is_read ? ", recovering." : "."); - else - bch_cache_set_error(ca->set, - "%s: too many IO errors %s", -@@ -108,6 +112,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, - { - struct bbio *b = container_of(bio, struct bbio, bio); - struct cache *ca = PTR_CACHE(c, &b->key, 0); -+ int is_read = (bio_data_dir(bio) == READ ? 1 : 0); - - unsigned threshold = op_is_write(bio_op(bio)) - ? c->congested_write_threshold_us -@@ -129,7 +134,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, - atomic_inc(&c->congested); - } - -- bch_count_io_errors(ca, error, m); -+ bch_count_io_errors(ca, error, is_read, m); - } - - void bch_bbio_endio(struct cache_set *c, struct bio *bio, -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 02d9d7110769..bbe911847eea 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -274,7 +274,9 @@ static void write_super_endio(struct bio *bio) - { - struct cache *ca = bio->bi_private; - -- bch_count_io_errors(ca, bio->bi_status, "writing superblock"); -+ /* is_read = 0 */ -+ bch_count_io_errors(ca, bio->bi_status, 0, -+ "writing superblock"); - closure_put(&ca->set->sb_write); - } - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index ab2ac3d72393..e58f9be5ae43 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -228,8 +228,10 @@ static void read_dirty_endio(struct bio *bio) - struct keybuf_key *w = bio->bi_private; - struct dirty_io *io = w->private; - -+ /* is_read = 1 */ - bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), -- bio->bi_status, "reading dirty data from cache"); -+ bio->bi_status, 1, -+ "reading dirty data from cache"); - - dirty_endio(bio); - } --- -2.15.1 - diff --git a/for-next/v2-0011-bcache-add-backing_request_endio-for-bi_end_io-of.patch b/for-next/v2-0009-bcache-add-backing_request_endio-for-bi_end_io-of.patch index 6538f6d..19042ab 100644 --- a/for-next/v2-0011-bcache-add-backing_request_endio-for-bi_end_io-of.patch +++ b/for-next/v2-0009-bcache-add-backing_request_endio-for-bi_end_io-of.patch @@ -1,8 +1,8 @@ -From 9de59f63ca4328e4b8aefca35a1f6a565d14579c Mon Sep 17 00:00:00 2001 +From 8f4cbbd1bd89675b94ff34790c1a13b0547784d1 Mon Sep 17 00:00:00 2001 From: Coly Li <colyli@suse.de> -Date: Thu, 4 Jan 2018 14:38:54 +0800 -Subject: [PATCH v2 11/13] bcache: add backing_request_endio() for bi_end_io of - backing device I/O +Date: Wed, 10 Jan 2018 21:01:48 +0800 +Subject: [PATCH v2 09/11] bcache: add backing_request_endio() for bi_end_io of + attached backing device I/O In order to catch I/O error of backing device, a separate bi_end_io call back is required. Then a per backing device counter can record I/O @@ -15,6 +15,8 @@ handling. So far there is no real code logic change, I make this change a separate patch to make sure it is stable and reliable for further work. Signed-off-by: Coly Li <colyli@suse.de> +Cc: Huijun Tang <tang.junhui@zte.com.cn> +Cc: Michael Lyle <mlyle@lyle.org> --- drivers/md/bcache/request.c | 94 +++++++++++++++++++++++++++++++++++-------- drivers/md/bcache/super.c | 1 + @@ -22,7 +24,7 @@ Signed-off-by: Coly Li <colyli@suse.de> 3 files changed, 80 insertions(+), 16 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index a85d6a605a8e..81fe5b977ffd 100644 +index ae50b1a0fd46..2d843a0a3ed8 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -139,6 +139,7 @@ static void bch_data_invalidate(struct closure *cl) @@ -33,7 +35,7 @@ index a85d6a605a8e..81fe5b977ffd 100644 bio_put(bio); out: continue_at(cl, bch_data_insert_keys, op->wq); -@@ -608,6 +609,37 @@ static void request_endio(struct bio *bio) +@@ -630,6 +631,37 @@ static void request_endio(struct bio *bio) closure_put(cl); } @@ -71,7 +73,7 @@ index a85d6a605a8e..81fe5b977ffd 100644 static void bio_complete(struct search *s) { if (s->orig_bio) { -@@ -622,13 +654,21 @@ static void bio_complete(struct search *s) +@@ -644,13 +676,21 @@ static void bio_complete(struct search *s) } } @@ -95,7 +97,7 @@ index a85d6a605a8e..81fe5b977ffd 100644 bio->bi_private = &s->cl; bio_cnt_set(bio, 3); -@@ -654,7 +694,7 @@ static inline struct search *search_alloc(struct bio *bio, +@@ -676,7 +716,7 @@ static inline struct search *search_alloc(struct bio *bio, s = mempool_alloc(d->c->search, GFP_NOIO); closure_init(&s->cl, NULL); @@ -104,7 +106,7 @@ index a85d6a605a8e..81fe5b977ffd 100644 s->orig_bio = bio; s->cache_miss = NULL; -@@ -721,10 +761,11 @@ static void cached_dev_read_error(struct closure *cl) +@@ -743,10 +783,11 @@ static void cached_dev_read_error(struct closure *cl) trace_bcache_read_retry(s->orig_bio); s->iop.status = 0; @@ -117,7 +119,7 @@ index a85d6a605a8e..81fe5b977ffd 100644 closure_bio_submit(s->iop.c, bio, cl); } -@@ -837,7 +878,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, +@@ -859,7 +900,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, bio_copy_dev(cache_bio, miss); cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; @@ -126,7 +128,7 @@ index a85d6a605a8e..81fe5b977ffd 100644 cache_bio->bi_private = &s->cl; bch_bio_map(cache_bio, NULL); -@@ -850,14 +891,16 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, +@@ -872,14 +913,16 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->cache_miss = miss; s->iop.bio = cache_bio; bio_get(cache_bio); @@ -144,7 +146,7 @@ index a85d6a605a8e..81fe5b977ffd 100644 closure_bio_submit(s->iop.c, miss, &s->cl); return ret; } -@@ -921,31 +964,48 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) +@@ -943,31 +986,48 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) s->iop.bio = s->orig_bio; bio_get(s->iop.bio); @@ -203,7 +205,7 @@ index a85d6a605a8e..81fe5b977ffd 100644 closure_call(&s->iop.cl, bch_data_insert, NULL, cl); continue_at(cl, cached_dev_write_complete, NULL); } -@@ -959,6 +1019,7 @@ static void cached_dev_nodata(struct closure *cl) +@@ -981,6 +1041,7 @@ static void cached_dev_nodata(struct closure *cl) bch_journal_meta(s->iop.c, cl); /* If it's a flush, we send the flush to the backing device too */ @@ -211,16 +213,16 @@ index a85d6a605a8e..81fe5b977ffd 100644 closure_bio_submit(s->iop.c, bio, cl); continue_at(cl, cached_dev_bio_complete, NULL); -@@ -1010,6 +1071,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - !blk_queue_discard(bdev_get_queue(dc->bdev))) - bio_endio(bio); - else -+ /* I/O request sent to backing device */ - generic_make_request(bio); - } +@@ -1078,6 +1139,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, + cached_dev_read(dc, s); + } + } else ++ /* I/O request sent to backing device */ + detached_dev_do_request(d, bio); + return BLK_QC_T_NONE; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 3ba98ca9db79..3c996d2d1b85 100644 +index 539569419780..247b87020f1f 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -265,6 +265,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) @@ -232,10 +234,10 @@ index 3ba98ca9db79..3c996d2d1b85 100644 closure_return_with_destructor(cl, bch_write_bdev_super_unlock); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 54add41d2569..b4888458cc29 100644 +index d806fc6d2a6e..f668a8efc389 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c -@@ -223,6 +223,7 @@ static void write_dirty(struct closure *cl) +@@ -260,6 +260,7 @@ static void write_dirty(struct closure *cl) bio_set_dev(&io->bio, io->dc->bdev); io->bio.bi_end_io = dirty_endio; diff --git a/for-next/v2-0012-bcache-add-io_disable-to-struct-cached_dev.patch b/for-next/v2-0010-bcache-add-io_disable-to-struct-cached_dev.patch index 448faf2..2d50d60 100644 --- a/for-next/v2-0012-bcache-add-io_disable-to-struct-cached_dev.patch +++ b/for-next/v2-0010-bcache-add-io_disable-to-struct-cached_dev.patch @@ -1,7 +1,7 @@ -From 9b3fb44af80d00453efc8b2224008426f40f6351 Mon Sep 17 00:00:00 2001 +From 5b361eab149024c208c95b98b2dc2c2639b53ab5 Mon Sep 17 00:00:00 2001 From: Coly Li <colyli@suse.de> -Date: Fri, 5 Jan 2018 14:12:32 +0800 -Subject: [PATCH v2 12/13] bcache: add io_disable to struct cached_dev +Date: Wed, 10 Jan 2018 21:33:45 +0800 +Subject: [PATCH v2 10/11] bcache: add io_disable to struct cached_dev If a bcache device is configured to writeback mode, current code does not handle write I/O errors on backing devices properly. @@ -25,19 +25,22 @@ reach its error limit, backing device will be disabled and the associated bcache device will be removed from system. Signed-off-by: Coly Li <colyli@suse.de> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Hannes Reinecke <hare@suse.com> +Cc: Huijun Tang <tang.junhui@zte.com.cn> --- drivers/md/bcache/bcache.h | 7 +++++++ drivers/md/bcache/io.c | 14 ++++++++++++++ - drivers/md/bcache/request.c | 6 +++++- + drivers/md/bcache/request.c | 15 +++++++++++++-- drivers/md/bcache/super.c | 23 +++++++++++++++++++++++ - drivers/md/bcache/sysfs.c | 9 ++++++++- - 5 files changed, 57 insertions(+), 2 deletions(-) + drivers/md/bcache/sysfs.c | 14 +++++++++++++- + 5 files changed, 70 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 9c7f9b1cb791..653ea664de55 100644 +index ab51984d12f6..27e0c2f56751 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h -@@ -351,6 +351,7 @@ struct cached_dev { +@@ -359,6 +359,7 @@ struct cached_dev { unsigned sequential_cutoff; unsigned readahead; @@ -45,7 +48,7 @@ index 9c7f9b1cb791..653ea664de55 100644 unsigned verify:1; unsigned bypass_torture_test:1; -@@ -370,6 +371,10 @@ struct cached_dev { +@@ -378,6 +379,10 @@ struct cached_dev { unsigned writeback_rate_i_term_inverse; unsigned writeback_rate_p_term_inverse; unsigned writeback_rate_minimum; @@ -56,7 +59,7 @@ index 9c7f9b1cb791..653ea664de55 100644 }; enum alloc_reserve { -@@ -869,6 +874,7 @@ static inline void closure_bio_submit(struct cache_set *c, +@@ -881,6 +886,7 @@ static inline void closure_bio_submit(struct cache_set *c, /* Forward declarations */ @@ -64,7 +67,7 @@ index 9c7f9b1cb791..653ea664de55 100644 void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); void bch_bbio_count_io_errors(struct cache_set *, struct bio *, blk_status_t, const char *); -@@ -896,6 +902,7 @@ int bch_bucket_alloc_set(struct cache_set *, unsigned, +@@ -908,6 +914,7 @@ int bch_bucket_alloc_set(struct cache_set *, unsigned, struct bkey *, int, bool); bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, unsigned, unsigned, bool); @@ -98,10 +101,10 @@ index 8013ecbcdbda..7fac97ae036e 100644 void bch_count_io_errors(struct cache *ca, blk_status_t error, diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 81fe5b977ffd..36aac569275d 100644 +index 2d843a0a3ed8..ea8632515f7f 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c -@@ -615,6 +615,8 @@ static void backing_request_endio(struct bio *bio) +@@ -637,6 +637,8 @@ static void backing_request_endio(struct bio *bio) if (bio->bi_status) { struct search *s = container_of(cl, struct search, cl); @@ -110,7 +113,7 @@ index 81fe5b977ffd..36aac569275d 100644 /* * If a bio has REQ_PREFLUSH for writeback mode, it is * speically assembled in cached_dev_write() for a non-zero -@@ -634,6 +636,7 @@ static void backing_request_endio(struct bio *bio) +@@ -656,6 +658,7 @@ static void backing_request_endio(struct bio *bio) } s->recoverable = false; /* should count I/O error for backing device here */ @@ -118,21 +121,38 @@ index 81fe5b977ffd..36aac569275d 100644 } bio_put(bio); -@@ -1035,7 +1038,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, +@@ -1066,8 +1069,14 @@ static void detatched_dev_end_io(struct bio *bio) + bio_data_dir(bio), + &ddip->d->disk->part0, ddip->start_time); + +- kfree(ddip); ++ if (bio->bi_status) { ++ struct cached_dev *dc = container_of(ddip->d, ++ struct cached_dev, disk); ++ /* should count I/O error for backing device here */ ++ bch_count_backing_io_errors(dc, bio); ++ } + ++ kfree(ddip); + bio->bi_end_io(bio); + } + +@@ -1106,7 +1115,9 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, struct cached_dev *dc = container_of(d, struct cached_dev, disk); int rw = bio_data_dir(bio); -- if (unlikely(d->c && d->c->io_disable)) { -+ if (unlikely((d->c && d->c->io_disable) || -+ dc->io_disable)) { +- if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { ++ if (unlikely(d->c && ++ test_bit(CACHE_SET_IO_DISABLE, &d->c->flags) && ++ dc->io_disable)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return BLK_QC_T_NONE; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 3c996d2d1b85..c7a37045beaa 100644 +index 247b87020f1f..7e9acad46e65 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c -@@ -1168,6 +1168,10 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) +@@ -1165,6 +1165,10 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) max(dc->disk.disk->queue->backing_dev_info->ra_pages, q->backing_dev_info->ra_pages); @@ -143,7 +163,7 @@ index 3c996d2d1b85..c7a37045beaa 100644 bch_cached_dev_request_init(dc); bch_cached_dev_writeback_init(dc); return 0; -@@ -1319,6 +1323,25 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) +@@ -1316,6 +1320,25 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) return flash_dev_run(c, u); } @@ -170,7 +190,7 @@ index 3c996d2d1b85..c7a37045beaa 100644 __printf(2, 3) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index acce7c82e111..2308c5609c01 100644 +index fd04450b5f13..1a28a203e4fc 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -131,7 +131,9 @@ SHOW(__bch_cached_dev) @@ -184,16 +204,21 @@ index acce7c82e111..2308c5609c01 100644 var_print(writeback_rate_update_seconds); var_print(writeback_rate_i_term_inverse); var_print(writeback_rate_p_term_inverse); -@@ -220,6 +222,8 @@ STORE(__cached_dev) +@@ -220,6 +222,13 @@ STORE(__cached_dev) d_strtoul(writeback_rate_i_term_inverse); d_strtoul_nonzero(writeback_rate_p_term_inverse); + sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); -+ sysfs_strtoul_clamp(io_disable, dc->io_disable, 0, 1); ++ ++ if (attr == &sysfs_io_disable) { ++ int v = strtoul_or_return(buf); ++ dc->io_disable = v ? 1 : 0; ++ } ++ d_strtoi_h(sequential_cutoff); d_strtoi_h(readahead); -@@ -326,6 +330,9 @@ static struct attribute *bch_cached_dev_files[] = { +@@ -334,6 +343,9 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_rate_i_term_inverse, &sysfs_writeback_rate_p_term_inverse, &sysfs_writeback_rate_debug, diff --git a/for-next/v2-0011-bcache-stop-bcache-device-when-backing-device-is-.patch b/for-next/v2-0011-bcache-stop-bcache-device-when-backing-device-is-.patch new file mode 100644 index 0000000..3006a04 --- /dev/null +++ b/for-next/v2-0011-bcache-stop-bcache-device-when-backing-device-is-.patch @@ -0,0 +1,148 @@ +From 6e68ee1e93a5d5e018630e80fa14009b2f676f0f Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 10 Jan 2018 23:51:12 +0800 +Subject: [PATCH v2 11/11] bcache: stop bcache device when backing device is + offline + +Currently bcache does not handle backing device failure, if backing +device is offline and disconnected from system, its bcache device can still +be accessible. If the bcache device is in writeback mode, I/O requests even +can success if the requests hit on cache device. That is to say, when and +how bcache handles offline backing device is undefined. + +This patch tries to handle backing device offline in a rather simple way, +- Add cached_dev->status_update_thread kernel thread to update backing + device status in every 1 second. +- Add cached_dev->offline_seconds to record how many seconds the backing + device is observed to be offline. If the backing device is offline for + BACKING_DEV_OFFLINE_TIMEOUT (30) seconds, set dc->io_disable to 1 and + call bcache_device_stop() to stop the bache device which linked to the + offline backing device. + +Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds, +its bcache device will be removed, then user space application writing on +it will get error immediately, and handler the device failure in time. + +This patch is quite simple, does not handle more complicated situations. +Once the bcache device is stopped, users need to recovery the backing +device, register and attach it manually. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Hannes Reinecke <hare@suse.com> +Cc: Huijun Tang <tang.junhui@zte.com.cn> +--- + drivers/md/bcache/bcache.h | 2 ++ + drivers/md/bcache/super.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 57 insertions(+) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 27e0c2f56751..f666d4317624 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -337,6 +337,7 @@ struct cached_dev { + + struct keybuf writeback_keys; + ++ struct task_struct *status_update_thread; + /* + * Order the write-half of writeback operations strongly in dispatch + * order. (Maintain LBA order; don't allow reads completing out of +@@ -383,6 +384,7 @@ struct cached_dev { + #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 + atomic_t io_errors; + unsigned error_limit; ++ unsigned offline_seconds; + }; + + enum alloc_reserve { +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 7e9acad46e65..35ada04f0185 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -646,6 +646,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode, + unsigned int cmd, unsigned long arg) + { + struct bcache_device *d = b->bd_disk->private_data; ++ struct cached_dev *dc = container_of(d, struct cached_dev, disk); ++ ++ if (dc->io_disable) ++ return -EIO; ++ + return d->ioctl(d, mode, cmd, arg); + } + +@@ -856,6 +861,45 @@ static void calc_cached_dev_sectors(struct cache_set *c) + c->cached_dev_sectors = sectors; + } + ++#define BACKING_DEV_OFFLINE_TIMEOUT 30 ++static int cached_dev_status_update(void *arg) ++{ ++ struct cached_dev *dc = arg; ++ struct request_queue *q; ++ char buf[BDEVNAME_SIZE]; ++ ++ /* ++ * If this delayed worker is stopping outside, directly quit here. ++ * dc->io_disable might be set via sysfs interface, so check it ++ * here too. ++ */ ++ while (!kthread_should_stop() && !dc->io_disable) { ++ q = bdev_get_queue(dc->bdev); ++ if (blk_queue_dying(q)) ++ dc->offline_seconds ++; ++ else ++ dc->offline_seconds = 0; ++ ++ if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) { ++ pr_err("%s: device offline for %d seconds", ++ bdevname(dc->bdev, buf), ++ BACKING_DEV_OFFLINE_TIMEOUT); ++ pr_err("%s: disable I/O request due to backing " ++ "device offline", dc->disk.name); ++ dc->io_disable = true; ++ /* let others know earlier that io_disable is true */ ++ smp_mb(); ++ bcache_device_stop(&dc->disk); ++ break; ++ } ++ ++ schedule_timeout_interruptible(HZ); ++ } ++ ++ dc->status_update_thread = NULL; ++ return 0; ++} ++ + void bch_cached_dev_run(struct cached_dev *dc) + { + struct bcache_device *d = &dc->disk; +@@ -898,6 +942,15 @@ void bch_cached_dev_run(struct cached_dev *dc) + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || + sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) + pr_debug("error creating sysfs link"); ++ ++ dc->status_update_thread = kthread_create(cached_dev_status_update, ++ dc, ++ "bcache_status_update"); ++ if (IS_ERR(dc->status_update_thread)) { ++ pr_warn("bcache: failed to create bcache_status_update " ++ "kthread, continue to run without monitoring backing " ++ "device status"); ++ } + } + + static void cached_dev_detach_finish(struct work_struct *w) +@@ -1093,6 +1146,8 @@ static void cached_dev_free(struct closure *cl) + kthread_stop(dc->writeback_thread); + if (dc->writeback_write_wq) + destroy_workqueue(dc->writeback_write_wq); ++ if (!IS_ERR_OR_NULL(dc->status_update_thread)) ++ kthread_stop(dc->status_update_thread); + + mutex_lock(&bch_register_lock); + +-- +2.15.1 + diff --git a/for-next/v2-0013-bcache-stop-bcache-device-when-backing-device-is-.patch b/for-next/v2-0013-bcache-stop-bcache-device-when-backing-device-is-.patch deleted file mode 100644 index bdbd112..0000000 --- a/for-next/v2-0013-bcache-stop-bcache-device-when-backing-device-is-.patch +++ /dev/null @@ -1,99 +0,0 @@ -From 16d7b83245059ce3675b0b53363444cb0988a261 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 8 Jan 2018 09:56:33 +0800 -Subject: [PATCH v2 13/13] bcache: stop bcache device when backing device is - offline - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bcache.h | 2 ++ - drivers/md/bcache/super.c | 39 ++++++++++++++++++++++++++++++++++++++- - 2 files changed, 40 insertions(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 653ea664de55..22d98ba0512b 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -375,6 +375,8 @@ struct cached_dev { - #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 - atomic_t io_errors; - unsigned error_limit; -+ unsigned offline_seconds; -+ struct delayed_work device_status_update; - }; - - enum alloc_reserve { -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index c7a37045beaa..89cbbd83c79e 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -646,6 +646,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode, - unsigned int cmd, unsigned long arg) - { - struct bcache_device *d = b->bd_disk->private_data; -+ struct cached_dev *dc = container_of(d, struct cached_dev, disk); -+ -+ if (dc->io_disable) -+ return -EIO; -+ - return d->ioctl(d, mode, cmd, arg); - } - -@@ -1090,6 +1095,7 @@ static void cached_dev_free(struct closure *cl) - cancel_delayed_work_sync(&dc->writeback_rate_update); - dc->rate_update_canceled = true; - } -+ cancel_delayed_work_sync(&dc->device_status_update); - if (!IS_ERR_OR_NULL(dc->writeback_thread)) { - kthread_stop(dc->writeback_thread); - dc->writeback_thread = NULL; -@@ -1129,6 +1135,37 @@ static void cached_dev_flush(struct closure *cl) - continue_at(cl, cached_dev_free, system_wq); - } - -+#define BACKING_DEV_OFFLINE_TIMEOUT 30 -+static void cached_dev_status_update(struct work_struct *work) -+{ -+ struct cached_dev *dc; -+ struct request_queue *q; -+ char buf[BDEVNAME_SIZE]; -+ -+ dc = container_of(to_delayed_work(work), -+ struct cached_dev, -+ device_status_update); -+ q = bdev_get_queue(dc->bdev); -+ -+ if (blk_queue_dying(q)) -+ dc->offline_seconds ++; -+ else -+ dc->offline_seconds = 0; -+ -+ if (dc->offline_seconds > BACKING_DEV_OFFLINE_TIMEOUT) { -+ pr_err("%s: device offline for %d seconds", -+ bdevname(dc->bdev, buf), BACKING_DEV_OFFLINE_TIMEOUT); -+ pr_err("%s: disable I/O request due to backing device offline", -+ dc->disk.name); -+ dc->io_disable = true; -+ /* let others know earlier that io_disable is true */ -+ smp_mb(); -+ return; -+ } -+ /* polling backing device connection status every second */ -+ schedule_delayed_work(&dc->device_status_update, HZ); -+} -+ - static int cached_dev_init(struct cached_dev *dc, unsigned block_size) - { - int ret; -@@ -1171,7 +1208,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) - atomic_set(&dc->io_errors, 0); - dc->io_disable = false; - dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; -- -+ INIT_DELAYED_WORK(&dc->device_status_update, cached_dev_status_update); - bch_cached_dev_request_init(dc); - bch_cached_dev_writeback_init(dc); - return 0; --- -2.15.1 - |