diff options
author | Coly Li <colyli@suse.de> | 2018-04-30 14:20:24 +0800 |
---|---|---|
committer | Coly Li <colyli@suse.de> | 2018-04-30 14:20:24 +0800 |
commit | 8cf7259febfc5df4c167ffb04b67874d807a48d6 (patch) | |
tree | f331fa08fc552b500699c0f8a6f16ebfd7335746 | |
parent | 3e2132d3eb52cfeedbe4f8dd9a557a6df385542a (diff) | |
download | bcache-patches-8cf7259febfc5df4c167ffb04b67874d807a48d6.tar.gz |
remove bcache device failure patch sets v1/2/3/4/5/6
73 files changed, 0 insertions, 12722 deletions
diff --git a/for-next/v1/v1-0000-cover-letter.patch b/for-next/v1/v1-0000-cover-letter.patch deleted file mode 100644 index 0ac36de..0000000 --- a/for-next/v1/v1-0000-cover-letter.patch +++ /dev/null @@ -1,56 +0,0 @@ -From 595d5d28a7ed23cae061b9e0dd201611afd6db6d Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 3 Jan 2018 21:20:57 +0800 -Subject: [PATCH v1 00/10] cache device failure handling improvement - -Hi maintainers and folks, - -This patch set tries to improve cache device failure handling. A basic -idea to handle failed cache device is, -- Unregister cache set -- Detach all backing devices attached to this cache set -- Stop all bcache devices linked to this cache set -The above process is named 'cache set retire' by me. The result of cache -set retire is, cache set and bcache devices are all removed, following -I/O requests will get failed immediately to notify upper layer or user -space code that the cache device is failed or disconnected. - -The first 8 patches of this patch set is to fix existing bugs in bcache, -the last 2 patches do the real improvement. Order of applying these patches -is important, if the last 2 patches are applied firstly, kernel panic or -process hang will be observed. Therefore I suggest to apply the first 8 -fixes, then apply the last 2 patches. - -The patch set is tested with writethrough, writeback, writearound mode, -read/write/readwrite workloads, so far it works as expected. IMHO the -cache set retire logic is complicated, I need your help to review the -patches, any question is warmly welcome. 
- -Coly Li (10): - bcache: exit bch_writeback_thread() with proper task state - bcache: set task properly in allocator_wait() - bcache: reduce cache_set devices iteration by devices_max_used - bcache: fix cached_dev->count usage for bch_cache_set_error() - bcache: stop dc->writeback_rate_update if cache set is stopping - bcache: stop dc->writeback_rate_update, dc->writeback_thread earlier - bcache: set error_limit correctly - bcache: fix misleading error message in bch_count_io_errors() - bcache: add io_disable to struct cache_set - bcache: stop all attached bcache devices for a retired cache set - - drivers/md/bcache/alloc.c | 5 ++--- - drivers/md/bcache/bcache.h | 19 +++++++++++++++- - drivers/md/bcache/btree.c | 8 ++++--- - drivers/md/bcache/io.c | 15 ++++++++----- - drivers/md/bcache/journal.c | 4 ++-- - drivers/md/bcache/request.c | 26 ++++++++++++++++------ - drivers/md/bcache/super.c | 51 +++++++++++++++++++++++++++++++++++-------- - drivers/md/bcache/sysfs.c | 8 +++++-- - drivers/md/bcache/util.h | 6 ----- - drivers/md/bcache/writeback.c | 51 +++++++++++++++++++++++++++++++++---------- - drivers/md/bcache/writeback.h | 4 +--- - 11 files changed, 144 insertions(+), 53 deletions(-) - -Thanks in advance. 
- -Coly Li diff --git a/for-next/v1/v1-0001-bcache-exit-bch_writeback_thread-with-proper-task.patch b/for-next/v1/v1-0001-bcache-exit-bch_writeback_thread-with-proper-task.patch deleted file mode 100644 index 1ce1bfb..0000000 --- a/for-next/v1/v1-0001-bcache-exit-bch_writeback_thread-with-proper-task.patch +++ /dev/null @@ -1,58 +0,0 @@ -From 02cd6111e6e305665b9b734b41d9e66735eefba5 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 20 Dec 2017 20:32:58 +0800 -Subject: [PATCH v1 01/10] bcache: exit bch_writeback_thread() with proper task - state - -Kernel thread routine bch_writeback_thread() has the following code block, - -452 set_current_state(TASK_INTERRUPTIBLE); -453 -454 if (kthread_should_stop()) -455 return 0; -456 -457 schedule(); -458 continue; - -At line 452, its status is set to TASK_INTERRUPTIBLE, and at line 454 if -kthread_should_stop() is true, a "return 0" at line 455 will return to function -kernel/kthread.c:kthread() and call do_exit(). - -It is not good to enter do_exit() with task state TASK_INTERRUPTIBLE, in -following code path might_sleep() is called and a warning message is -reported by __might_sleep(): "WARNING: do not call blocking ops when -!TASK_RUNNING; state=1 set at [xxxx]". - -Indeed it does not hurt when kernel thread exits with TASK_INTERRUPTIBLE -state, but this warning message scares users, makes them feel there might -be something risky with bcache and hurt their data. - -In this patch, TASK_INTERRUPTIBLE is set after kthread_should_stop(), -so writeback kernel thread can exit and enter do_exit() with -TASK_RUNNING state. Warning message from might_sleep() is removed. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/writeback.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 56a37884ca8b..a57149803df6 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -449,11 +449,11 @@ static int bch_writeback_thread(void *arg) - (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && - !dc->writeback_running)) { - up_write(&dc->writeback_lock); -- set_current_state(TASK_INTERRUPTIBLE); - - if (kthread_should_stop()) - return 0; - -+ set_current_state(TASK_INTERRUPTIBLE); - schedule(); - continue; - } --- -2.15.1 - diff --git a/for-next/v1/v1-0002-bcache-set-task-properly-in-allocator_wait.patch b/for-next/v1/v1-0002-bcache-set-task-properly-in-allocator_wait.patch deleted file mode 100644 index a9b6799..0000000 --- a/for-next/v1/v1-0002-bcache-set-task-properly-in-allocator_wait.patch +++ /dev/null @@ -1,79 +0,0 @@ -From 9eb34cfed6f7cf086a31d0e01f79548aaa82eab9 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 20 Dec 2017 22:37:11 +0800 -Subject: [PATCH v1 02/10] bcache: set task properly in allocator_wait() - -Kernel thread routine bch_allocator_thread() references macro -allocator_wait() to wait for a condition or quit to do_exit() -when kthread_should_stop() is true. 
- -Macro allocator_wait() has 2 issues in setting task state, let's -see its code piece, - -284 while (1) { \ -285 set_current_state(TASK_INTERRUPTIBLE); \ -286 if (cond) \ -287 break; \ -288 \ -289 mutex_unlock(&(ca)->set->bucket_lock); \ -290 if (kthread_should_stop()) \ -291 return 0; \ -292 \ -293 schedule(); \ -294 mutex_lock(&(ca)->set->bucket_lock); \ -295 } \ -296 __set_current_state(TASK_RUNNING); \ - -1) At line 285, task state is set to TASK_INTERRUPTIBLE, if at line 290 -kthread_should_stop() is true, the kernel thread will terminate and return -to kernel/kthread.c:kthread(), then calls do_exit() with TASK_INTERRUPTIBLE -state. This is not a suggested behavior and a warning message will be -reported by might_sleep() in do_exit() code path: "WARNING: do not call -blocking ops when !TASK_RUNNING; state=1 set at [xxxx]". - -2) Because task state is set to TASK_INTERRUPTIBLE at line 285, when break -while-loop the task state has to be set back to TASK_RUNNING at line 296. -Indeed it is unnecessary, if task state is set to TASK_INTERRUPTIBLE before -calling schedule() at line 293, we don't need to set the state back to -TASK_RUNNING at line 296 anymore. The reason is, allocator kthread is only -woken up by wake_up_process(), this routine makes sure the task state of -allocator kthread will be TASK_RUNNING after it returns from schedule() at -line 294 (see kernel/sched/core.c:try_to_wake_up() for more detailed -information). - -This patch fixes the above 2 issues by, -1) Setting TASK_INTERRUPTIBLE state just before calling schedule(). -2) Then setting TASK_RUNNING at line 296 is unnecessary, remove it. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/alloc.c | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c -index a0cc1bc6d884..48c002faf08d 100644 ---- a/drivers/md/bcache/alloc.c -+++ b/drivers/md/bcache/alloc.c -@@ -282,7 +282,6 @@ static void invalidate_buckets(struct cache *ca) - #define allocator_wait(ca, cond) \ - do { \ - while (1) { \ -- set_current_state(TASK_INTERRUPTIBLE); \ - if (cond) \ - break; \ - \ -@@ -290,10 +289,10 @@ do { \ - if (kthread_should_stop()) \ - return 0; \ - \ -+ set_current_state(TASK_INTERRUPTIBLE); \ - schedule(); \ - mutex_lock(&(ca)->set->bucket_lock); \ - } \ -- __set_current_state(TASK_RUNNING); \ - } while (0) - - static int bch_allocator_push(struct cache *ca, long bucket) --- -2.15.1 - diff --git a/for-next/v1/v1-0003-bcache-reduce-cache_set-devices-iteration-by-devi.patch b/for-next/v1/v1-0003-bcache-reduce-cache_set-devices-iteration-by-devi.patch deleted file mode 100644 index 8cbf66c..0000000 --- a/for-next/v1/v1-0003-bcache-reduce-cache_set-devices-iteration-by-devi.patch +++ /dev/null @@ -1,119 +0,0 @@ -From fd33195d255d0f152d9e2b36032b1cc816ededb3 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 20 Dec 2017 23:27:41 +0800 -Subject: [PATCH v1 03/10] bcache: reduce cache_set devices iteration by - devices_max_used - -Member devices of struct cache_set is used to reference all attached -bcache devices to this cache set. If it is treated as array of pointers, -size of devices[] is indicated by member nr_uuids of struct cache_set. - -nr_uuids is calculated in drivers/md/super.c:bch_cache_set_alloc(), - bucket_bytes(c) / sizeof(struct uuid_entry) -Bucket size is determined by user space tool "make-bcache", by default it -is 1024 sectors (defined in bcache-tools/make-bcache.c:main()). So default -nr_uuids value is 4096 from the above calculation. 
- -Every time when bcache code iterates bcache devices of a cache set, all -the 4096 pointers are checked even only 1 bcache device is attached to the -cache set, that's a wast of time and unncessary. - -This patch adds a member devices_max_used to struct cache_set. Its value -is 1 + the maximum used index of devices[] in a cache set. When iterating -all valid bcache devices of a cache set, use c->devices_max_used in -for-loop may reduce a lot of useless checking. - -Personally, my motivation of this patch is not for performance, I use it -in bcache debugging, which helps me to narrow down the scape to check -valid bcached devices of a cache set. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bcache.h | 1 + - drivers/md/bcache/btree.c | 2 +- - drivers/md/bcache/super.c | 9 ++++++--- - drivers/md/bcache/writeback.h | 2 +- - 4 files changed, 9 insertions(+), 5 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 843877e017e1..83c569942bd0 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -488,6 +488,7 @@ struct cache_set { - int caches_loaded; - - struct bcache_device **devices; -+ unsigned devices_max_used; - struct list_head cached_devs; - uint64_t cached_dev_sectors; - struct closure caching; -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index 81e8dc3dbe5e..bf0d7978bc3d 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ -1678,7 +1678,7 @@ static void bch_btree_gc_finish(struct cache_set *c) - - /* don't reclaim buckets to which writeback keys point */ - rcu_read_lock(); -- for (i = 0; i < c->nr_uuids; i++) { -+ for (i = 0; i < c->devices_max_used; i++) { - struct bcache_device *d = c->devices[i]; - struct cached_dev *dc; - struct keybuf_key *w, *n; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index b4d28928dec5..064efd869017 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ 
-721,6 +721,9 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, - d->c = c; - c->devices[id] = d; - -+ if (id >= c->devices_max_used) -+ c->devices_max_used = id + 1; -+ - closure_get(&c->caching); - } - -@@ -1261,7 +1264,7 @@ static int flash_devs_run(struct cache_set *c) - struct uuid_entry *u; - - for (u = c->uuids; -- u < c->uuids + c->nr_uuids && !ret; -+ u < c->uuids + c->devices_max_used && !ret; - u++) - if (UUID_FLASH_ONLY(u)) - ret = flash_dev_run(c, u); -@@ -1427,7 +1430,7 @@ static void __cache_set_unregister(struct closure *cl) - - mutex_lock(&bch_register_lock); - -- for (i = 0; i < c->nr_uuids; i++) -+ for (i = 0; i < c->devices_max_used; i++) - if (c->devices[i]) { - if (!UUID_FLASH_ONLY(&c->uuids[i]) && - test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { -@@ -1490,7 +1493,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) - c->bucket_bits = ilog2(sb->bucket_size); - c->block_bits = ilog2(sb->block_size); - c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); -- -+ c->devices_max_used = 0; - c->btree_pages = bucket_pages(c); - if (c->btree_pages > BTREE_MAX_PAGES) - c->btree_pages = max_t(int, c->btree_pages / 4, -diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h -index a9e3ffb4b03c..1d284f3d0363 100644 ---- a/drivers/md/bcache/writeback.h -+++ b/drivers/md/bcache/writeback.h -@@ -21,7 +21,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) - - mutex_lock(&bch_register_lock); - -- for (i = 0; i < c->nr_uuids; i++) { -+ for (i = 0; i < c->devices_max_used; i++) { - struct bcache_device *d = c->devices[i]; - - if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) --- -2.15.1 - diff --git a/for-next/v1/v1-0004-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch b/for-next/v1/v1-0004-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch deleted file mode 100644 index e3975b2..0000000 --- 
a/for-next/v1/v1-0004-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch +++ /dev/null @@ -1,171 +0,0 @@ -From d697858f6f515b4bacee984c82535cf2b896ace9 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Fri, 22 Dec 2017 16:37:17 +0800 -Subject: [PATCH v1 04/10] bcache: fix cached_dev->count usage for - bch_cache_set_error() - -When bcache metadata I/O fails, bcache will call bch_cache_set_error() -to retire the whole cache set. The expected behavior to retire a cache -set is to unregister the cache set, and unregister all backing device -attached to this cache set, then remove sysfs entries of the cache set -and all attached backing devices, finally release memory of structs -cache_set, cache, cached_dev and bcache_device. - -In my testing when journal I/O failure triggered by disconnected cache -device, sometimes the cache set cannot be retired, and its sysfs -entry /sys/fs/bcache/<uuid> still exits and the backing device also -references it. This is not expected behavior. - -When metadata I/O failes, the call senquence to retire whole cache set is, - bch_cache_set_error() - bch_cache_set_unregister() - bch_cache_set_stop() - __cache_set_unregister() <- called as callback by calling - clousre_queue(&c->caching) - cache_set_flush() <- called as a callback when refcount - of cache_set->caching is 0 - cache_set_free() <- called as a callback when refcount - of catch_set->cl is 0 - bch_cache_set_release() <- called as a callback when refcount - of catch_set->kobj is 0 - -I find if kernel thread bch_writeback_thread() quits while-loop when -kthread_should_stop() is true and searched_full_index is false, clousre -callback cache_set_flush() set by continue_at() will never be called. The -result is, bcache fails to retire whole cache set. - -cache_set_flush() will be called when refcount of closure c->caching is 0, -and in function bcache_device_detach() refcount of closure c->caching is -released to 0 by clousre_put(). 
In metadata error code path, function -bcache_device_detach() is called by cached_dev_detach_finish(). This is a -callback routine being called when cached_dev->count is 0. This refcount -is decreased by cached_dev_put(). - -The above dependence indicates, cache_set_flush() will be called when -refcount of cache_set->cl is 0, and refcount of cache_set->cl to be 0 -when refcount of cache_dev->count is 0. - -The reason why sometimes cache_dev->count is not 0 (when metadata I/O fails -and bch_cache_set_error() called) is, in bch_writeback_thread(), refcount -of cache_dev is not decreased properly. - -In bch_writeback_thread(), cached_dev_put() is called only when -searched_full_index is true and cached_dev->writeback_keys is empty, a.k.a -there is no dirty data on cache. In most of run time it is correct, but -when bch_writeback_thread() quits the while-loop while cache is still -dirty, current code forget to call cached_dev_put() before this kernel -thread exits. This is why sometimes cache_set_flush() is not executed and -cache set fails to be retired. - -The reason to call cached_dev_put() in bch_writeback_rate() is, when the -cache device changes from clean to dirty, cached_dev_get() is called, to -make sure during writeback operatiions both backing and cache devices -won't be released. - -Adding following code in bch_writeback_thread() does not work, - static int bch_writeback_thread(void *arg) - [code snip] - - + if (atomic_read(&dc->has_dirty)) - + cached_dev_put() - + - return 0; - [code snip] - -because writeback kernel thread can be waken up and start via sysfs entry: - echo 1 > /sys/block/bcache<N>/bcache/writeback_running -It is difficult to check whether backing device is dirty without race and -extra lock. So the above modification will introduce potential refcount -underflow in some conditions. - -The correct fix is, to take cached dev refcount when creating the kernel -thread, and put it before the kernel thread exits. 
Then bcache does not -need to take a cached dev refcount when cache turns from clean to dirty, -or to put a cached dev refcount when cache turns from ditry to clean. The -writeback kernel thread is alwasy safe to reference data structure from -cache set, cache and cached device (because a refcount of cache device is -taken for it already), and no matter the kernel thread is stopped by I/O -errors or system reboot, cached_dev->count can always be used correctly. - -The patch is simple, but understanding how it works is quite complicated. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 1 - - drivers/md/bcache/writeback.c | 10 +++++++--- - drivers/md/bcache/writeback.h | 2 -- - 3 files changed, 7 insertions(+), 6 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 064efd869017..5401d2356aa3 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1044,7 +1044,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) - if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - bch_sectors_dirty_init(&dc->disk); - atomic_set(&dc->has_dirty, 1); -- refcount_inc(&dc->count); - bch_writeback_queue(dc); - } - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index a57149803df6..0789a9e18337 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -451,7 +451,7 @@ static int bch_writeback_thread(void *arg) - up_write(&dc->writeback_lock); - - if (kthread_should_stop()) -- return 0; -+ break; - - set_current_state(TASK_INTERRUPTIBLE); - schedule(); -@@ -463,7 +463,6 @@ static int bch_writeback_thread(void *arg) - if (searched_full_index && - RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { - atomic_set(&dc->has_dirty, 0); -- cached_dev_put(dc); - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); - } -@@ -484,6 +483,8 @@ static int bch_writeback_thread(void *arg) - } - } - -+ cached_dev_put(dc); -+ - return 
0; - } - -@@ -547,10 +548,13 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) - if (!dc->writeback_write_wq) - return -ENOMEM; - -+ cached_dev_get(dc); - dc->writeback_thread = kthread_create(bch_writeback_thread, dc, - "bcache_writeback"); -- if (IS_ERR(dc->writeback_thread)) -+ if (IS_ERR(dc->writeback_thread)) { -+ cached_dev_put(dc); - return PTR_ERR(dc->writeback_thread); -+ } - - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); -diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h -index 1d284f3d0363..aab21afe49cf 100644 ---- a/drivers/md/bcache/writeback.h -+++ b/drivers/md/bcache/writeback.h -@@ -92,8 +92,6 @@ static inline void bch_writeback_add(struct cached_dev *dc) - { - if (!atomic_read(&dc->has_dirty) && - !atomic_xchg(&dc->has_dirty, 1)) { -- refcount_inc(&dc->count); -- - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); - /* XXX: should do this synchronously */ --- -2.15.1 - diff --git a/for-next/v1/v1-0005-bcache-stop-dc-writeback_rate_update-if-cache-set.patch b/for-next/v1/v1-0005-bcache-stop-dc-writeback_rate_update-if-cache-set.patch deleted file mode 100644 index d3e78e8..0000000 --- a/for-next/v1/v1-0005-bcache-stop-dc-writeback_rate_update-if-cache-set.patch +++ /dev/null @@ -1,68 +0,0 @@ -From 1a9aae02c180b47b2ae2ef9c61915b2b694d1fc2 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 23 Dec 2017 01:50:19 +0800 -Subject: [PATCH v1 05/10] bcache: stop dc->writeback_rate_update if cache set - is stopping - -struct delayed_work writeback_rate_update in struct cache_dev is a delayed -worker to call function update_writeback_rate() in period (the interval is -defined by dc->writeback_rate_update_seconds). - -When a metadate I/O error happens on cache device, bcache error handling -routine bch_cache_set_error() will call bch_cache_set_unregister() to -retire whole cache set. 
On the unregister code path, cached_dev_free() -calls cancel_delayed_work_sync(&dc->writeback_rate_update) to stop this -delayed work. - -dc->writeback_rate_update is a special delayed work from others in bcache. -In its routine update_writeback_rate(), this delayed work is re-armed -after a piece of time. That means when cancel_delayed_work_sync() returns, -this delayed work can still be executed after several seconds defined by -dc->writeback_rate_update_seconds. - -The problem is, after cancel_delayed_work_sync() returns, the cache set -unregister code path will eventually release memory of struct cache set. -Then the delayed work is scheduled to run, and inside its routine -update_writeback_rate() that already released cache set NULL pointer will -be accessed. Now a NULL pointer deference panic is triggered. - -In order to avoid the above problem, this patch checks cache set flags in -delayed work routine update_writeback_rate(). If flag CACHE_SET_STOPPING -is set, this routine will quit without re-arm the delayed work. Then the -NULL pointer deference panic won't happen after cache set is released. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/writeback.c | 9 +++++++++ - 1 file changed, 9 insertions(+) - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 0789a9e18337..745d9b2a326f 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -91,6 +91,11 @@ static void update_writeback_rate(struct work_struct *work) - struct cached_dev *dc = container_of(to_delayed_work(work), - struct cached_dev, - writeback_rate_update); -+ struct cache_set *c = dc->disk.c; -+ -+ /* quit directly if cache set is stopping */ -+ if (test_bit(CACHE_SET_STOPPING, &c->flags)) -+ return; - - down_read(&dc->writeback_lock); - -@@ -100,6 +105,10 @@ static void update_writeback_rate(struct work_struct *work) - - up_read(&dc->writeback_lock); - -+ /* do not schedule delayed work if cache set is stopping */ -+ if (test_bit(CACHE_SET_STOPPING, &c->flags)) -+ return; -+ - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - } --- -2.15.1 - diff --git a/for-next/v1/v1-0006-bcache-stop-dc-writeback_rate_update-dc-writeback.patch b/for-next/v1/v1-0006-bcache-stop-dc-writeback_rate_update-dc-writeback.patch deleted file mode 100644 index 53ce3f2..0000000 --- a/for-next/v1/v1-0006-bcache-stop-dc-writeback_rate_update-dc-writeback.patch +++ /dev/null @@ -1,122 +0,0 @@ -From 2da5b83720460c83d0f20d0771a0c955e60028e8 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 3 Jan 2018 00:03:45 +0800 -Subject: [PATCH v1 06/10] bcache: stop dc->writeback_rate_update, - dc->writeback_thread earlier - -Delayed worker dc->writeback_rate_update and kernel thread -dc->writeback_thread reference cache set data structure in their routine, -Therefor, before they are stopped, cache set should not be release. Other- -wise, NULL pointer deference will be triggered. 
- -Currenly delayed worker dc->writeback_rate_update and kernel thread -dc->writeback_thread are stopped in cached_dev_free(). When cache set is -retiring by too many I/O errors, cached_dev_free() is called when refcount -of bcache device's closure (disk.cl) reaches 0. In most of cases, last -refcount of disk.cl is dropped in last line of cached_dev_detach_finish(). -But in cached_dev_detach_finish() before calling closure_put(&dc->disk.cl), -bcache_device_detach() is called, and inside bcache_device_detach() -refcount of cache_set->caching is dropped by closure_put(&d->c->caching). - -It is very probably this is the last refcount of this closure, so routine -cache_set_flush() will be called (it is set in __cache_set_unregister()), -and its parent closure cache_set->cl may also drop its last refcount and -cache_set_free() is called too. In cache_set_free() the last refcount of -cache_set->kobj is dropped and then bch_cache_set_release() is called. Now -in bch_cache_set_release(), the memory of struct cache_set is freeed. - -bch_cache_set_release() is called before cached_dev_free(), then there is a -time window after cache set memory freed and before dc->writeback_thread -and dc->writeback_rate_update stopped, if one of them is scheduled to run, -a NULL pointer deference will be triggered. - -This patch fixes the above problem by stopping dc->writeback_thread and -dc->writeback_rate_update earlier in bcache_device_detach() before calling -closure_put(&d->c->caching). Because cancel_delayed_work_sync() and -kthread_stop() are synchronized operations, we can make sure cache set -is available when the delayed work and kthread are stopping. - -Because cached_dev_free() can also be called by writing 1 to sysfs file -/sys/block/bcache<N>/bcache/stop, this code path may not call -bcache_device_detach() if d-c is NULL. So stopping dc->writeback_thread -and dc->writeback_rate_update in cached_dev_free() is still necessary. 
In -order to avoid stop them twice, dc->rate_update_canceled is added to -indicate dc->writeback_rate_update is canceled, and dc->writeback_thread -is set to NULL to indicate it is stopped. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bcache.h | 1 + - drivers/md/bcache/super.c | 21 +++++++++++++++++++-- - drivers/md/bcache/writeback.c | 1 + - 3 files changed, 21 insertions(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 83c569942bd0..395b87942a2f 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -322,6 +322,7 @@ struct cached_dev { - - struct bch_ratelimit writeback_rate; - struct delayed_work writeback_rate_update; -+ bool rate_update_canceled; - - /* - * Internal to the writeback code, so read_dirty() can keep track of -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 5401d2356aa3..8912be4165c5 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -696,8 +696,20 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, - - static void bcache_device_detach(struct bcache_device *d) - { -+ struct cached_dev *dc; -+ - lockdep_assert_held(&bch_register_lock); - -+ dc = container_of(d, struct cached_dev, disk); -+ if (!IS_ERR_OR_NULL(dc->writeback_thread)) { -+ kthread_stop(dc->writeback_thread); -+ dc->writeback_thread = NULL; -+ } -+ if (!dc->rate_update_canceled) { -+ cancel_delayed_work_sync(&dc->writeback_rate_update); -+ dc->rate_update_canceled = true; -+ } -+ - if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { - struct uuid_entry *u = d->c->uuids + d->id; - -@@ -1071,9 +1083,14 @@ static void cached_dev_free(struct closure *cl) - { - struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); - -- cancel_delayed_work_sync(&dc->writeback_rate_update); -- if (!IS_ERR_OR_NULL(dc->writeback_thread)) -+ if (!dc->rate_update_canceled) { -+ 
cancel_delayed_work_sync(&dc->writeback_rate_update); -+ dc->rate_update_canceled = true; -+ } -+ if (!IS_ERR_OR_NULL(dc->writeback_thread)) { - kthread_stop(dc->writeback_thread); -+ dc->writeback_thread = NULL; -+ } - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 745d9b2a326f..ab2ac3d72393 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -548,6 +548,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) - dc->writeback_rate_i_term_inverse = 10000; - - INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); -+ dc->rate_update_canceled = false; - } - - int bch_cached_dev_writeback_start(struct cached_dev *dc) --- -2.15.1 - diff --git a/for-next/v1/v1-0007-bcache-set-error_limit-correctly.patch b/for-next/v1/v1-0007-bcache-set-error_limit-correctly.patch deleted file mode 100644 index 2d5b243..0000000 --- a/for-next/v1/v1-0007-bcache-set-error_limit-correctly.patch +++ /dev/null @@ -1,114 +0,0 @@ -From 93e9a82ee54b8fb1e50c4df95a74ab2953aec9ff Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 3 Jan 2018 20:37:27 +0800 -Subject: [PATCH v1 07/10] bcache: set error_limit correctly - -Struct cache uses io_errors for two purposes, -- Error decay: when cache set error_decay is set, io_errors is used to - generate a small piece of delay when I/O error happens. -- I/O errors counter: in order to generate big enough value for error - decay, I/O errors counter value is stored by left shifting 20 bits (a.k.a - IO_ERROR_SHIFT). - -In function bch_count_io_errors(), if I/O errors counter reaches cache set -error limit, bch_cache_set_error() will be called to retire the whold cache -set. 
But current code is problematic when checking the error limit, see the -following code piece from bch_count_io_errors(), - - 90 if (error) { - 91 char buf[BDEVNAME_SIZE]; - 92 unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, - 93 &ca->io_errors); - 94 errors >>= IO_ERROR_SHIFT; - 95 - 96 if (errors < ca->set->error_limit) - 97 pr_err("%s: IO error on %s, recovering", - 98 bdevname(ca->bdev, buf), m); - 99 else -100 bch_cache_set_error(ca->set, -101 "%s: too many IO errors %s", -102 bdevname(ca->bdev, buf), m); -103 } - -At line 94, errors is right shifting IO_ERROR_SHIFT bits, now it is real -errors counter to compare at line 96. But ca->set->error_limit is initia- -lized with an amplified value in bch_cache_set_alloc(), -1545 c->error_limit = 8 << IO_ERROR_SHIFT; - -It means by default, in bch_count_io_errors(), before 8<<20 errors happened -bch_cache_set_error() won't be called to retire the problematic cache -device. If the average request size is 64KB, it means bcache won't handle -failed device until 512GB data is requested. This is too large to be an I/O -threashold. So I believe the correct error limit should be much less. - -This patch sets default cache set error limit to 8, then in -bch_count_io_errors() when errors counter reaches 8 (if it is default -value), function bch_cache_set_error() will be called to retire the whole -cache set. This patch also removes bits shifting when store or show -io_error_limit value via sysfs interface. - -Nowadays most of SSDs handle internal flash failure automatically by LBA -address re-indirect mapping. If an I/O error can be observed by upper layer -code, it will be a notable error because that SSD can not re-indirect -map the problematic LBA address to an available flash block. This situation -indicates the whole SSD will be failed very soon. Therefore setting 8 as -the default io error limit value makes sense, it is enough for most of -cache devices. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bcache.h | 1 + - drivers/md/bcache/super.c | 2 +- - drivers/md/bcache/sysfs.c | 4 ++-- - 3 files changed, 4 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 395b87942a2f..a31dc3737dae 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -654,6 +654,7 @@ struct cache_set { - ON_ERROR_UNREGISTER, - ON_ERROR_PANIC, - } on_error; -+#define DEFAULT_IO_ERROR_LIMIT 8 - unsigned error_limit; - unsigned error_decay; - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 8912be4165c5..02d9d7110769 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1561,7 +1561,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) - - c->congested_read_threshold_us = 2000; - c->congested_write_threshold_us = 20000; -- c->error_limit = 8 << IO_ERROR_SHIFT; -+ c->error_limit = DEFAULT_IO_ERROR_LIMIT; - - return c; - err: -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index b4184092c727..d7ce9a05b304 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -556,7 +556,7 @@ SHOW(__bch_cache_set) - - /* See count_io_errors for why 88 */ - sysfs_print(io_error_halflife, c->error_decay * 88); -- sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); -+ sysfs_print(io_error_limit, c->error_limit); - - sysfs_hprint(congested, - ((uint64_t) bch_get_congested(c)) << 9); -@@ -656,7 +656,7 @@ STORE(__bch_cache_set) - } - - if (attr == &sysfs_io_error_limit) -- c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; -+ c->error_limit = strtoul_or_return(buf); - - /* See count_io_errors() for why 88 */ - if (attr == &sysfs_io_error_halflife) --- -2.15.1 - diff --git a/for-next/v1/v1-0008-bcache-fix-misleading-error-message-in-bch_count_.patch b/for-next/v1/v1-0008-bcache-fix-misleading-error-message-in-bch_count_.patch deleted file mode 100644 
index 18a5c32..0000000 --- a/for-next/v1/v1-0008-bcache-fix-misleading-error-message-in-bch_count_.patch +++ /dev/null @@ -1,118 +0,0 @@ -From 80d7abeee0b81a7ee0e3789bac9580f540437d0e Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 3 Jan 2018 15:59:33 +0800 -Subject: [PATCH v1 08/10] bcache: fix misleading error message in - bch_count_io_errors() - -Bcache only does recoverable I/O for read operations by calling -cached_dev_read_error(). For write operations there is no I/O recovery for -failed requests. - -But in bch_count_io_errors() no matter read or write I/Os, before errors -counter reaches io error limit, pr_err() always prints "IO error on %s, -recovering". For write requests this information is misleading, because -there is no I/O recovery at all. - -This patch adds a parameter 'is_read' to bch_count_io_errors(), and only -prints "recovering" by pr_err() when the bio direction is READ. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bcache.h | 2 +- - drivers/md/bcache/io.c | 13 +++++++++---- - drivers/md/bcache/super.c | 4 +++- - drivers/md/bcache/writeback.c | 4 +++- - 4 files changed, 16 insertions(+), 7 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index a31dc3737dae..c53f312b2216 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -855,7 +855,7 @@ static inline void wake_up_allocators(struct cache_set *c) - - /* Forward declarations */ - --void bch_count_io_errors(struct cache *, blk_status_t, const char *); -+void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); - void bch_bbio_count_io_errors(struct cache_set *, struct bio *, - blk_status_t, const char *); - void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t, -diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c -index fac97ec2d0e2..a783c5a41ff1 100644 ---- a/drivers/md/bcache/io.c -+++ b/drivers/md/bcache/io.c -@@ -51,7 +51,10 @@ void
bch_submit_bbio(struct bio *bio, struct cache_set *c, - - /* IO errors */ - --void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) -+void bch_count_io_errors(struct cache *ca, -+ blk_status_t error, -+ int is_read, -+ const char *m) - { - /* - * The halflife of an error is: -@@ -94,8 +97,9 @@ void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) - errors >>= IO_ERROR_SHIFT; - - if (errors < ca->set->error_limit) -- pr_err("%s: IO error on %s, recovering", -- bdevname(ca->bdev, buf), m); -+ pr_err("%s: IO error on %s%s", -+ bdevname(ca->bdev, buf), m, -+ is_read ? ", recovering." : "."); - else - bch_cache_set_error(ca->set, - "%s: too many IO errors %s", -@@ -108,6 +112,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, - { - struct bbio *b = container_of(bio, struct bbio, bio); - struct cache *ca = PTR_CACHE(c, &b->key, 0); -+ int is_read = (bio_data_dir(bio) == READ ? 1 : 0); - - unsigned threshold = op_is_write(bio_op(bio)) - ? 
c->congested_write_threshold_us -@@ -129,7 +134,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, - atomic_inc(&c->congested); - } - -- bch_count_io_errors(ca, error, m); -+ bch_count_io_errors(ca, error, is_read, m); - } - - void bch_bbio_endio(struct cache_set *c, struct bio *bio, -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 02d9d7110769..bbe911847eea 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -274,7 +274,9 @@ static void write_super_endio(struct bio *bio) - { - struct cache *ca = bio->bi_private; - -- bch_count_io_errors(ca, bio->bi_status, "writing superblock"); -+ /* is_read = 0 */ -+ bch_count_io_errors(ca, bio->bi_status, 0, -+ "writing superblock"); - closure_put(&ca->set->sb_write); - } - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index ab2ac3d72393..e58f9be5ae43 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -228,8 +228,10 @@ static void read_dirty_endio(struct bio *bio) - struct keybuf_key *w = bio->bi_private; - struct dirty_io *io = w->private; - -+ /* is_read = 1 */ - bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), -- bio->bi_status, "reading dirty data from cache"); -+ bio->bi_status, 1, -+ "reading dirty data from cache"); - - dirty_endio(bio); - } --- -2.15.1 - diff --git a/for-next/v1/v1-0009-bcache-add-io_disable-to-struct-cache_set.patch b/for-next/v1/v1-0009-bcache-add-io_disable-to-struct-cache_set.patch deleted file mode 100644 index 1c6159e..0000000 --- a/for-next/v1/v1-0009-bcache-add-io_disable-to-struct-cache_set.patch +++ /dev/null @@ -1,433 +0,0 @@ -From 5996e95d633ad28ebbd113004efc488162cd22b7 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 2 Jan 2018 17:31:07 +0800 -Subject: [PATCH v1 09/10] bcache: add io_disable to struct cache_set - -When too many I/Os failed on cache device, bch_cache_set_error() is called -in the error handling code path to 
retire whole problematic cache set. If -new I/O requests continue to come and take refcount dc->count, the cache -set won't be retired immediately, this is a problem. - -Furthermore, several kernel threads and self-armed kernel works -may still be running after bch_cache_set_error() is called. It needs to wait -quite a while for them to stop, or they won't stop at all. They also -prevent the cache set from being retired. - -The solution in this patch is, to add per cache set flag to disable I/O -request on this cache and all attached backing devices. Then new coming I/O -requests can be rejected in *_make_request() before taking refcount, kernel -threads and self-armed kernel worker can stop very fast when io_disable is -true. - -Because bcache also does internal I/Os for writeback, garbage collection, -bucket allocation, journaling, this kind of I/O should be disabled after -bch_cache_set_error() is called. So closure_bio_submit() is modified to -check whether cache_set->io_disable is true. If cache_set->io_disable is -true, closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and -return, generic_make_request() won't be called. - -A sysfs interface is also added for cache_set->io_disable, to read and set -io_disable value for debugging. It is helpful to trigger more corner case -issues for failed cache device.
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/alloc.c | 2 +- - drivers/md/bcache/bcache.h | 14 ++++++++++++++ - drivers/md/bcache/btree.c | 6 ++++-- - drivers/md/bcache/io.c | 2 +- - drivers/md/bcache/journal.c | 4 ++-- - drivers/md/bcache/request.c | 26 +++++++++++++++++++------- - drivers/md/bcache/super.c | 7 ++++++- - drivers/md/bcache/sysfs.c | 4 ++++ - drivers/md/bcache/util.h | 6 ------ - drivers/md/bcache/writeback.c | 34 ++++++++++++++++++++++------------ - 10 files changed, 73 insertions(+), 32 deletions(-) - -diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c -index 48c002faf08d..3be737582f27 100644 ---- a/drivers/md/bcache/alloc.c -+++ b/drivers/md/bcache/alloc.c -@@ -286,7 +286,7 @@ do { \ - break; \ - \ - mutex_unlock(&(ca)->set->bucket_lock); \ -- if (kthread_should_stop()) \ -+ if (kthread_should_stop() || ca->set->io_disable) \ - return 0; \ - \ - set_current_state(TASK_INTERRUPTIBLE); \ -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index c53f312b2216..9c7f9b1cb791 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -481,6 +481,7 @@ struct cache_set { - struct cache_accounting accounting; - - unsigned long flags; -+ bool io_disable; - - struct cache_sb sb; - -@@ -853,6 +854,19 @@ static inline void wake_up_allocators(struct cache_set *c) - wake_up_process(ca->alloc_thread); - } - -+static inline void closure_bio_submit(struct cache_set *c, -+ struct bio *bio, -+ struct closure *cl) -+{ -+ closure_get(cl); -+ if (unlikely(c->io_disable)) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return; -+ } -+ generic_make_request(bio); -+} -+ - /* Forward declarations */ - - void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index bf0d7978bc3d..75470cce1177 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ -1788,9 +1788,11 @@ static int 
bch_gc_thread(void *arg) - - while (1) { - wait_event_interruptible(c->gc_wait, -- kthread_should_stop() || gc_should_run(c)); -+ kthread_should_stop() || -+ c->io_disable || -+ gc_should_run(c)); - -- if (kthread_should_stop()) -+ if (kthread_should_stop() || c->io_disable) - break; - - set_gc_sectors(c); -diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c -index a783c5a41ff1..8013ecbcdbda 100644 ---- a/drivers/md/bcache/io.c -+++ b/drivers/md/bcache/io.c -@@ -38,7 +38,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) - bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); - - b->submit_time_us = local_clock_us(); -- closure_bio_submit(bio, bio->bi_private); -+ closure_bio_submit(c, bio, bio->bi_private); - } - - void bch_submit_bbio(struct bio *bio, struct cache_set *c, -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index a87165c1d8e5..979873641030 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -62,7 +62,7 @@ reread: left = ca->sb.bucket_size - offset; - bio_set_op_attrs(bio, REQ_OP_READ, 0); - bch_bio_map(bio, data); - -- closure_bio_submit(bio, &cl); -+ closure_bio_submit(ca->set, bio, &cl); - closure_sync(&cl); - - /* This function could be simpler now since we no longer write -@@ -653,7 +653,7 @@ static void journal_write_unlocked(struct closure *cl) - spin_unlock(&c->journal.lock); - - while ((bio = bio_list_pop(&list))) -- closure_bio_submit(bio, cl); -+ closure_bio_submit(c, bio, cl); - - continue_at(cl, journal_write_done, NULL); - } -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 643c3021624f..a85d6a605a8e 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -725,7 +725,7 @@ static void cached_dev_read_error(struct closure *cl) - - /* XXX: invalidate cache */ - -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } - - continue_at(cl, cached_dev_cache_miss_done, NULL); -@@ -850,7 +850,7 @@ 
static int cached_dev_cache_miss(struct btree *b, struct search *s, - s->cache_miss = miss; - s->iop.bio = cache_bio; - bio_get(cache_bio); -- closure_bio_submit(cache_bio, &s->cl); -+ closure_bio_submit(s->iop.c, cache_bio, &s->cl); - - return ret; - out_put: -@@ -858,7 +858,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - out_submit: - miss->bi_end_io = request_endio; - miss->bi_private = &s->cl; -- closure_bio_submit(miss, &s->cl); -+ closure_bio_submit(s->iop.c, miss, &s->cl); - return ret; - } - -@@ -923,7 +923,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - - if ((bio_op(bio) != REQ_OP_DISCARD) || - blk_queue_discard(bdev_get_queue(dc->bdev))) -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } else if (s->iop.writeback) { - bch_writeback_add(dc); - s->iop.bio = bio; -@@ -938,12 +938,12 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - flush->bi_private = cl; - flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - -- closure_bio_submit(flush, cl); -+ closure_bio_submit(s->iop.c, flush, cl); - } - } else { - s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); - -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } - - closure_call(&s->iop.cl, bch_data_insert, NULL, cl); -@@ -959,7 +959,7 @@ static void cached_dev_nodata(struct closure *cl) - bch_journal_meta(s->iop.c, cl); - - /* If it's a flush, we send the flush to the backing device too */ -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - - continue_at(cl, cached_dev_bio_complete, NULL); - } -@@ -974,6 +974,12 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - int rw = bio_data_dir(bio); - -+ if (unlikely(d->c && d->c->io_disable)) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return BLK_QC_T_NONE; -+ } -+ - generic_start_io_acct(q, rw, 
bio_sectors(bio), &d->disk->part0); - - bio_set_dev(bio, dc->bdev); -@@ -1089,6 +1095,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, - struct bcache_device *d = bio->bi_disk->private_data; - int rw = bio_data_dir(bio); - -+ if (unlikely(d->c->io_disable)) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return BLK_QC_T_NONE; -+ } -+ - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - - s = search_alloc(bio, d); -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index bbe911847eea..7aa76c3e3556 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -521,7 +521,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, - bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags); - bch_bio_map(bio, ca->disk_buckets); - -- closure_bio_submit(bio, &ca->prio); -+ closure_bio_submit(ca->set, bio, &ca->prio); - closure_sync(cl); - } - -@@ -1333,6 +1333,10 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) 
- acquire_console_sem(); - */ - -+ c->io_disable = true; -+ /* make others know io_disable is true earlier */ -+ smp_mb(); -+ - printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid); - - va_start(args, fmt); -@@ -1564,6 +1568,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) - c->congested_read_threshold_us = 2000; - c->congested_write_threshold_us = 20000; - c->error_limit = DEFAULT_IO_ERROR_LIMIT; -+ c->io_disable = false; - - return c; - err: -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index d7ce9a05b304..acce7c82e111 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -92,6 +92,7 @@ read_attribute(partial_stripes_expensive); - - rw_attribute(synchronous); - rw_attribute(journal_delay_ms); -+rw_attribute(io_disable); - rw_attribute(discard); - rw_attribute(running); - rw_attribute(label); -@@ -573,6 +574,7 @@ SHOW(__bch_cache_set) - sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); - sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); - sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); -+ sysfs_printf(io_disable, "%i", c->io_disable); - - if (attr == &sysfs_bset_tree_stats) - return bch_bset_print_stats(c, buf); -@@ -663,6 +665,7 @@ STORE(__bch_cache_set) - c->error_decay = strtoul_or_return(buf) / 88; - - sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); -+ sysfs_strtoul_clamp(io_disable, c->io_disable, 0, 1); - sysfs_strtoul(verify, c->verify); - sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); - sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks); -@@ -744,6 +747,7 @@ static struct attribute *bch_cache_set_internal_files[] = { - &sysfs_gc_always_rewrite, - &sysfs_btree_shrinker_disabled, - &sysfs_copy_gc_enabled, -+ &sysfs_io_disable, - NULL - }; - KTYPE(bch_cache_set_internal); -diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h -index ed5e8a412eb8..03e533631798 100644 ---- a/drivers/md/bcache/util.h -+++ 
b/drivers/md/bcache/util.h -@@ -564,12 +564,6 @@ static inline sector_t bdev_sectors(struct block_device *bdev) - return bdev->bd_inode->i_size >> 9; - } - --#define closure_bio_submit(bio, cl) \ --do { \ -- closure_get(cl); \ -- generic_make_request(bio); \ --} while (0) -- - uint64_t bch_crc64_update(uint64_t, const void *, size_t); - uint64_t bch_crc64(const void *, size_t); - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index e58f9be5ae43..54add41d2569 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -93,8 +93,11 @@ static void update_writeback_rate(struct work_struct *work) - writeback_rate_update); - struct cache_set *c = dc->disk.c; - -- /* quit directly if cache set is stopping */ -- if (test_bit(CACHE_SET_STOPPING, &c->flags)) -+ /* -+ * quit directly if cache set is stopping. c->io_disable -+ * can be set via sysfs, check it here too. -+ */ -+ if (test_bit(CACHE_SET_STOPPING, &c->flags) || c->io_disable) - return; - - down_read(&dc->writeback_lock); -@@ -105,8 +108,11 @@ static void update_writeback_rate(struct work_struct *work) - - up_read(&dc->writeback_lock); - -- /* do not schedule delayed work if cache set is stopping */ -- if (test_bit(CACHE_SET_STOPPING, &c->flags)) -+ /* -+ * do not schedule delayed work if cache set is stopping, -+ * c->io_disable can be set via sysfs, check it here too. 
-+ */ -+ if (test_bit(CACHE_SET_STOPPING, &c->flags) || c->io_disable) - return; - - schedule_delayed_work(&dc->writeback_rate_update, -@@ -217,7 +223,7 @@ static void write_dirty(struct closure *cl) - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; - -- closure_bio_submit(&io->bio, cl); -+ closure_bio_submit(io->dc->disk.c, &io->bio, cl); - } - - continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); -@@ -240,7 +246,7 @@ static void read_dirty_submit(struct closure *cl) - { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); - -- closure_bio_submit(&io->bio, cl); -+ closure_bio_submit(io->dc->disk.c, &io->bio, cl); - - continue_at(cl, write_dirty, io->dc->writeback_write_wq); - } -@@ -259,7 +265,7 @@ static void read_dirty(struct cached_dev *dc) - * mempools. - */ - -- while (!kthread_should_stop()) { -+ while (!(kthread_should_stop() || dc->disk.c->io_disable)) { - - w = bch_keybuf_next(&dc->writeback_keys); - if (!w) -@@ -269,7 +275,9 @@ static void read_dirty(struct cached_dev *dc) - - if (KEY_START(&w->key) != dc->last_read || - jiffies_to_msecs(delay) > 50) -- while (!kthread_should_stop() && delay) -+ while (!kthread_should_stop() && -+ !dc->disk.c->io_disable && -+ delay) - delay = schedule_timeout_interruptible(delay); - - dc->last_read = KEY_OFFSET(&w->key); -@@ -450,18 +458,19 @@ static bool refill_dirty(struct cached_dev *dc) - static int bch_writeback_thread(void *arg) - { - struct cached_dev *dc = arg; -+ struct cache_set *c = dc->disk.c; - bool searched_full_index; - - bch_ratelimit_reset(&dc->writeback_rate); - -- while (!kthread_should_stop()) { -+ while (!(kthread_should_stop() || c->io_disable)) { - down_write(&dc->writeback_lock); - if (!atomic_read(&dc->has_dirty) || - (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && - !dc->writeback_running)) { - up_write(&dc->writeback_lock); - -- if (kthread_should_stop()) -+ if (kthread_should_stop() || c->io_disable) - break; - - 
set_current_state(TASK_INTERRUPTIBLE); -@@ -485,8 +494,8 @@ static int bch_writeback_thread(void *arg) - if (searched_full_index) { - unsigned delay = dc->writeback_delay * HZ; - -- while (delay && -- !kthread_should_stop() && -+ while (delay && !kthread_should_stop() && -+ !c->io_disable && - !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) - delay = schedule_timeout_interruptible(delay); - -@@ -494,6 +503,7 @@ static int bch_writeback_thread(void *arg) - } - } - -+ dc->writeback_thread = NULL; - cached_dev_put(dc); - - return 0; --- -2.15.1 - diff --git a/for-next/v1/v1-0010-bcache-stop-all-attached-bcache-devices-for-a-ret.patch b/for-next/v1/v1-0010-bcache-stop-all-attached-bcache-devices-for-a-ret.patch deleted file mode 100644 index 062caae..0000000 --- a/for-next/v1/v1-0010-bcache-stop-all-attached-bcache-devices-for-a-ret.patch +++ /dev/null @@ -1,60 +0,0 @@ -From 595d5d28a7ed23cae061b9e0dd201611afd6db6d Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 3 Jan 2018 18:24:55 +0800 -Subject: [PATCH v1 10/10] bcache: stop all attached bcache devices for a - retired cache set - -When there are too many I/O errors on cache device, current bcache code -will retire the whole cache set, and detach all bcache devices. But the -detached bcache devices are not stopped, which is problematic when bcache -is in writeback mode. - -If the retired cache set has dirty data of backing devices, continue -writing to bcache device will write to backing device directly. If the -LBA of write request has a dirty version cached on cache device, next time -when the cache device is re-registered and backing device re-attached to -it again, the stale dirty data on cache device will be written to backing -device, and overwrite latest directly written data. This situation causes -a silent data corruption. - -This patch checks whether cache_set->io_disable is true in -__cache_set_unregister().
If cache_set->io_disable is true, it means cache -set is unregistering by too many I/O errors, then all attached bcache -devices will be stopped as well. If cache_set->io_disable is not true, it -means __cache_set_unregister() is triggered by writing 1 to sysfs file -/sys/fs/bcache/<UUID>/bcache/stop. This is an exception because users do -it explicitly, this patch keeps existing behavior and does not stop any -bcache device. - -Even the failed cache device has no dirty data, stopping bcache device is -still a desired behavior by many Ceph and data base users. Then their -application will report I/O errors due to disappeared bcache device, and -operation people will know the cache device is broken or disconnected. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 8 ++++++++ - 1 file changed, 8 insertions(+) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 49d6fedf89c3..20a7a6959506 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1458,6 +1458,14 @@ static void __cache_set_unregister(struct closure *cl) - dc = container_of(c->devices[i], - struct cached_dev, disk); - bch_cached_dev_detach(dc); -+ /* -+ * If we come here by too many I/O errors, -+ * bcache device should be stopped too, to -+ * keep data consistency on cache and -+ * backing devices. 
-+ */ -+ if (c->io_disable) -+ bcache_device_stop(c->devices[i]); - } else { - bcache_device_stop(c->devices[i]); - } --- -2.15.1 - diff --git a/for-next/v2/v2-0000-cover-letter.patch b/for-next/v2/v2-0000-cover-letter.patch deleted file mode 100644 index 48a8af3..0000000 --- a/for-next/v2/v2-0000-cover-letter.patch +++ /dev/null @@ -1,92 +0,0 @@ -From b586ad82f67e12cb4d2a55681264b5cdf6353c59 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 13 Jan 2018 23:20:33 +0800 -Subject: [PATCH v2 00/12] bcache: device failure handling improvement - -Hi maintainers and folks, - -This patch set tries to improve bcache device failure handling, including -cache device and backing device failures. - -The basic idea to handle failed cache device is, -- Unregister cache set -- Detach all backing devices attached to this cache set -- Stop all bcache devices linked to this cache set -The above process is named 'cache set retire' by me. The result of cache -set retire is, cache set and bcache devices are all removed, following -I/O requests will get failed immediately to notify upper layer or user -space code that the cache device is failed or disconnected. - -For failed backing device, there are two ways to handle them, -- If device is disconnected, when kernel thread dc->status_update_thread - finds it is offline for BACKING_DEV_OFFLINE_TIMEOUT (5) seconds, the - kernel thread will set dc->io_disable and call bcache_device_stop() to - stop and remove the bcache device from system. -- If device is connected but too many I/O errors happen, after errors - number exceeds dc->error_limit, call bch_cached_dev_error() to set - dc->io_disable and stop bcache device. Then the broken backing device - and its bcache device will be removed from system. - -The v2 patch set fixes the problems addressed in v1 patch reviews, adds -failure handling for backing device. This patch set also includes a patch -from Junhui Tang.
And the v2 patch set does not include 2 patches which are -in bcache-for-next already. - -A basic testing covered with writethrough, writeback, writearound mode, and -read/write/readwrite workloads, cache set or bcache device can be removed -by too many I/O errors or delete the device. For plugging out physical -disks, a kernel bug triggers rcu oops in __do_softirq() and locks up all -following accesses to the disconnected disk, this blocks my testing. - -While posting v2 patch set, I also continue to test the code from my side. -Any comment, question and review are warmly welcome. - -Open issues: -1, Detach backing device by writing sysfs detach file does not work, it is - because writeback thread does not drop dc->count refcount when cache - device turns from dirty into clean. This issue will be fixed in v3 - patch set. -2, A kernel bug in __do_softirq() when plugging out hard disk with heavy - I/O blocks my physical disk disconnection test. If any one knows this - bug, please give me a hint. - -Changelog: -v2: fixes all problems found in v1 review. - add patches to handle backing device failure. - add one more patch to set writeback_rate_update_seconds range. - include a patch from Junhui Tang. -v1: the initial version, only handles cache device failure. 
- -Coly Li (11): - bcache: set writeback_rate_update_seconds in range [1, 60] seconds - bcache: properly set task state in bch_writeback_thread() - bcache: set task properly in allocator_wait() - bcache: fix cached_dev->count usage for bch_cache_set_error() - bcache: stop dc->writeback_rate_update properly - bcache: set error_limit correctly - bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags - bcache: stop all attached bcache devices for a retired cache set - bcache: add backing_request_endio() for bi_end_io of attached backing - device I/O - bcache: add io_disable to struct cached_dev - bcache: stop bcache device when backing device is offline - -Tang Junhui (1): - bcache: fix inaccurate io state for detached bcache devices - - drivers/md/bcache/alloc.c | 5 +- - drivers/md/bcache/bcache.h | 37 ++++++++- - drivers/md/bcache/btree.c | 10 ++- - drivers/md/bcache/io.c | 16 +++- - drivers/md/bcache/journal.c | 4 +- - drivers/md/bcache/request.c | 188 +++++++++++++++++++++++++++++++++++------- - drivers/md/bcache/super.c | 134 ++++++++++++++++++++++++++++-- - drivers/md/bcache/sysfs.c | 45 +++++++++- - drivers/md/bcache/util.h | 6 -- - drivers/md/bcache/writeback.c | 79 +++++++++++++++--- - drivers/md/bcache/writeback.h | 5 +- - 11 files changed, 458 insertions(+), 71 deletions(-) - --- -2.15.1 - diff --git a/for-next/v2/v2-0001-bcache-set-writeback_rate_update_seconds-in-range.patch b/for-next/v2/v2-0001-bcache-set-writeback_rate_update_seconds-in-range.patch deleted file mode 100644 index 3bdcb3c..0000000 --- a/for-next/v2/v2-0001-bcache-set-writeback_rate_update_seconds-in-range.patch +++ /dev/null @@ -1,72 +0,0 @@ -From 6641000fb839379fd006ec2e101ea788b65d01b6 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 13 Jan 2018 15:11:03 +0800 -Subject: [PATCH v2 01/12] bcache: set writeback_rate_update_seconds in range - [1, 60] seconds - -dc->writeback_rate_update_seconds can be set via sysfs and its value can -be set to [1, ULONG_MAX]. 
It does not make sense to set such a large -value, 60 seconds is long enough value considering the default 5 seconds -works well for long time. - -Because dc->writeback_rate_update is a special delayed work, it re-arms -itself inside the delayed work routine update_writeback_rate(). When -stopping it by cancel_delayed_work_sync(), there should be a timeout to -wait and make sure the re-armed delayed work is stopped too. A small max -value of dc->writeback_rate_update_seconds is also helpful to decide a -reasonable small timeout. - -This patch limits sysfs interface to set dc->writeback_rate_update_seconds -in range of [1, 60] seconds, and replaces the hand-coded number by macros. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/sysfs.c | 3 +++ - drivers/md/bcache/writeback.c | 2 +- - drivers/md/bcache/writeback.h | 3 +++ - 3 files changed, 7 insertions(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index b4184092c727..a74a752c9e0f 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -215,6 +215,9 @@ STORE(__cached_dev) - sysfs_strtoul_clamp(writeback_rate, - dc->writeback_rate.rate, 1, INT_MAX); - -+ sysfs_strtoul_clamp(writeback_rate_update_seconds, -+ dc->writeback_rate_update_seconds, -+ 1, WRITEBACK_RATE_UPDATE_SECS_MAX); - d_strtoul_nonzero(writeback_rate_update_seconds); - d_strtoul(writeback_rate_i_term_inverse); - d_strtoul_nonzero(writeback_rate_p_term_inverse); -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 51306a19ab03..0ade883b6316 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -652,7 +652,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) - dc->writeback_rate.rate = 1024; - dc->writeback_rate_minimum = 8; - -- dc->writeback_rate_update_seconds = 5; -+ dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; - dc->writeback_rate_p_term_inverse = 40; - 
dc->writeback_rate_i_term_inverse = 10000; - -diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h -index 66f1c527fa24..587b25599856 100644 ---- a/drivers/md/bcache/writeback.h -+++ b/drivers/md/bcache/writeback.h -@@ -8,6 +8,9 @@ - #define MAX_WRITEBACKS_IN_PASS 5 - #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ - -+#define WRITEBACK_RATE_UPDATE_SECS_MAX 60 -+#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 -+ - /* - * 14 (16384ths) is chosen here as something that each backing device - * should be a reasonable fraction of the share, and not to blow up --- -2.15.1 - diff --git a/for-next/v2/v2-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch b/for-next/v2/v2-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch deleted file mode 100644 index 9cd3ab4..0000000 --- a/for-next/v2/v2-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch +++ /dev/null @@ -1,91 +0,0 @@ -From 5dffbbb4a18a8bed0985ead53afa8d14898d1279 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 8 Jan 2018 22:11:01 +0800 -Subject: [PATCH v2 02/12] bcache: properly set task state in - bch_writeback_thread() - -Kernel thread routine bch_writeback_thread() has the following code block, - -447 down_write(&dc->writeback_lock); -448~450 if (check conditions) { -451 up_write(&dc->writeback_lock); -452 set_current_state(TASK_INTERRUPTIBLE); -453 -454 if (kthread_should_stop()) -455 return 0; -456 -457 schedule(); -458 continue; -459 } - -If condition check is true, its task state is set to TASK_INTERRUPTIBLE -and call schedule() to wait for others to wake up it. - -There are 2 issues in current code, -1, Task state is set to TASK_INTERRUPTIBLE after the condition checks, if - another process changes the condition and call wake_up_process(dc-> - writeback_thread), then at line 452 task state is set back to - TASK_INTERRUPTIBLE, the writeback kernel thread will lose a chance to be - waken up. 
-2, At line 454 if kthread_should_stop() is true, writeback kernel thread - will return to kernel/kthread.c:kthread() with TASK_INTERRUPTIBLE and - call do_exit(). It is not good to enter do_exit() with task state - TASK_INTERRUPTIBLE, in following code path might_sleep() is called and a - warning message is reported by __might_sleep(): "WARNING: do not call - blocking ops when !TASK_RUNNING; state=1 set at [xxxx]". - -For the first issue, task state should be set before condition checks. -Indeed, because dc->writeback_lock is required when modifying all the -conditions, calling set_current_state() inside code block where dc-> -writeback_lock is held is safe. But this is quite implicit, so I still move -set_current_state() before all the condition checks. - -For the second issue, frankly speaking, it does not hurt when kernel thread -exits with TASK_INTERRUPTIBLE state, but this warning message scares users, -makes them feel there might be something risky with bcache and hurt their -data. Setting task state to TASK_RUNNING before returning fixes this -problem. - -Changelog: -v2: fix the race issue in v1 patch. -v1: initial buggy fix. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.de> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/writeback.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 0ade883b6316..f1d2fc15abcc 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -564,18 +564,21 @@ static int bch_writeback_thread(void *arg) - - while (!kthread_should_stop()) { - down_write(&dc->writeback_lock); -+ set_current_state(TASK_INTERRUPTIBLE); - if (!atomic_read(&dc->has_dirty) || - (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && - !dc->writeback_running)) { - up_write(&dc->writeback_lock); -- set_current_state(TASK_INTERRUPTIBLE); - -- if (kthread_should_stop()) -+ if (kthread_should_stop()) { -+ set_current_state(TASK_RUNNING); - return 0; -+ } - - schedule(); - continue; - } -+ set_current_state(TASK_RUNNING); - - searched_full_index = refill_dirty(dc); - --- -2.15.1 - diff --git a/for-next/v2/v2-0003-bcache-set-task-properly-in-allocator_wait.patch b/for-next/v2/v2-0003-bcache-set-task-properly-in-allocator_wait.patch deleted file mode 100644 index 81a6e7d..0000000 --- a/for-next/v2/v2-0003-bcache-set-task-properly-in-allocator_wait.patch +++ /dev/null @@ -1,65 +0,0 @@ -From 374b24628212f175ceaf09901c2fd419d55f6962 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 8 Jan 2018 22:45:51 +0800 -Subject: [PATCH v2 03/12] bcache: set task properly in allocator_wait() - -Kernel thread routine bch_allocator_thread() references macro -allocator_wait() to wait for a condition or quit to do_exit() -when kthread_should_stop() is true. 
Here is the code block, - -284 while (1) { \ -285 set_current_state(TASK_INTERRUPTIBLE); \ -286 if (cond) \ -287 break; \ -288 \ -289 mutex_unlock(&(ca)->set->bucket_lock); \ -290 if (kthread_should_stop()) \ -291 return 0; \ -292 \ -293 schedule(); \ -294 mutex_lock(&(ca)->set->bucket_lock); \ -295 } \ -296 __set_current_state(TASK_RUNNING); \ - -At line 285, task state is set to TASK_INTERRUPTIBLE, if at line 290 -kthread_should_stop() is true, the kernel thread will terminate and return -to kernel/kthread.c:kthread(), then calls do_exit() with TASK_INTERRUPTIBLE -state. This is not a suggested behavior and a warning message will be -reported by might_sleep() in do_exit() code path: "WARNING: do not call -blocking ops when !TASK_RUNNING; state=1 set at [xxxx]". - -This patch fixes this problem by setting task state to TASK_RUNNING if -kthread_should_stop() is true and before kernel thread returns back to -kernel/kthread.c:kthread(). - -Changelog: -v2: fix the race issue in v1 patch. -v1: initial buggy fix. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.de> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/alloc.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c -index 6cc6c0f9c3a9..458e1d38577d 100644 ---- a/drivers/md/bcache/alloc.c -+++ b/drivers/md/bcache/alloc.c -@@ -287,8 +287,10 @@ do { \ - break; \ - \ - mutex_unlock(&(ca)->set->bucket_lock); \ -- if (kthread_should_stop()) \ -+ if (kthread_should_stop()) { \ -+ set_current_state(TASK_RUNNING); \ - return 0; \ -+ } \ - \ - schedule(); \ - mutex_lock(&(ca)->set->bucket_lock); \ --- -2.15.1 - diff --git a/for-next/v2/v2-0004-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch b/for-next/v2/v2-0004-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch deleted file mode 100644 index a452016..0000000 --- a/for-next/v2/v2-0004-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch +++ /dev/null @@ -1,178 +0,0 @@ -From 00455397a8de16cec8e56292f267f2850a939b15 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 8 Jan 2018 23:05:58 +0800 -Subject: [PATCH v2 04/12] bcache: fix cached_dev->count usage for - bch_cache_set_error() - -When bcache metadata I/O fails, bcache will call bch_cache_set_error() -to retire the whole cache set. The expected behavior to retire a cache -set is to unregister the cache set, and unregister all backing device -attached to this cache set, then remove sysfs entries of the cache set -and all attached backing devices, finally release memory of structs -cache_set, cache, cached_dev and bcache_device. - -In my testing when journal I/O failure triggered by disconnected cache -device, sometimes the cache set cannot be retired, and its sysfs -entry /sys/fs/bcache/<uuid> still exits and the backing device also -references it. This is not expected behavior. 
- -When metadata I/O fails, the call sequence to retire whole cache set is, - bch_cache_set_error() - bch_cache_set_unregister() - bch_cache_set_stop() - __cache_set_unregister() <- called as callback by calling - closure_queue(&c->caching) - cache_set_flush() <- called as a callback when refcount - of cache_set->caching is 0 - cache_set_free() <- called as a callback when refcount - of cache_set->cl is 0 - bch_cache_set_release() <- called as a callback when refcount - of cache_set->kobj is 0 - -I find if kernel thread bch_writeback_thread() quits while-loop when -kthread_should_stop() is true and searched_full_index is false, closure -callback cache_set_flush() set by continue_at() will never be called. The -result is, bcache fails to retire whole cache set. - -cache_set_flush() will be called when refcount of closure c->caching is 0, -and in function bcache_device_detach() refcount of closure c->caching is -released to 0 by closure_put(). In metadata error code path, function -bcache_device_detach() is called by cached_dev_detach_finish(). This is a -callback routine being called when cached_dev->count is 0. This refcount -is decreased by cached_dev_put(). - -The above dependence indicates, cache_set_flush() will be called when -refcount of cache_set->cl is 0, and refcount of cache_set->cl to be 0 -when refcount of cached_dev->count is 0. - -The reason why sometimes cached_dev->count is not 0 (when metadata I/O fails -and bch_cache_set_error() called) is, in bch_writeback_thread(), refcount -of cached_dev is not decreased properly. - -In bch_writeback_thread(), cached_dev_put() is called only when -searched_full_index is true and cached_dev->writeback_keys is empty, a.k.a -there is no dirty data on cache. In most of run time it is correct, but -when bch_writeback_thread() quits the while-loop while cache is still -dirty, current code forgets to call cached_dev_put() before this kernel -thread exits. 
This is why sometimes cache_set_flush() is not executed and -cache set fails to be retired. - -The reason to call cached_dev_put() in bch_writeback_thread() is, when the -cache device changes from clean to dirty, cached_dev_get() is called, to -make sure during writeback operations both backing and cache devices -won't be released. - -Adding following code in bch_writeback_thread() does not work, - static int bch_writeback_thread(void *arg) - } - -+ if (atomic_read(&dc->has_dirty)) -+ cached_dev_put() -+ - return 0; - } -because writeback kernel thread can be woken up and start via sysfs entry: - echo 1 > /sys/block/bcache<N>/bcache/writeback_running -It is difficult to check whether backing device is dirty without race and -extra lock. So the above modification will introduce potential refcount -underflow in some conditions. - -The correct fix is, to take cached dev refcount when creating the kernel -thread, and put it before the kernel thread exits. Then bcache does not -need to take a cached dev refcount when cache turns from clean to dirty, -or to put a cached dev refcount when cache turns from dirty to clean. The -writeback kernel thread is always safe to reference data structure from -cache set, cache and cached device (because a refcount of cache device is -taken for it already), and no matter the kernel thread is stopped by I/O -errors or system reboot, cached_dev->count can always be used correctly. - -The patch is simple, but understanding how it works is quite complicated. - -Changelog: -v2: set dc->writeback_thread to NULL in this patch, as suggested by Hannes. -v1: initial version for review. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/super.c | 1 - - drivers/md/bcache/writeback.c | 11 ++++++++--- - drivers/md/bcache/writeback.h | 2 -- - 3 files changed, 8 insertions(+), 6 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 133b81225ea9..d14e09cce2f6 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1052,7 +1052,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) - if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - bch_sectors_dirty_init(&dc->disk); - atomic_set(&dc->has_dirty, 1); -- refcount_inc(&dc->count); - bch_writeback_queue(dc); - } - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index f1d2fc15abcc..b280c134dd4d 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -572,7 +572,7 @@ static int bch_writeback_thread(void *arg) - - if (kthread_should_stop()) { - set_current_state(TASK_RUNNING); -- return 0; -+ break; - } - - schedule(); -@@ -585,7 +585,6 @@ static int bch_writeback_thread(void *arg) - if (searched_full_index && - RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { - atomic_set(&dc->has_dirty, 0); -- cached_dev_put(dc); - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); - } -@@ -606,6 +605,9 @@ static int bch_writeback_thread(void *arg) - } - } - -+ dc->writeback_thread = NULL; -+ cached_dev_put(dc); -+ - return 0; - } - -@@ -669,10 +671,13 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) - if (!dc->writeback_write_wq) - return -ENOMEM; - -+ cached_dev_get(dc); - dc->writeback_thread = kthread_create(bch_writeback_thread, dc, - "bcache_writeback"); -- if (IS_ERR(dc->writeback_thread)) -+ if (IS_ERR(dc->writeback_thread)) { -+ cached_dev_put(dc); - return PTR_ERR(dc->writeback_thread); -+ } - - 
schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); -diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h -index 587b25599856..0bba8f1c6cdf 100644 ---- a/drivers/md/bcache/writeback.h -+++ b/drivers/md/bcache/writeback.h -@@ -105,8 +105,6 @@ static inline void bch_writeback_add(struct cached_dev *dc) - { - if (!atomic_read(&dc->has_dirty) && - !atomic_xchg(&dc->has_dirty, 1)) { -- refcount_inc(&dc->count); -- - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); - /* XXX: should do this synchronously */ --- -2.15.1 - diff --git a/for-next/v2/v2-0005-bcache-stop-dc-writeback_rate_update-properly.patch b/for-next/v2/v2-0005-bcache-stop-dc-writeback_rate_update-properly.patch deleted file mode 100644 index 6448f1a..0000000 --- a/for-next/v2/v2-0005-bcache-stop-dc-writeback_rate_update-properly.patch +++ /dev/null @@ -1,266 +0,0 @@ -From 36b752f82142be3641fbb60e6b8a79b53ad5419e Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 13 Jan 2018 15:48:39 +0800 -Subject: [PATCH v2 05/12] bcache: stop dc->writeback_rate_update properly - -struct delayed_work writeback_rate_update in struct cache_dev is a delayed -worker to call function update_writeback_rate() in period (the interval is -defined by dc->writeback_rate_update_seconds). - -When a metadate I/O error happens on cache device, bcache error handling -routine bch_cache_set_error() will call bch_cache_set_unregister() to -retire whole cache set. On the unregister code path, this delayed work is -stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update). - -dc->writeback_rate_update is a special delayed work from others in bcache. -In its routine update_writeback_rate(), this delayed work is re-armed -itself. That means when cancel_delayed_work_sync() returns, this delayed -work can still be executed after several seconds defined by -dc->writeback_rate_update_seconds. 
- -The problem is, after cancel_delayed_work_sync() returns, the cache set -unregister code path will continue and release memory of struct cache set. -Then the delayed work is scheduled to run, __update_writeback_rate() -will reference the already released cache_set memory, and trigger a NULL -pointer dereference fault. - -This patch introduces two more bcache device flags, -- BCACHE_DEV_WB_RUNNING - bit set: bcache device is in writeback mode and running, it is OK for - dc->writeback_rate_update to re-arm itself. - bit clear: bcache device is trying to stop dc->writeback_rate_update, - this delayed work should not re-arm itself and quit. -- BCACHE_DEV_RATE_DW_RUNNING - bit set: routine update_writeback_rate() is executing. - bit clear: routine update_writeback_rate() quits. - -This patch also adds a function cancel_writeback_rate_update_dwork() to -wait for dc->writeback_rate_update to quit before cancelling it by calling -cancel_delayed_work_sync(). In order to avoid a deadlock by unexpected -quit dc->writeback_rate_update, after time_out seconds this function will -give up and continue to call cancel_delayed_work_sync(). - -And here I explain how this patch stops self re-armed delayed work properly -with the above stuffs. - -update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning -and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling -cancel_writeback_rate_update_dwork() clear flag BCACHE_DEV_WB_RUNNING. - -Before calling cancel_delayed_work_sync() wait until flag -BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling -cancel_delayed_work_sync(), dc->writeback_rate_update must be already re- -armed, or quit by seeing BCACHE_DEV_WB_RUNNING cleared. In both cases -delayed work routine update_writeback_rate() won't be executed after -cancel_delayed_work_sync() returns. - -Inside update_writeback_rate() before calling schedule_delayed_work(), flag -BCACHE_DEV_WB_RUNNING is checked before. 
If this flag is cleared, it means -someone is about to stop the delayed work. Because flag -BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync() -has to wait for this flag to be cleared, we don't need to worry about race -condition here. - -If update_writeback_rate() is scheduled to run after checking -BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync() -in cancel_writeback_rate_update_dwork(), it is also safe. Because at this -moment BCACHE_DEV_WB_RUNNING is cleared with memory barrier. As I mentioned -previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear -and quit immediately. - -Because there are more dependences inside update_writeback_rate() to struct -cache_set memory, dc->writeback_rate_update is not a simple self re-arm -delayed work. After trying many different methods (e.g. hold dc->count, or -use locks), this is the only way I can find which works to properly stop -dc->writeback_rate_update delayed work. - -Changelog: -v2: Try to fix the race issue which is pointed out by Junhui. 
-v1: The initial version for review - -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 9 +++++---- - drivers/md/bcache/super.c | 39 +++++++++++++++++++++++++++++++++++---- - drivers/md/bcache/sysfs.c | 3 ++- - drivers/md/bcache/writeback.c | 29 ++++++++++++++++++++++++++++- - 4 files changed, 70 insertions(+), 10 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 5e2d4e80198e..88d938c8d027 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -258,10 +258,11 @@ struct bcache_device { - struct gendisk *disk; - - unsigned long flags; --#define BCACHE_DEV_CLOSING 0 --#define BCACHE_DEV_DETACHING 1 --#define BCACHE_DEV_UNLINK_DONE 2 -- -+#define BCACHE_DEV_CLOSING 0 -+#define BCACHE_DEV_DETACHING 1 -+#define BCACHE_DEV_UNLINK_DONE 2 -+#define BCACHE_DEV_WB_RUNNING 4 -+#define BCACHE_DEV_RATE_DW_RUNNING 8 - unsigned nr_stripes; - unsigned stripe_size; - atomic_t *stripe_sectors_dirty; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index d14e09cce2f6..6d888e8fea8c 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -899,6 +899,32 @@ void bch_cached_dev_run(struct cached_dev *dc) - pr_debug("error creating sysfs link"); - } - -+/* -+ * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed -+ * work dc->writeback_rate_update is running. Wait until the routine -+ * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to -+ * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out -+ * seconds, give up waiting here and continue to cancel it too. 
-+ */ -+static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) -+{ -+ int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ; -+ -+ do { -+ if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING, -+ &dc->disk.flags)) -+ break; -+ time_out--; -+ schedule_timeout_interruptible(1); -+ } while (time_out > 0); -+ -+ if (time_out == 0) -+ pr_warn("bcache: give up waiting for " -+ "dc->writeback_write_update to quit"); -+ -+ cancel_delayed_work_sync(&dc->writeback_rate_update); -+} -+ - static void cached_dev_detach_finish(struct work_struct *w) - { - struct cached_dev *dc = container_of(w, struct cached_dev, detach); -@@ -911,7 +937,9 @@ static void cached_dev_detach_finish(struct work_struct *w) - - mutex_lock(&bch_register_lock); - -- cancel_delayed_work_sync(&dc->writeback_rate_update); -+ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ cancel_writeback_rate_update_dwork(dc); -+ - if (!IS_ERR_OR_NULL(dc->writeback_thread)) { - kthread_stop(dc->writeback_thread); - dc->writeback_thread = NULL; -@@ -954,6 +982,7 @@ void bch_cached_dev_detach(struct cached_dev *dc) - closure_get(&dc->disk.cl); - - bch_writeback_queue(dc); -+ - cached_dev_put(dc); - } - -@@ -1079,14 +1108,16 @@ static void cached_dev_free(struct closure *cl) - { - struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); - -- cancel_delayed_work_sync(&dc->writeback_rate_update); -+ mutex_lock(&bch_register_lock); -+ -+ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ cancel_writeback_rate_update_dwork(dc); -+ - if (!IS_ERR_OR_NULL(dc->writeback_thread)) - kthread_stop(dc->writeback_thread); - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); - -- mutex_lock(&bch_register_lock); -- - if (atomic_read(&dc->running)) - bd_unlink_disk_holder(dc->bdev, dc->disk.disk); - bcache_device_free(&dc->disk); -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index a74a752c9e0f..b7166c504cdb 100644 ---- a/drivers/md/bcache/sysfs.c 
-+++ b/drivers/md/bcache/sysfs.c -@@ -304,7 +304,8 @@ STORE(bch_cached_dev) - bch_writeback_queue(dc); - - if (attr == &sysfs_writeback_percent) -- schedule_delayed_work(&dc->writeback_rate_update, -+ if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - - mutex_unlock(&bch_register_lock); -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index b280c134dd4d..69957f97bf13 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -115,6 +115,21 @@ static void update_writeback_rate(struct work_struct *work) - struct cached_dev, - writeback_rate_update); - -+ /* -+ * should check BCACHE_DEV_RATE_DW_RUNNING before calling -+ * cancel_delayed_work_sync(). -+ */ -+ set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); -+ -+ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); -+ return; -+ } -+ - down_read(&dc->writeback_lock); - - if (atomic_read(&dc->has_dirty) && -@@ -123,8 +138,18 @@ static void update_writeback_rate(struct work_struct *work) - - up_read(&dc->writeback_lock); - -- schedule_delayed_work(&dc->writeback_rate_update, -+ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); -+ } -+ -+ /* -+ * should check BCACHE_DEV_RATE_DW_RUNNING before calling -+ * cancel_delayed_work_sync(). 
-+ */ -+ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); - } - - static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) -@@ -661,6 +686,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) - dc->writeback_rate_p_term_inverse = 40; - dc->writeback_rate_i_term_inverse = 10000; - -+ WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); - INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); - } - -@@ -679,6 +705,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) - return PTR_ERR(dc->writeback_thread); - } - -+ WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - --- -2.15.1 - diff --git a/for-next/v2/v2-0006-bcache-set-error_limit-correctly.patch b/for-next/v2/v2-0006-bcache-set-error_limit-correctly.patch deleted file mode 100644 index 46124c9..0000000 --- a/for-next/v2/v2-0006-bcache-set-error_limit-correctly.patch +++ /dev/null @@ -1,121 +0,0 @@ -From 868e1b9ae94b4852555b4dcf5990b309c59f798b Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 9 Jan 2018 22:46:25 +0800 -Subject: [PATCH v2 06/12] bcache: set error_limit correctly - -Struct cache uses io_errors for two purposes, -- Error decay: when cache set error_decay is set, io_errors is used to - generate a small piece of delay when I/O error happens. -- I/O errors counter: in order to generate big enough value for error - decay, I/O errors counter value is stored by left shifting 20 bits (a.k.a - IO_ERROR_SHIFT). - -In function bch_count_io_errors(), if I/O errors counter reaches cache set -error limit, bch_cache_set_error() will be called to retire the whold cache -set. 
But current code is problematic when checking the error limit, see the -following code piece from bch_count_io_errors(), - - 90 if (error) { - 91 char buf[BDEVNAME_SIZE]; - 92 unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, - 93 &ca->io_errors); - 94 errors >>= IO_ERROR_SHIFT; - 95 - 96 if (errors < ca->set->error_limit) - 97 pr_err("%s: IO error on %s, recovering", - 98 bdevname(ca->bdev, buf), m); - 99 else -100 bch_cache_set_error(ca->set, -101 "%s: too many IO errors %s", -102 bdevname(ca->bdev, buf), m); -103 } - -At line 94, errors is right shifted by IO_ERROR_SHIFT bits, now it is real -errors counter to compare at line 96. But ca->set->error_limit is initia- -lized with an amplified value in bch_cache_set_alloc(), -1545 c->error_limit = 8 << IO_ERROR_SHIFT; - -It means by default, in bch_count_io_errors(), before 8<<20 errors happened -bch_cache_set_error() won't be called to retire the problematic cache -device. If the average request size is 64KB, it means bcache won't handle -failed device until 512GB data is requested. This is too large to be an I/O -threshold. So I believe the correct error limit should be much less. - -This patch sets default cache set error limit to 8, then in -bch_count_io_errors() when errors counter reaches 8 (if it is default -value), function bch_cache_set_error() will be called to retire the whole -cache set. This patch also removes bits shifting when storing or showing -io_error_limit value via sysfs interface. - -Nowadays most of SSDs handle internal flash failure automatically by LBA -address re-indirect mapping. If an I/O error can be observed by upper layer -code, it will be a notable error because that SSD can not re-indirect -map the problematic LBA address to an available flash block. This situation -indicates the whole SSD will be failed very soon. Therefore setting 8 as -the default io error limit value makes sense, it is enough for most of -cache devices. - -Changelog: -v2: add reviewed-by from Hannes. 
-v1: initial version for review. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 1 + - drivers/md/bcache/super.c | 2 +- - drivers/md/bcache/sysfs.c | 4 ++-- - 3 files changed, 4 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 88d938c8d027..7d7512fa4f09 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -663,6 +663,7 @@ struct cache_set { - ON_ERROR_UNREGISTER, - ON_ERROR_PANIC, - } on_error; -+#define DEFAULT_IO_ERROR_LIMIT 8 - unsigned error_limit; - unsigned error_decay; - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 6d888e8fea8c..a373648b5d4b 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1583,7 +1583,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) - - c->congested_read_threshold_us = 2000; - c->congested_write_threshold_us = 20000; -- c->error_limit = 8 << IO_ERROR_SHIFT; -+ c->error_limit = DEFAULT_IO_ERROR_LIMIT; - - return c; - err: -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index b7166c504cdb..ba62e987b503 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -560,7 +560,7 @@ SHOW(__bch_cache_set) - - /* See count_io_errors for why 88 */ - sysfs_print(io_error_halflife, c->error_decay * 88); -- sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); -+ sysfs_print(io_error_limit, c->error_limit); - - sysfs_hprint(congested, - ((uint64_t) bch_get_congested(c)) << 9); -@@ -660,7 +660,7 @@ STORE(__bch_cache_set) - } - - if (attr == &sysfs_io_error_limit) -- c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; -+ c->error_limit = strtoul_or_return(buf); - - /* See count_io_errors() for why 88 */ - if (attr == &sysfs_io_error_halflife) --- -2.15.1 - diff --git 
a/for-next/v2/v2-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch b/for-next/v2/v2-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch deleted file mode 100644 index 37631b2..0000000 --- a/for-next/v2/v2-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch +++ /dev/null @@ -1,489 +0,0 @@ -From f67f7eb1a237ff8409574ddafe8331f6ec3d6b88 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 13 Jan 2018 16:47:40 +0800 -Subject: [PATCH v2 07/12] bcache: add CACHE_SET_IO_DISABLE to struct cache_set - flags - -When too many I/Os failed on cache device, bch_cache_set_error() is called -in the error handling code path to retire whole problematic cache set. If -new I/O requests continue to come and take refcount dc->count, the cache -set won't be retired immediately, this is a problem. - -Furthermore, several kernel threads and self-armed kernel works -may still be running after bch_cache_set_error() is called. It needs to wait -quite a while for them to stop, or they won't stop at all. They also -prevent the cache set from being retired. - -The solution in this patch is, to add per cache set flag to disable I/O -request on this cache and all attached backing devices. Then new coming I/O -requests can be rejected in *_make_request() before taking refcount, kernel -threads and self-armed kernel worker can stop very fast when flags bit -CACHE_SET_IO_DISABLE is set. - -Because bcache also does internal I/Os for writeback, garbage collection, -bucket allocation, journaling, this kind of I/O should be disabled after -bch_cache_set_error() is called. So closure_bio_submit() is modified to -check whether CACHE_SET_IO_DISABLE is set on cache_set->flags. If set, -closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and -return, generic_make_request() won't be called. - -A sysfs interface is also added to set or clear CACHE_SET_IO_DISABLE bit -from cache_set->flags, to disable or enable cache set I/O for debugging. 
It -is helpful to trigger more corner case issues for failed cache device. - -Changelog -v2, -- use cache_set->flags to set io disable bit, suggested by Junhui. -- check CACHE_SET_IO_DISABLE in bch_btree_gc() to stop a while-loop, this - is reported and inspired from origal patch of Pavel Vazharov. -v1, initial version. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Pavel Vazharov <freakpv@gmail.com> ---- - drivers/md/bcache/alloc.c | 3 ++- - drivers/md/bcache/bcache.h | 18 ++++++++++++++++++ - drivers/md/bcache/btree.c | 10 +++++++--- - drivers/md/bcache/io.c | 2 +- - drivers/md/bcache/journal.c | 4 ++-- - drivers/md/bcache/request.c | 26 +++++++++++++++++++------- - drivers/md/bcache/super.c | 6 +++++- - drivers/md/bcache/sysfs.c | 20 ++++++++++++++++++++ - drivers/md/bcache/util.h | 6 ------ - drivers/md/bcache/writeback.c | 35 +++++++++++++++++++++++++++-------- - 10 files changed, 101 insertions(+), 29 deletions(-) - -diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c -index 458e1d38577d..004cc3cc6123 100644 ---- a/drivers/md/bcache/alloc.c -+++ b/drivers/md/bcache/alloc.c -@@ -287,7 +287,8 @@ do { \ - break; \ - \ - mutex_unlock(&(ca)->set->bucket_lock); \ -- if (kthread_should_stop()) { \ -+ if (kthread_should_stop() || \ -+ test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) { \ - set_current_state(TASK_RUNNING); \ - return 0; \ - } \ -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 7d7512fa4f09..c41736960045 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -475,10 +475,15 @@ struct gc_stat { - * - * CACHE_SET_RUNNING means all cache devices have been registered and journal - * replay is complete. -+ * -+ * CACHE_SET_IO_DISABLE is set when bcache is stopping the whold cache set, all -+ * external and internal I/O should be denied when this flag is set. 
-+ * - */ - #define CACHE_SET_UNREGISTERING 0 - #define CACHE_SET_STOPPING 1 - #define CACHE_SET_RUNNING 2 -+#define CACHE_SET_IO_DISABLE 4 - - struct cache_set { - struct closure cl; -@@ -862,6 +867,19 @@ static inline void wake_up_allocators(struct cache_set *c) - wake_up_process(ca->alloc_thread); - } - -+static inline void closure_bio_submit(struct cache_set *c, -+ struct bio *bio, -+ struct closure *cl) -+{ -+ closure_get(cl); -+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return; -+ } -+ generic_make_request(bio); -+} -+ - /* Forward declarations */ - - void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index bf3a48aa9a9a..0a0bc63011b4 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ -1744,6 +1744,7 @@ static void bch_btree_gc(struct cache_set *c) - - btree_gc_start(c); - -+ /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ - do { - ret = btree_root(gc_root, c, &op, &writes, &stats); - closure_sync(&writes); -@@ -1751,7 +1752,7 @@ static void bch_btree_gc(struct cache_set *c) - - if (ret && ret != -EAGAIN) - pr_warn("gc failed!"); -- } while (ret); -+ } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); - - bch_btree_gc_finish(c); - wake_up_allocators(c); -@@ -1789,9 +1790,12 @@ static int bch_gc_thread(void *arg) - - while (1) { - wait_event_interruptible(c->gc_wait, -- kthread_should_stop() || gc_should_run(c)); -+ kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags) || -+ gc_should_run(c)); - -- if (kthread_should_stop()) -+ if (kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) - break; - - set_gc_sectors(c); -diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c -index a783c5a41ff1..8013ecbcdbda 100644 ---- a/drivers/md/bcache/io.c -+++ b/drivers/md/bcache/io.c -@@ -38,7 +38,7 @@ void 
__bch_submit_bbio(struct bio *bio, struct cache_set *c) - bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); - - b->submit_time_us = local_clock_us(); -- closure_bio_submit(bio, bio->bi_private); -+ closure_bio_submit(c, bio, bio->bi_private); - } - - void bch_submit_bbio(struct bio *bio, struct cache_set *c, -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index a87165c1d8e5..979873641030 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -62,7 +62,7 @@ reread: left = ca->sb.bucket_size - offset; - bio_set_op_attrs(bio, REQ_OP_READ, 0); - bch_bio_map(bio, data); - -- closure_bio_submit(bio, &cl); -+ closure_bio_submit(ca->set, bio, &cl); - closure_sync(&cl); - - /* This function could be simpler now since we no longer write -@@ -653,7 +653,7 @@ static void journal_write_unlocked(struct closure *cl) - spin_unlock(&c->journal.lock); - - while ((bio = bio_list_pop(&list))) -- closure_bio_submit(bio, cl); -+ closure_bio_submit(c, bio, cl); - - continue_at(cl, journal_write_done, NULL); - } -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 1a46b41dac70..02296bda6384 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -747,7 +747,7 @@ static void cached_dev_read_error(struct closure *cl) - - /* XXX: invalidate cache */ - -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } - - continue_at(cl, cached_dev_cache_miss_done, NULL); -@@ -872,7 +872,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - s->cache_miss = miss; - s->iop.bio = cache_bio; - bio_get(cache_bio); -- closure_bio_submit(cache_bio, &s->cl); -+ closure_bio_submit(s->iop.c, cache_bio, &s->cl); - - return ret; - out_put: -@@ -880,7 +880,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - out_submit: - miss->bi_end_io = request_endio; - miss->bi_private = &s->cl; -- closure_bio_submit(miss, &s->cl); -+ closure_bio_submit(s->iop.c, 
miss, &s->cl); - return ret; - } - -@@ -945,7 +945,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - - if ((bio_op(bio) != REQ_OP_DISCARD) || - blk_queue_discard(bdev_get_queue(dc->bdev))) -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } else if (s->iop.writeback) { - bch_writeback_add(dc); - s->iop.bio = bio; -@@ -960,12 +960,12 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - flush->bi_private = cl; - flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - -- closure_bio_submit(flush, cl); -+ closure_bio_submit(s->iop.c, flush, cl); - } - } else { - s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); - -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } - - closure_call(&s->iop.cl, bch_data_insert, NULL, cl); -@@ -981,7 +981,7 @@ static void cached_dev_nodata(struct closure *cl) - bch_journal_meta(s->iop.c, cl); - - /* If it's a flush, we send the flush to the backing device too */ -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - - continue_at(cl, cached_dev_bio_complete, NULL); - } -@@ -996,6 +996,12 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - int rw = bio_data_dir(bio); - -+ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return BLK_QC_T_NONE; -+ } -+ - atomic_set(&dc->backing_idle, 0); - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - -@@ -1112,6 +1118,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, - struct bcache_device *d = bio->bi_disk->private_data; - int rw = bio_data_dir(bio); - -+ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return BLK_QC_T_NONE; -+ } -+ - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - - s 
= search_alloc(bio, d); -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index a373648b5d4b..4204d75aee7b 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -521,7 +521,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, - bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags); - bch_bio_map(bio, ca->disk_buckets); - -- closure_bio_submit(bio, &ca->prio); -+ closure_bio_submit(ca->set, bio, &ca->prio); - closure_sync(cl); - } - -@@ -1349,6 +1349,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) - test_bit(CACHE_SET_STOPPING, &c->flags)) - return false; - -+ if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) -+ pr_warn("bcache: CACHE_SET_IO_DISABLE already set"); -+ - /* XXX: we can be called from atomic context - acquire_console_sem(); - */ -@@ -1584,6 +1587,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) - c->congested_read_threshold_us = 2000; - c->congested_write_threshold_us = 20000; - c->error_limit = DEFAULT_IO_ERROR_LIMIT; -+ WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); - - return c; - err: -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index ba62e987b503..afb051bcfca1 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -92,6 +92,7 @@ read_attribute(partial_stripes_expensive); - - rw_attribute(synchronous); - rw_attribute(journal_delay_ms); -+rw_attribute(io_disable); - rw_attribute(discard); - rw_attribute(running); - rw_attribute(label); -@@ -577,6 +578,8 @@ SHOW(__bch_cache_set) - sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); - sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); - sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); -+ sysfs_printf(io_disable, "%i", -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)); - - if (attr == &sysfs_bset_tree_stats) - return bch_bset_print_stats(c, buf); -@@ -666,6 +669,22 @@ STORE(__bch_cache_set) - if (attr == 
&sysfs_io_error_halflife) - c->error_decay = strtoul_or_return(buf) / 88; - -+ if (attr == &sysfs_io_disable) { -+ int v = strtoul_or_return(buf); -+ -+ if (v) { -+ if (test_and_set_bit(CACHE_SET_IO_DISABLE, -+ &c->flags)) -+ pr_warn("bcache: CACHE_SET_IO_DISABLE" -+ " already set"); -+ } else { -+ if (!test_and_clear_bit(CACHE_SET_IO_DISABLE, -+ &c->flags)) -+ pr_warn("bcache: CACHE_SET_IO_DISABLE" -+ " already cleared"); -+ } -+ } -+ - sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); - sysfs_strtoul(verify, c->verify); - sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); -@@ -748,6 +767,7 @@ static struct attribute *bch_cache_set_internal_files[] = { - &sysfs_gc_always_rewrite, - &sysfs_btree_shrinker_disabled, - &sysfs_copy_gc_enabled, -+ &sysfs_io_disable, - NULL - }; - KTYPE(bch_cache_set_internal); -diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h -index 4df4c5c1cab2..7944eea54fa9 100644 ---- a/drivers/md/bcache/util.h -+++ b/drivers/md/bcache/util.h -@@ -565,12 +565,6 @@ static inline sector_t bdev_sectors(struct block_device *bdev) - return bdev->bd_inode->i_size >> 9; - } - --#define closure_bio_submit(bio, cl) \ --do { \ -- closure_get(cl); \ -- generic_make_request(bio); \ --} while (0) -- - uint64_t bch_crc64_update(uint64_t, const void *, size_t); - uint64_t bch_crc64(const void *, size_t); - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 69957f97bf13..e97e2afead3e 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -114,6 +114,7 @@ static void update_writeback_rate(struct work_struct *work) - struct cached_dev *dc = container_of(to_delayed_work(work), - struct cached_dev, - writeback_rate_update); -+ struct cache_set *c = dc->disk.c; - - /* - * should check BCACHE_DEV_RATE_DW_RUNNING before calling -@@ -123,7 +124,12 @@ static void update_writeback_rate(struct work_struct *work) - /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); 
- -- if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ /* -+ * CACHE_SET_IO_DISABLE might be set via sysfs interface, -+ * check it here too. -+ */ -+ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); - /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); -@@ -138,7 +144,12 @@ static void update_writeback_rate(struct work_struct *work) - - up_read(&dc->writeback_lock); - -- if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ /* -+ * CACHE_SET_IO_DISABLE might be set via sysfs interface, -+ * check it here too. -+ */ -+ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - } -@@ -278,7 +289,7 @@ static void write_dirty(struct closure *cl) - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; - -- closure_bio_submit(&io->bio, cl); -+ closure_bio_submit(io->dc->disk.c, &io->bio, cl); - } - - atomic_set(&dc->writeback_sequence_next, next_sequence); -@@ -304,7 +315,7 @@ static void read_dirty_submit(struct closure *cl) - { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); - -- closure_bio_submit(&io->bio, cl); -+ closure_bio_submit(io->dc->disk.c, &io->bio, cl); - - continue_at(cl, write_dirty, io->dc->writeback_write_wq); - } -@@ -330,7 +341,9 @@ static void read_dirty(struct cached_dev *dc) - - next = bch_keybuf_next(&dc->writeback_keys); - -- while (!kthread_should_stop() && next) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && -+ next) { - size = 0; - nk = 0; - -@@ -427,7 +440,9 @@ static void read_dirty(struct cached_dev *dc) - } - } - -- while (!kthread_should_stop() && delay) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && -+ delay) { - 
schedule_timeout_interruptible(delay); - delay = writeback_delay(dc, 0); - } -@@ -583,11 +598,13 @@ static bool refill_dirty(struct cached_dev *dc) - static int bch_writeback_thread(void *arg) - { - struct cached_dev *dc = arg; -+ struct cache_set *c = dc->disk.c; - bool searched_full_index; - - bch_ratelimit_reset(&dc->writeback_rate); - -- while (!kthread_should_stop()) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - down_write(&dc->writeback_lock); - set_current_state(TASK_INTERRUPTIBLE); - if (!atomic_read(&dc->has_dirty) || -@@ -595,7 +612,8 @@ static int bch_writeback_thread(void *arg) - !dc->writeback_running)) { - up_write(&dc->writeback_lock); - -- if (kthread_should_stop()) { -+ if (kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - set_current_state(TASK_RUNNING); - break; - } -@@ -623,6 +641,7 @@ static int bch_writeback_thread(void *arg) - - while (delay && - !kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags) && - !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) - delay = schedule_timeout_interruptible(delay); - --- -2.15.1 - diff --git a/for-next/v2/v2-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch b/for-next/v2/v2-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch deleted file mode 100644 index 391b334..0000000 --- a/for-next/v2/v2-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch +++ /dev/null @@ -1,67 +0,0 @@ -From d5fe9ac0c5814dbb33ccff476bf927e55a31e216 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 10 Jan 2018 00:26:32 +0800 -Subject: [PATCH v2 08/12] bcache: stop all attached bcache devices for a - retired cache set - -When there are too many I/O errors on cache device, current bcache code -will retire the whole cache set, and detach all bcache devices. But the -detached bcache devices are not stopped, which is problematic when bcache -is in writeback mode. 
- -If the retired cache set has dirty data of backing devices, continuing -to write to bcache device will write to backing device directly. If the -LBA of write request has a dirty version cached on cache device, next time -when the cache device is re-registered and backing device re-attached to -it again, the stale dirty data on cache device will be written to backing -device, and overwrite latest directly written data. This situation causes -a silent data corruption. - -This patch checks whether cache_set->io_disable is true in -__cache_set_unregister(). If cache_set->io_disable is true, it means cache -set is being unregistered due to too many I/O errors, then all attached bcache -devices will be stopped as well. If cache_set->io_disable is not true, it -means __cache_set_unregister() is triggered by writing 1 to sysfs file -/sys/fs/bcache/<UUID>/bcache/stop. This is an exception because users do -it explicitly, this patch keeps existing behavior and does not stop any -bcache device. - -Even if the failed cache device has no dirty data, stopping bcache device is -still a desired behavior by many Ceph and database users. Then their -application will report I/O errors due to disappeared bcache device, and -operation people will know the cache device is broken or disconnected. - -Changelog: -v2: add reviewed-by from Hannes. -v1: initial version for review. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> ---- - drivers/md/bcache/super.c | 8 ++++++++ - 1 file changed, 8 insertions(+) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 4204d75aee7b..97e3bb8e1aee 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1478,6 +1478,14 @@ static void __cache_set_unregister(struct closure *cl) - dc = container_of(c->devices[i], - struct cached_dev, disk); - bch_cached_dev_detach(dc); -+ /* -+ * If we come here by too many I/O errors, -+ * bcache device should be stopped too, to -+ * keep data consistency on cache and -+ * backing devices. -+ */ -+ if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) -+ bcache_device_stop(c->devices[i]); - } else { - bcache_device_stop(c->devices[i]); - } --- -2.15.1 - diff --git a/for-next/v2/v2-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch b/for-next/v2/v2-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch deleted file mode 100644 index 6dc4aad..0000000 --- a/for-next/v2/v2-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch +++ /dev/null @@ -1,118 +0,0 @@ -From 2ba5a1d14df44bfca8f0c27e13328e6766c5b387 Mon Sep 17 00:00:00 2001 -From: Tang Junhui <tang.junhui@zte.com.cn> -Date: Tue, 9 Jan 2018 10:27:11 +0800 -Subject: [PATCH v2 09/12] bcache: fix inaccurate io state for detached bcache - devices - -When we run IO in a detached device, and run iostat to shows IO status, -normally it will show like bellow (Omitted some fields): -Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util -sdd ... 15.89 0.53 1.82 0.20 2.23 1.81 52.30 -bcache0 ... 15.89 115.42 0.00 0.00 0.00 2.40 69.60 -but after IO stopped, there are still very big avgqu-sz and %util -values as bellow: -Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util -bcache0 ... 
0 5326.32 0.00 0.00 0.00 0.00 100.10 - -The reason for this issue is that, only generic_start_io_acct() called -and no generic_end_io_acct() called for detached device in -cached_dev_make_request(). See the code: -//start generic_start_io_acct() -generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); -if (cached_dev_get(dc)) { - //will callback generic_end_io_acct() -} -else { - //will not call generic_end_io_acct() -} - -This patch calls generic_end_io_acct() in the end of IO for detached -devices, so we can show IO state correctly. - -(Modified to use GFP_NOIO in kzalloc() by Coly Li) - -Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> -Reviewed-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/request.c | 58 +++++++++++++++++++++++++++++++++++++++------ - 1 file changed, 51 insertions(+), 7 deletions(-) - -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 02296bda6384..e09c5ae745be 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -986,6 +986,55 @@ static void cached_dev_nodata(struct closure *cl) - continue_at(cl, cached_dev_bio_complete, NULL); - } - -+struct detached_dev_io_private { -+ struct bcache_device *d; -+ unsigned long start_time; -+ bio_end_io_t *bi_end_io; -+ void *bi_private; -+}; -+ -+static void detatched_dev_end_io(struct bio *bio) -+{ -+ struct detached_dev_io_private *ddip; -+ -+ ddip = bio->bi_private; -+ bio->bi_end_io = ddip->bi_end_io; -+ bio->bi_private = ddip->bi_private; -+ -+ generic_end_io_acct(ddip->d->disk->queue, -+ bio_data_dir(bio), -+ &ddip->d->disk->part0, ddip->start_time); -+ -+ kfree(ddip); -+ -+ bio->bi_end_io(bio); -+} -+ -+static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) -+{ -+ struct detached_dev_io_private *ddip; -+ struct cached_dev *dc = container_of(d, struct cached_dev, disk); -+ -+ /* -+ * no need to call closure_get(&dc->disk.cl), -+ * because upper layer had already opened bcache device, -+ * which would call 
closure_get(&dc->disk.cl) -+ */ -+ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); -+ ddip->d = d; -+ ddip->start_time = jiffies; -+ ddip->bi_end_io = bio->bi_end_io; -+ ddip->bi_private = bio->bi_private; -+ bio->bi_end_io = detatched_dev_end_io; -+ bio->bi_private = ddip; -+ -+ if ((bio_op(bio) == REQ_OP_DISCARD) && -+ !blk_queue_discard(bdev_get_queue(dc->bdev))) -+ bio->bi_end_io(bio); -+ else -+ generic_make_request(bio); -+} -+ - /* Cached devices - read & write stuff */ - - static blk_qc_t cached_dev_make_request(struct request_queue *q, -@@ -1028,13 +1077,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - else - cached_dev_read(dc, s); - } -- } else { -- if ((bio_op(bio) == REQ_OP_DISCARD) && -- !blk_queue_discard(bdev_get_queue(dc->bdev))) -- bio_endio(bio); -- else -- generic_make_request(bio); -- } -+ } else -+ detached_dev_do_request(d, bio); - - return BLK_QC_T_NONE; - } --- -2.15.1 - diff --git a/for-next/v2/v2-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch b/for-next/v2/v2-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch deleted file mode 100644 index b1462a7..0000000 --- a/for-next/v2/v2-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch +++ /dev/null @@ -1,254 +0,0 @@ -From 2692ba986ec25127ee7ac904db109584ec53d44a Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 10 Jan 2018 21:01:48 +0800 -Subject: [PATCH v2 10/12] bcache: add backing_request_endio() for bi_end_io of - attached backing device I/O - -In order to catch I/O error of backing device, a separate bi_end_io -call back is required. Then a per backing device counter can record I/O -errors number and retire the backing device if the counter reaches a -per backing device I/O error limit. - -This patch adds backing_request_endio() to bcache backing device I/O code -path, this is a preparation for further complicated backing device failure -handling. 
So far there is no real code logic change, I make this change a -separate patch to make sure it is stable and reliable for further work. - -Changelog: -v2: indeed this is new added in this patch set. - -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> ---- - drivers/md/bcache/request.c | 95 +++++++++++++++++++++++++++++++++++-------- - drivers/md/bcache/super.c | 1 + - drivers/md/bcache/writeback.c | 1 + - 3 files changed, 81 insertions(+), 16 deletions(-) - -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index e09c5ae745be..ad4cf71f7eab 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -139,6 +139,7 @@ static void bch_data_invalidate(struct closure *cl) - } - - op->insert_data_done = true; -+ /* get in bch_data_insert() */ - bio_put(bio); - out: - continue_at(cl, bch_data_insert_keys, op->wq); -@@ -630,6 +631,38 @@ static void request_endio(struct bio *bio) - closure_put(cl); - } - -+static void backing_request_endio(struct bio *bio) -+{ -+ struct closure *cl = bio->bi_private; -+ -+ if (bio->bi_status) { -+ struct search *s = container_of(cl, struct search, cl); -+ /* -+ * If a bio has REQ_PREFLUSH for writeback mode, it is -+ * speically assembled in cached_dev_write() for a non-zero -+ * write request which has REQ_PREFLUSH. we don't set -+ * s->iop.status by this failure, the status will be decided -+ * by result of bch_data_insert() operation. 
-+ */ -+ if (unlikely(s->iop.writeback && -+ bio->bi_opf & REQ_PREFLUSH)) { -+ char buf[BDEVNAME_SIZE]; -+ -+ bio_devname(bio, buf); -+ pr_err("Can't flush %s: returned bi_status %i", -+ buf, bio->bi_status); -+ } else { -+ /* set to orig_bio->bi_status in bio_complete() */ -+ s->iop.status = bio->bi_status; -+ } -+ s->recoverable = false; -+ /* should count I/O error for backing device here */ -+ } -+ -+ bio_put(bio); -+ closure_put(cl); -+} -+ - static void bio_complete(struct search *s) - { - if (s->orig_bio) { -@@ -644,13 +677,21 @@ static void bio_complete(struct search *s) - } - } - --static void do_bio_hook(struct search *s, struct bio *orig_bio) -+static void do_bio_hook(struct search *s, -+ struct bio *orig_bio, -+ bio_end_io_t *end_io_fn) - { - struct bio *bio = &s->bio.bio; - - bio_init(bio, NULL, 0); - __bio_clone_fast(bio, orig_bio); -- bio->bi_end_io = request_endio; -+ /* -+ * bi_end_io can be set separately somewhere else, e.g. the -+ * variants in, -+ * - cache_bio->bi_end_io from cached_dev_cache_miss() -+ * - n->bi_end_io from cache_lookup_fn() -+ */ -+ bio->bi_end_io = end_io_fn; - bio->bi_private = &s->cl; - - bio_cnt_set(bio, 3); -@@ -676,7 +717,7 @@ static inline struct search *search_alloc(struct bio *bio, - s = mempool_alloc(d->c->search, GFP_NOIO); - - closure_init(&s->cl, NULL); -- do_bio_hook(s, bio); -+ do_bio_hook(s, bio, request_endio); - - s->orig_bio = bio; - s->cache_miss = NULL; -@@ -743,10 +784,11 @@ static void cached_dev_read_error(struct closure *cl) - trace_bcache_read_retry(s->orig_bio); - - s->iop.status = 0; -- do_bio_hook(s, s->orig_bio); -+ do_bio_hook(s, s->orig_bio, backing_request_endio); - - /* XXX: invalidate cache */ - -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, bio, cl); - } - -@@ -859,7 +901,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - bio_copy_dev(cache_bio, miss); - cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; - -- cache_bio->bi_end_io = 
request_endio; -+ cache_bio->bi_end_io = backing_request_endio; - cache_bio->bi_private = &s->cl; - - bch_bio_map(cache_bio, NULL); -@@ -872,14 +914,16 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - s->cache_miss = miss; - s->iop.bio = cache_bio; - bio_get(cache_bio); -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, cache_bio, &s->cl); - - return ret; - out_put: - bio_put(cache_bio); - out_submit: -- miss->bi_end_io = request_endio; -+ miss->bi_end_io = backing_request_endio; - miss->bi_private = &s->cl; -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, miss, &s->cl); - return ret; - } -@@ -943,31 +987,48 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - s->iop.bio = s->orig_bio; - bio_get(s->iop.bio); - -- if ((bio_op(bio) != REQ_OP_DISCARD) || -- blk_queue_discard(bdev_get_queue(dc->bdev))) -- closure_bio_submit(s->iop.c, bio, cl); -+ if (bio_op(bio) == REQ_OP_DISCARD && -+ !blk_queue_discard(bdev_get_queue(dc->bdev))) -+ goto insert_data; -+ -+ /* I/O request sent to backing device */ -+ bio->bi_end_io = backing_request_endio; -+ closure_bio_submit(s->iop.c, bio, cl); -+ - } else if (s->iop.writeback) { - bch_writeback_add(dc); - s->iop.bio = bio; - - if (bio->bi_opf & REQ_PREFLUSH) { -- /* Also need to send a flush to the backing device */ -- struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, -- dc->disk.bio_split); -- -+ /* -+ * Also need to send a flush to the backing -+ * device, if failed on backing device. 
-+ */ -+ struct bio *flush; -+ -+ flush = bio_alloc_bioset(GFP_NOIO, 0, -+ dc->disk.bio_split); -+ if (!flush) { -+ s->iop.status = BLK_STS_RESOURCE; -+ goto insert_data; -+ } - bio_copy_dev(flush, bio); -- flush->bi_end_io = request_endio; -+ flush->bi_end_io = backing_request_endio; - flush->bi_private = cl; - flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; -- -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, flush, cl); - } -+ bch_writeback_add(dc); -+ - } else { - s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); -- -+ /* I/O request sent to backing device */ -+ bio->bi_end_io = backing_request_endio; - closure_bio_submit(s->iop.c, bio, cl); - } - -+insert_data: - closure_call(&s->iop.cl, bch_data_insert, NULL, cl); - continue_at(cl, cached_dev_write_complete, NULL); - } -@@ -981,6 +1042,7 @@ static void cached_dev_nodata(struct closure *cl) - bch_journal_meta(s->iop.c, cl); - - /* If it's a flush, we send the flush to the backing device too */ -+ bio->bi_end_io = backing_request_endio; - closure_bio_submit(s->iop.c, bio, cl); - - continue_at(cl, cached_dev_bio_complete, NULL); -@@ -1078,6 +1140,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - cached_dev_read(dc, s); - } - } else -+ /* I/O request sent to backing device */ - detached_dev_do_request(d, bio); - - return BLK_QC_T_NONE; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 97e3bb8e1aee..08a0b541a4da 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -265,6 +265,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) - bio->bi_private = dc; - - closure_get(cl); -+ /* I/O request sent to backing device */ - __write_super(&dc->sb, bio); - - closure_return_with_destructor(cl, bch_write_bdev_super_unlock); -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index e97e2afead3e..878103b9f2ec 100644 ---- a/drivers/md/bcache/writeback.c -+++ 
b/drivers/md/bcache/writeback.c -@@ -289,6 +289,7 @@ static void write_dirty(struct closure *cl) - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; - -+ /* I/O request sent to backing device */ - closure_bio_submit(io->dc->disk.c, &io->bio, cl); - } - --- -2.15.1 - diff --git a/for-next/v2/v2-0011-bcache-add-io_disable-to-struct-cached_dev.patch b/for-next/v2/v2-0011-bcache-add-io_disable-to-struct-cached_dev.patch deleted file mode 100644 index 5158dc4..0000000 --- a/for-next/v2/v2-0011-bcache-add-io_disable-to-struct-cached_dev.patch +++ /dev/null @@ -1,235 +0,0 @@ -From 8631a1c8cf5e224282680bda1d590776f9960a33 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 10 Jan 2018 21:33:45 +0800 -Subject: [PATCH v2 11/12] bcache: add io_disable to struct cached_dev - -If a bcache device is configured to writeback mode, current code does not -handle write I/O errors on backing devices properly. - -In writeback mode, write request is written to cache device, and -later flushed to backing device. If I/O failed when writing from -cache device to the backing device, bcache code just ignores the error and -upper layer code is NOT notified that the backing device is broken. - -This patch tries to handle backing device failure like how the cache device -failure is handled, -- Add an error counter 'io_errors' and error limit 'error_limit' in struct - cached_dev. Add another io_disable to struct cached_dev to disable I/Os - on the problematic backing device. -- When I/O error happens on backing device, increase io_errors counter. And - if io_errors reaches error_limit, set cached_dev->io_disable to true, and - stop the bcache device. - -The result is, if backing device is broken or disconnected, and I/O errors -reach its error limit, backing device will be disabled and the associated -bcache device will be removed from system. - -Changelog: -v2: indeed this is newly added in v2 patch set. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 7 +++++++ - drivers/md/bcache/io.c | 14 ++++++++++++++ - drivers/md/bcache/request.c | 14 ++++++++++++-- - drivers/md/bcache/super.c | 22 ++++++++++++++++++++++ - drivers/md/bcache/sysfs.c | 15 ++++++++++++++- - 5 files changed, 69 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index c41736960045..5a811959392d 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -360,6 +360,7 @@ struct cached_dev { - unsigned sequential_cutoff; - unsigned readahead; - -+ unsigned io_disable:1; - unsigned verify:1; - unsigned bypass_torture_test:1; - -@@ -379,6 +380,10 @@ struct cached_dev { - unsigned writeback_rate_i_term_inverse; - unsigned writeback_rate_p_term_inverse; - unsigned writeback_rate_minimum; -+ -+#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 -+ atomic_t io_errors; -+ unsigned error_limit; - }; - - enum alloc_reserve { -@@ -882,6 +887,7 @@ static inline void closure_bio_submit(struct cache_set *c, - - /* Forward declarations */ - -+void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); - void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); - void bch_bbio_count_io_errors(struct cache_set *, struct bio *, - blk_status_t, const char *); -@@ -909,6 +915,7 @@ int bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, bool); - bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, - unsigned, unsigned, bool); -+bool bch_cached_dev_error(struct cached_dev *dc); - - __printf(2, 3) - bool bch_cache_set_error(struct cache_set *, const char *, ...); -diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c -index 8013ecbcdbda..7fac97ae036e 100644 ---- a/drivers/md/bcache/io.c -+++ b/drivers/md/bcache/io.c -@@ -50,6 +50,20 @@ void 
bch_submit_bbio(struct bio *bio, struct cache_set *c, - } - - /* IO errors */ -+void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) -+{ -+ char buf[BDEVNAME_SIZE]; -+ unsigned errors; -+ -+ WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); -+ -+ errors = atomic_add_return(1, &dc->io_errors); -+ if (errors < dc->error_limit) -+ pr_err("%s: IO error on backing device, unrecoverable", -+ bio_devname(bio, buf)); -+ else -+ bch_cached_dev_error(dc); -+} - - void bch_count_io_errors(struct cache *ca, - blk_status_t error, -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index ad4cf71f7eab..386b388ce296 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -637,6 +637,8 @@ static void backing_request_endio(struct bio *bio) - - if (bio->bi_status) { - struct search *s = container_of(cl, struct search, cl); -+ struct cached_dev *dc = container_of(s->d, -+ struct cached_dev, disk); - /* - * If a bio has REQ_PREFLUSH for writeback mode, it is - * speically assembled in cached_dev_write() for a non-zero -@@ -657,6 +659,7 @@ static void backing_request_endio(struct bio *bio) - } - s->recoverable = false; - /* should count I/O error for backing device here */ -+ bch_count_backing_io_errors(dc, bio); - } - - bio_put(bio); -@@ -1067,8 +1070,14 @@ static void detatched_dev_end_io(struct bio *bio) - bio_data_dir(bio), - &ddip->d->disk->part0, ddip->start_time); - -- kfree(ddip); -+ if (bio->bi_status) { -+ struct cached_dev *dc = container_of(ddip->d, -+ struct cached_dev, disk); -+ /* should count I/O error for backing device here */ -+ bch_count_backing_io_errors(dc, bio); -+ } - -+ kfree(ddip); - bio->bi_end_io(bio); - } - -@@ -1107,7 +1116,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - int rw = bio_data_dir(bio); - -- if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ if (unlikely((d->c && 
test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || -+ dc->io_disable)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - return BLK_QC_T_NONE; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 08a0b541a4da..14fce3623770 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1188,6 +1188,10 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) - max(dc->disk.disk->queue->backing_dev_info->ra_pages, - q->backing_dev_info->ra_pages); - -+ atomic_set(&dc->io_errors, 0); -+ dc->io_disable = false; -+ dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; -+ - bch_cached_dev_request_init(dc); - bch_cached_dev_writeback_init(dc); - return 0; -@@ -1339,6 +1343,24 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) - return flash_dev_run(c, u); - } - -+bool bch_cached_dev_error(struct cached_dev *dc) -+{ -+ char name[BDEVNAME_SIZE]; -+ -+ if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) -+ return false; -+ -+ dc->io_disable = true; -+ /* make others know io_disable is true earlier */ -+ smp_mb(); -+ -+ pr_err("bcache: stop %s: too many IO errors on backing device %s\n", -+ dc->disk.name, bdevname(dc->bdev, name)); -+ -+ bcache_device_stop(&dc->disk); -+ return true; -+} -+ - /* Cache set */ - - __printf(2, 3) -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index afb051bcfca1..7288927f2a47 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -131,7 +131,9 @@ SHOW(__bch_cached_dev) - var_print(writeback_delay); - var_print(writeback_percent); - sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); -- -+ sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); -+ sysfs_printf(io_error_limit, "%i", dc->error_limit); -+ sysfs_printf(io_disable, "%i", dc->io_disable); - var_print(writeback_rate_update_seconds); - var_print(writeback_rate_i_term_inverse); - var_print(writeback_rate_p_term_inverse); -@@ -223,6 +225,14 @@ STORE(__cached_dev) - 
d_strtoul(writeback_rate_i_term_inverse); - d_strtoul_nonzero(writeback_rate_p_term_inverse); - -+ sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); -+ -+ if (attr == &sysfs_io_disable) { -+ int v = strtoul_or_return(buf); -+ -+ dc->io_disable = v ? 1 : 0; -+ } -+ - d_strtoi_h(sequential_cutoff); - d_strtoi_h(readahead); - -@@ -330,6 +340,9 @@ static struct attribute *bch_cached_dev_files[] = { - &sysfs_writeback_rate_i_term_inverse, - &sysfs_writeback_rate_p_term_inverse, - &sysfs_writeback_rate_debug, -+ &sysfs_errors, -+ &sysfs_io_error_limit, -+ &sysfs_io_disable, - &sysfs_dirty_data, - &sysfs_stripe_size, - &sysfs_partial_stripes_expensive, --- -2.15.1 - diff --git a/for-next/v2/v2-0012-bcache-stop-bcache-device-when-backing-device-is-.patch b/for-next/v2/v2-0012-bcache-stop-bcache-device-when-backing-device-is-.patch deleted file mode 100644 index 7b3f6b1..0000000 --- a/for-next/v2/v2-0012-bcache-stop-bcache-device-when-backing-device-is-.patch +++ /dev/null @@ -1,151 +0,0 @@ -From e32b3038feba37429496f12a997fa59ea889d2ed Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 13 Jan 2018 17:31:44 +0800 -Subject: [PATCH v2 12/12] bcache: stop bcache device when backing device is - offline - -Currently bcache does not handle backing device failure, if backing -device is offline and disconnected from system, its bcache device can still -be accessible. If the bcache device is in writeback mode, I/O requests even -can success if the requests hit on cache device. That is to say, when and -how bcache handles offline backing device is undefined. - -This patch tries to handle backing device offline in a rather simple way, -- Add cached_dev->status_update_thread kernel thread to update backing - device status in every 1 second. -- Add cached_dev->offline_seconds to record how many seconds the backing - device is observed to be offline. 
If the backing device is offline for - BACKING_DEV_OFFLINE_TIMEOUT (5) seconds, set dc->io_disable to 1 and - call bcache_device_stop() to stop the bcache device which is linked to the - offline backing device. - -Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds, -its bcache device will be removed, then user space application writing on -it will get error immediately, and handle the device failure in time. - -This patch is quite simple, does not handle more complicated situations. -Once the bcache device is stopped, users need to recover the backing -device, register and attach it manually. - -Changelog: -v2: this is newly added in v2 patch set. - -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 2 ++ - drivers/md/bcache/super.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++ - 2 files changed, 57 insertions(+) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 5a811959392d..9eedb35d01bc 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -338,6 +338,7 @@ struct cached_dev { - - struct keybuf writeback_keys; - -+ struct task_struct *status_update_thread; - /* - * Order the write-half of writeback operations strongly in dispatch - * order. 
(Maintain LBA order; don't allow reads completing out of -@@ -384,6 +385,7 @@ struct cached_dev { - #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 - atomic_t io_errors; - unsigned error_limit; -+ unsigned offline_seconds; - }; - - enum alloc_reserve { -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 14fce3623770..85adf1e29d11 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -646,6 +646,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode, - unsigned int cmd, unsigned long arg) - { - struct bcache_device *d = b->bd_disk->private_data; -+ struct cached_dev *dc = container_of(d, struct cached_dev, disk); -+ -+ if (dc->io_disable) -+ return -EIO; -+ - return d->ioctl(d, mode, cmd, arg); - } - -@@ -856,6 +861,45 @@ static void calc_cached_dev_sectors(struct cache_set *c) - c->cached_dev_sectors = sectors; - } - -+#define BACKING_DEV_OFFLINE_TIMEOUT 5 -+static int cached_dev_status_update(void *arg) -+{ -+ struct cached_dev *dc = arg; -+ struct request_queue *q; -+ char buf[BDEVNAME_SIZE]; -+ -+ /* -+ * If this delayed worker is stopping outside, directly quit here. -+ * dc->io_disable might be set via sysfs interface, so check it -+ * here too. 
-+ */ -+ while (!kthread_should_stop() && !dc->io_disable) { -+ q = bdev_get_queue(dc->bdev); -+ if (blk_queue_dying(q)) -+ dc->offline_seconds++; -+ else -+ dc->offline_seconds = 0; -+ -+ if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) { -+ pr_err("%s: device offline for %d seconds", -+ bdevname(dc->bdev, buf), -+ BACKING_DEV_OFFLINE_TIMEOUT); -+ pr_err("%s: disable I/O request due to backing " -+ "device offline", dc->disk.name); -+ dc->io_disable = true; -+ /* let others know earlier that io_disable is true */ -+ smp_mb(); -+ bcache_device_stop(&dc->disk); -+ break; -+ } -+ -+ schedule_timeout_interruptible(HZ); -+ } -+ -+ dc->status_update_thread = NULL; -+ return 0; -+} -+ - void bch_cached_dev_run(struct cached_dev *dc) - { - struct bcache_device *d = &dc->disk; -@@ -898,6 +942,15 @@ void bch_cached_dev_run(struct cached_dev *dc) - if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || - sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) - pr_debug("error creating sysfs link"); -+ -+ dc->status_update_thread = kthread_run(cached_dev_status_update, -+ dc, -+ "bcache_status_update"); -+ if (IS_ERR(dc->status_update_thread)) { -+ pr_warn("bcache: failed to create bcache_status_update " -+ "kthread, continue to run without monitoring backing " -+ "device status"); -+ } - } - - /* -@@ -1118,6 +1171,8 @@ static void cached_dev_free(struct closure *cl) - kthread_stop(dc->writeback_thread); - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); -+ if (!IS_ERR_OR_NULL(dc->status_update_thread)) -+ kthread_stop(dc->status_update_thread); - - if (atomic_read(&dc->running)) - bd_unlink_disk_holder(dc->bdev, dc->disk.disk); --- -2.15.1 - diff --git a/for-next/v3/v3-0000-cover-letter.patch b/for-next/v3/v3-0000-cover-letter.patch deleted file mode 100644 index 3a5de04..0000000 --- a/for-next/v3/v3-0000-cover-letter.patch +++ /dev/null @@ -1,92 +0,0 @@ -From 93be9a0e7f3112074702dd070c07818b2fe3d568 Mon Sep 17 
00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 14 Jan 2018 22:21:43 +0800 -Subject: [PATCH v3 00/13] bcache: device failure handling improvement - -Hi maintainers and folks, - -This patch set tries to improve bcache device failure handling, includes -cache device and backing device failures. - -The basic idea to handle failed cache device is, -- Unregister cache set -- Detach all backing devices which are attached to this cache set -- Stop all the detached bcache devices -- Stop all flash only volume on the cache set -The above process is named 'cache set retire' by me. The result of cache -set retire is, cache set and bcache devices are all removed, following -I/O requests will get failed immediately to notify upper layer or user -space code that the cache device is failed or disconnected. - -For failed backing device, there are two kinds of failures to handle, -- If device is disconnected, and kernel thread dc->status_update_thread - finds it is offline for BACKING_DEV_OFFLINE_TIMEOUT (5) seconds, the - kernel thread will set dc->io_disable and call bcache_device_stop() to - stop and remove the bcache device from system. -- If device is alive but returns too many I/O errors, after errors number - exceeds dc->error_limit, call bch_cached_dev_error() to set - dc->io_disable and stop bcache device. Then the broken backing device - and its bcache device will be removed from system. - -The v3 patch set adds one more patch to fix the detach issue found in -v2 patch set. - -A basic testing covered with writethrough, writeback, writearound mode, and -read/write/readwrite workloads, cache set or bcache device can be removed -by too many I/O errors or delete the device. For plugging out physical -disks, a kernel bug triggers rcu oops in __do_softirq() and locks up all -following accesses to the disconnected disk, this blocks my testing. 
- -Open issues: -1, A kernel bug in __do_softirq() when plugging out hard disk with heavy - I/O blocks my physical disk disconnection test. This is not problem - introduced from this patch set, if any one knows this bug, please give - me a hint. - -Changelog: -v3: fix detach issue find in v2 patch set. -v2: fixes all problems found in v1 review. - add patches to handle backing device failure. - add one more patch to set writeback_rate_update_seconds range. - include a patch from Junhui Tang. -v1: the initial version, only handles cache device failure. - -Any comment, question and review are warmly welcome. Thanks in advance. - -Coly Li ---- - -Coly Li (12): - bcache: set writeback_rate_update_seconds in range [1, 60] seconds - bcache: properly set task state in bch_writeback_thread() - bcache: set task properly in allocator_wait() - bcache: fix cached_dev->count usage for bch_cache_set_error() - bcache: quit dc->writeback_thread when BCACHE_DEV_DETACHING is set - bcache: stop dc->writeback_rate_update properly - bcache: set error_limit correctly - bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags - bcache: stop all attached bcache devices for a retired cache set - bcache: add backing_request_endio() for bi_end_io of attached backing - device I/O - bcache: add io_disable to struct cached_dev - bcache: stop bcache device when backing device is offline - -Tang Junhui (1): - bcache: fix inaccurate io state for detached bcache devices - - drivers/md/bcache/alloc.c | 5 +- - drivers/md/bcache/bcache.h | 37 ++++++++- - drivers/md/bcache/btree.c | 10 ++- - drivers/md/bcache/io.c | 16 +++- - drivers/md/bcache/journal.c | 4 +- - drivers/md/bcache/request.c | 187 +++++++++++++++++++++++++++++++++++------- - drivers/md/bcache/super.c | 134 ++++++++++++++++++++++++++++-- - drivers/md/bcache/sysfs.c | 45 +++++++++- - drivers/md/bcache/util.h | 6 -- - drivers/md/bcache/writeback.c | 99 ++++++++++++++++++---- - drivers/md/bcache/writeback.h | 5 +- - 11 files changed, 474 
insertions(+), 74 deletions(-) - --- -2.15.1 - diff --git a/for-next/v3/v3-0001-bcache-set-writeback_rate_update_seconds-in-range.patch b/for-next/v3/v3-0001-bcache-set-writeback_rate_update_seconds-in-range.patch deleted file mode 100644 index 23b0003..0000000 --- a/for-next/v3/v3-0001-bcache-set-writeback_rate_update_seconds-in-range.patch +++ /dev/null @@ -1,72 +0,0 @@ -From 8fc565e105fdfa08d42bde28056088cdb0f93fbc Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 13 Jan 2018 15:11:03 +0800 -Subject: [PATCH v3 01/13] bcache: set writeback_rate_update_seconds in range - [1, 60] seconds - -dc->writeback_rate_update_seconds can be set via sysfs and its value can -be set to [1, ULONG_MAX]. It does not make sense to set such a large -value, 60 seconds is long enough value considering the default 5 seconds -works well for long time. - -Because dc->writeback_rate_update is a special delayed work, it re-arms -itself inside the delayed work routine update_writeback_rate(). When -stopping it by cancel_delayed_work_sync(), there should be a timeout to -wait and make sure the re-armed delayed work is stopped too. A small max -value of dc->writeback_rate_update_seconds is also helpful to decide a -reasonable small timeout. - -This patch limits sysfs interface to set dc->writeback_rate_update_seconds -in range of [1, 60] seconds, and replaces the hand-coded number by macros. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/sysfs.c | 3 +++ - drivers/md/bcache/writeback.c | 2 +- - drivers/md/bcache/writeback.h | 3 +++ - 3 files changed, 7 insertions(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index b4184092c727..a74a752c9e0f 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -215,6 +215,9 @@ STORE(__cached_dev) - sysfs_strtoul_clamp(writeback_rate, - dc->writeback_rate.rate, 1, INT_MAX); - -+ sysfs_strtoul_clamp(writeback_rate_update_seconds, -+ dc->writeback_rate_update_seconds, -+ 1, WRITEBACK_RATE_UPDATE_SECS_MAX); - d_strtoul_nonzero(writeback_rate_update_seconds); - d_strtoul(writeback_rate_i_term_inverse); - d_strtoul_nonzero(writeback_rate_p_term_inverse); -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 51306a19ab03..0ade883b6316 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -652,7 +652,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) - dc->writeback_rate.rate = 1024; - dc->writeback_rate_minimum = 8; - -- dc->writeback_rate_update_seconds = 5; -+ dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; - dc->writeback_rate_p_term_inverse = 40; - dc->writeback_rate_i_term_inverse = 10000; - -diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h -index 66f1c527fa24..587b25599856 100644 ---- a/drivers/md/bcache/writeback.h -+++ b/drivers/md/bcache/writeback.h -@@ -8,6 +8,9 @@ - #define MAX_WRITEBACKS_IN_PASS 5 - #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ - -+#define WRITEBACK_RATE_UPDATE_SECS_MAX 60 -+#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 -+ - /* - * 14 (16384ths) is chosen here as something that each backing device - * should be a reasonable fraction of the share, and not to blow up --- -2.15.1 - diff --git a/for-next/v3/v3-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch 
b/for-next/v3/v3-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch deleted file mode 100644 index a2844e3..0000000 --- a/for-next/v3/v3-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch +++ /dev/null @@ -1,91 +0,0 @@ -From 48dd314ed3cad040372dec28ddc55991fb3be870 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 8 Jan 2018 22:11:01 +0800 -Subject: [PATCH v3 02/13] bcache: properly set task state in - bch_writeback_thread() - -Kernel thread routine bch_writeback_thread() has the following code block, - -447 down_write(&dc->writeback_lock); -448~450 if (check conditions) { -451 up_write(&dc->writeback_lock); -452 set_current_state(TASK_INTERRUPTIBLE); -453 -454 if (kthread_should_stop()) -455 return 0; -456 -457 schedule(); -458 continue; -459 } - -If condition check is true, its task state is set to TASK_INTERRUPTIBLE -and call schedule() to wait for others to wake up it. - -There are 2 issues in current code, -1, Task state is set to TASK_INTERRUPTIBLE after the condition checks, if - another process changes the condition and call wake_up_process(dc-> - writeback_thread), then at line 452 task state is set back to - TASK_INTERRUPTIBLE, the writeback kernel thread will lose a chance to be - waken up. -2, At line 454 if kthread_should_stop() is true, writeback kernel thread - will return to kernel/kthread.c:kthread() with TASK_INTERRUPTIBLE and - call do_exit(). It is not good to enter do_exit() with task state - TASK_INTERRUPTIBLE, in following code path might_sleep() is called and a - warning message is reported by __might_sleep(): "WARNING: do not call - blocking ops when !TASK_RUNNING; state=1 set at [xxxx]". - -For the first issue, task state should be set before condition checks. -Indeed because dc->writeback_lock is required when modifying all the -conditions, calling set_current_state() inside code block where dc-> -writeback_lock is held is safe. 
But this is quite implicit, so I still move -set_current_state() before all the condition checks. - -For the second issue, frankley speaking it does not hurt when kernel thread -exits with TASK_INTERRUPTIBLE state, but this warning message scares users, -makes them feel there might be something risky with bcache and hurt their -data. Setting task state to TASK_RUNNING before returning fixes this -problem. - -Changelog: -v2: fix the race issue in v1 patch. -v1: initial buggy fix. - -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.de> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/writeback.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 0ade883b6316..f1d2fc15abcc 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -564,18 +564,21 @@ static int bch_writeback_thread(void *arg) - - while (!kthread_should_stop()) { - down_write(&dc->writeback_lock); -+ set_current_state(TASK_INTERRUPTIBLE); - if (!atomic_read(&dc->has_dirty) || - (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && - !dc->writeback_running)) { - up_write(&dc->writeback_lock); -- set_current_state(TASK_INTERRUPTIBLE); - -- if (kthread_should_stop()) -+ if (kthread_should_stop()) { -+ set_current_state(TASK_RUNNING); - return 0; -+ } - - schedule(); - continue; - } -+ set_current_state(TASK_RUNNING); - - searched_full_index = refill_dirty(dc); - --- -2.15.1 - diff --git a/for-next/v3/v3-0003-bcache-set-task-properly-in-allocator_wait.patch b/for-next/v3/v3-0003-bcache-set-task-properly-in-allocator_wait.patch deleted file mode 100644 index a665eb5..0000000 --- a/for-next/v3/v3-0003-bcache-set-task-properly-in-allocator_wait.patch +++ /dev/null @@ -1,65 +0,0 @@ -From 37adf3e3e864ef985da85787b9662faed23ddf25 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 8 Jan 2018 22:45:51 
+0800 -Subject: [PATCH v3 03/13] bcache: set task properly in allocator_wait() - -Kernel thread routine bch_allocator_thread() references macro -allocator_wait() to wait for a condition or quit to do_exit() -when kthread_should_stop() is true. Here is the code block, - -284 while (1) { \ -285 set_current_state(TASK_INTERRUPTIBLE); \ -286 if (cond) \ -287 break; \ -288 \ -289 mutex_unlock(&(ca)->set->bucket_lock); \ -290 if (kthread_should_stop()) \ -291 return 0; \ -292 \ -293 schedule(); \ -294 mutex_lock(&(ca)->set->bucket_lock); \ -295 } \ -296 __set_current_state(TASK_RUNNING); \ - -At line 285, task state is set to TASK_INTERRUPTIBLE, if at line 290 -kthread_should_stop() is true, the kernel thread will terminate and return -to kernel/kthread.c:kthread(), then calls do_exit() with TASK_INTERRUPTIBLE -state. This is not a suggested behavior and a warning message will be -reported by might_sleep() in do_exit() code path: "WARNING: do not call -blocking ops when !TASK_RUNNING; state=1 set at [xxxx]". - -This patch fixes this problem by setting task state to TASK_RUNNING if -kthread_should_stop() is true and before kernel thread returns back to -kernel/kthread.c:kthread(). - -Changelog: -v2: fix the race issue in v1 patch. -v1: initial buggy fix. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.de> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/alloc.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c -index 6cc6c0f9c3a9..458e1d38577d 100644 ---- a/drivers/md/bcache/alloc.c -+++ b/drivers/md/bcache/alloc.c -@@ -287,8 +287,10 @@ do { \ - break; \ - \ - mutex_unlock(&(ca)->set->bucket_lock); \ -- if (kthread_should_stop()) \ -+ if (kthread_should_stop()) { \ -+ set_current_state(TASK_RUNNING); \ - return 0; \ -+ } \ - \ - schedule(); \ - mutex_lock(&(ca)->set->bucket_lock); \ --- -2.15.1 - diff --git a/for-next/v3/v3-0004-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch b/for-next/v3/v3-0004-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch deleted file mode 100644 index 2aa735e..0000000 --- a/for-next/v3/v3-0004-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch +++ /dev/null @@ -1,178 +0,0 @@ -From fffa2563deae795ee82d5e46f089d68ca13a9864 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 8 Jan 2018 23:05:58 +0800 -Subject: [PATCH v3 04/13] bcache: fix cached_dev->count usage for - bch_cache_set_error() - -When bcache metadata I/O fails, bcache will call bch_cache_set_error() -to retire the whole cache set. The expected behavior to retire a cache -set is to unregister the cache set, and unregister all backing device -attached to this cache set, then remove sysfs entries of the cache set -and all attached backing devices, finally release memory of structs -cache_set, cache, cached_dev and bcache_device. - -In my testing when journal I/O failure triggered by disconnected cache -device, sometimes the cache set cannot be retired, and its sysfs -entry /sys/fs/bcache/<uuid> still exits and the backing device also -references it. This is not expected behavior. 
- -When metadata I/O fails, the call sequence to retire whole cache set is, - bch_cache_set_error() - bch_cache_set_unregister() - bch_cache_set_stop() - __cache_set_unregister() <- called as callback by calling - closure_queue(&c->caching) - cache_set_flush() <- called as a callback when refcount - of cache_set->caching is 0 - cache_set_free() <- called as a callback when refcount - of cache_set->cl is 0 - bch_cache_set_release() <- called as a callback when refcount - of cache_set->kobj is 0 - -I find if kernel thread bch_writeback_thread() quits while-loop when -kthread_should_stop() is true and searched_full_index is false, closure -callback cache_set_flush() set by continue_at() will never be called. The -result is, bcache fails to retire whole cache set. - -cache_set_flush() will be called when refcount of closure c->caching is 0, -and in function bcache_device_detach() refcount of closure c->caching is -released to 0 by closure_put(). In metadata error code path, function -bcache_device_detach() is called by cached_dev_detach_finish(). This is a -callback routine being called when cached_dev->count is 0. This refcount -is decreased by cached_dev_put(). - -The above dependence indicates, cache_set_flush() will be called when -refcount of cache_set->cl is 0, and refcount of cache_set->cl to be 0 -when refcount of cached_dev->count is 0. - -The reason why sometimes cached_dev->count is not 0 (when metadata I/O fails -and bch_cache_set_error() called) is, in bch_writeback_thread(), refcount -of cached_dev is not decreased properly. - -In bch_writeback_thread(), cached_dev_put() is called only when -searched_full_index is true and cached_dev->writeback_keys is empty, a.k.a -there is no dirty data on cache. In most of run time it is correct, but -when bch_writeback_thread() quits the while-loop while cache is still -dirty, current code forgets to call cached_dev_put() before this kernel -thread exits. 
This is why sometimes cache_set_flush() is not executed and -cache set fails to be retired. - -The reason to call cached_dev_put() in bch_writeback_thread() is, when the -cache device changes from clean to dirty, cached_dev_get() is called, to -make sure during writeback operations both backing and cache devices -won't be released. - -Adding following code in bch_writeback_thread() does not work, - static int bch_writeback_thread(void *arg) - } - -+ if (atomic_read(&dc->has_dirty)) -+ cached_dev_put() -+ - return 0; - } -because writeback kernel thread can be waken up and start via sysfs entry: - echo 1 > /sys/block/bcache<N>/bcache/writeback_running -It is difficult to check whether backing device is dirty without race and -extra lock. So the above modification will introduce potential refcount -underflow in some conditions. - -The correct fix is, to take cached dev refcount when creating the kernel -thread, and put it before the kernel thread exits. Then bcache does not -need to take a cached dev refcount when cache turns from clean to dirty, -or to put a cached dev refcount when cache turns from dirty to clean. The -writeback kernel thread is always safe to reference data structure from -cache set, cache and cached device (because a refcount of cache device is -taken for it already), and no matter the kernel thread is stopped by I/O -errors or system reboot, cached_dev->count can always be used correctly. - -The patch is simple, but understanding how it works is quite complicated. - -Changelog: -v2: set dc->writeback_thread to NULL in this patch, as suggested by Hannes. -v1: initial version for review. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/super.c | 1 - - drivers/md/bcache/writeback.c | 11 ++++++++--- - drivers/md/bcache/writeback.h | 2 -- - 3 files changed, 8 insertions(+), 6 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 133b81225ea9..d14e09cce2f6 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1052,7 +1052,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) - if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - bch_sectors_dirty_init(&dc->disk); - atomic_set(&dc->has_dirty, 1); -- refcount_inc(&dc->count); - bch_writeback_queue(dc); - } - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index f1d2fc15abcc..b280c134dd4d 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -572,7 +572,7 @@ static int bch_writeback_thread(void *arg) - - if (kthread_should_stop()) { - set_current_state(TASK_RUNNING); -- return 0; -+ break; - } - - schedule(); -@@ -585,7 +585,6 @@ static int bch_writeback_thread(void *arg) - if (searched_full_index && - RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { - atomic_set(&dc->has_dirty, 0); -- cached_dev_put(dc); - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); - } -@@ -606,6 +605,9 @@ static int bch_writeback_thread(void *arg) - } - } - -+ dc->writeback_thread = NULL; -+ cached_dev_put(dc); -+ - return 0; - } - -@@ -669,10 +671,13 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) - if (!dc->writeback_write_wq) - return -ENOMEM; - -+ cached_dev_get(dc); - dc->writeback_thread = kthread_create(bch_writeback_thread, dc, - "bcache_writeback"); -- if (IS_ERR(dc->writeback_thread)) -+ if (IS_ERR(dc->writeback_thread)) { -+ cached_dev_put(dc); - return PTR_ERR(dc->writeback_thread); -+ } - - 
schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); -diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h -index 587b25599856..0bba8f1c6cdf 100644 ---- a/drivers/md/bcache/writeback.h -+++ b/drivers/md/bcache/writeback.h -@@ -105,8 +105,6 @@ static inline void bch_writeback_add(struct cached_dev *dc) - { - if (!atomic_read(&dc->has_dirty) && - !atomic_xchg(&dc->has_dirty, 1)) { -- refcount_inc(&dc->count); -- - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); - /* XXX: should do this synchronously */ --- -2.15.1 - diff --git a/for-next/v3/v3-0005-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch b/for-next/v3/v3-0005-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch deleted file mode 100644 index 3d5bf0c..0000000 --- a/for-next/v3/v3-0005-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch +++ /dev/null @@ -1,83 +0,0 @@ -From 36747b38396db80de96d650e88f79cc82d284dff Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 14 Jan 2018 21:41:57 +0800 -Subject: [PATCH v3 05/13] bcache: quit dc->writeback_thread when - BCACHE_DEV_DETACHING is set - -In patch "bcache: fix cached_dev->count usage for bch_cache_set_error()", -cached_dev_get() is called when creating dc->writeback_thread, and -cached_dev_put() is called when exiting dc->writeback_thread. This -modification works well unless people detach the bcache device manually by - 'echo 1 > /sys/block/bcache<N>/bcache/detach' -Because this sysfs interface only calls bch_cached_dev_detach() which wakes -up dc->writeback_thread but does not stop it. The reason is, before patch -"bcache: fix cached_dev->count usage for bch_cache_set_error()", inside -bch_writeback_thread(), if cache is not dirty after writeback, -cached_dev_put() will be called here. And in cached_dev_make_request() when -a new write request makes cache from clean to dirty, cached_dev_get() will -be called there. 
Since we don't operate dc->count in these locations, -refcount dc->count cannot be dropped after cache becomes clean, and -cached_dev_detach_finish() won't be called to detach bcache device. - -This patch fixes the issue by checking whether BCACHE_DEV_DETACHING is -set inside bch_writeback_thread(). If this bit is set and cache is clean -(no existing writeback_keys), break the while-loop, call cached_dev_put() -and quit the writeback thread. - -Please note if cache is still dirty, even BCACHE_DEV_DETACHING is set the -writeback thread should continue to perform writeback, this is the original -design of manually detach. - -I compose a separate patch because that patch "bcache: fix cached_dev->count -usage for bch_cache_set_error()" already gets a "Reviewed-by:" from Hannes -Reinecke. Also this fix is not trivial and good for a separate patch. - -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/writeback.c | 20 +++++++++++++++++--- - 1 file changed, 17 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index b280c134dd4d..4dbeaaa575bf 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -565,9 +565,15 @@ static int bch_writeback_thread(void *arg) - while (!kthread_should_stop()) { - down_write(&dc->writeback_lock); - set_current_state(TASK_INTERRUPTIBLE); -- if (!atomic_read(&dc->has_dirty) || -- (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && -- !dc->writeback_running)) { -+ /* -+ * If the bache device is detaching, skip here and continue -+ * to perform writeback. Otherwise, if no dirty data on cache, -+ * or there is dirty data on cache but writeback is disabled, -+ * the writeback thread should sleep here and wait for others -+ * to wake up it. 
-+ */ -+ if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && -+ (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { - up_write(&dc->writeback_lock); - - if (kthread_should_stop()) { -@@ -587,6 +593,14 @@ static int bch_writeback_thread(void *arg) - atomic_set(&dc->has_dirty, 0); - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); -+ /* -+ * If bcache device is detaching via sysfs interface, -+ * writeback thread should stop after there is no dirty -+ * data on cache. BCACHE_DEV_DETACHING flag is set in -+ * bch_cached_dev_detach(). -+ */ -+ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) -+ break; - } - - up_write(&dc->writeback_lock); --- -2.15.1 - diff --git a/for-next/v3/v3-0006-bcache-stop-dc-writeback_rate_update-properly.patch b/for-next/v3/v3-0006-bcache-stop-dc-writeback_rate_update-properly.patch deleted file mode 100644 index 2859d42..0000000 --- a/for-next/v3/v3-0006-bcache-stop-dc-writeback_rate_update-properly.patch +++ /dev/null @@ -1,266 +0,0 @@ -From c89453e05ab7c96442a17e8aa634d82719534125 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 13 Jan 2018 15:48:39 +0800 -Subject: [PATCH v3 06/13] bcache: stop dc->writeback_rate_update properly - -struct delayed_work writeback_rate_update in struct cache_dev is a delayed -worker to call function update_writeback_rate() in period (the interval is -defined by dc->writeback_rate_update_seconds). - -When a metadate I/O error happens on cache device, bcache error handling -routine bch_cache_set_error() will call bch_cache_set_unregister() to -retire whole cache set. On the unregister code path, this delayed work is -stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update). - -dc->writeback_rate_update is a special delayed work from others in bcache. -In its routine update_writeback_rate(), this delayed work is re-armed -itself. 
That means when cancel_delayed_work_sync() returns, this delayed -work can still be executed after several seconds defined by -dc->writeback_rate_update_seconds. - -The problem is, after cancel_delayed_work_sync() returns, the cache set -unregister code path will continue and release memory of struct cache_set. -Then the delayed work is scheduled to run, __update_writeback_rate() -will reference the already released cache_set memory, and trigger a NULL -pointer dereference fault. - -This patch introduces two more bcache device flags, -- BCACHE_DEV_WB_RUNNING - bit set: bcache device is in writeback mode and running, it is OK for - dc->writeback_rate_update to re-arm itself. - bit clear: bcache device is trying to stop dc->writeback_rate_update, - this delayed work should not re-arm itself and quit. -- BCACHE_DEV_RATE_DW_RUNNING - bit set: routine update_writeback_rate() is executing. - bit clear: routine update_writeback_rate() quits. - -This patch also adds a function cancel_writeback_rate_update_dwork() to -wait for dc->writeback_rate_update to quit before cancelling it by calling -cancel_delayed_work_sync(). In order to avoid a deadlock by unexpected -quit dc->writeback_rate_update, after time_out seconds this function will -give up and continue to call cancel_delayed_work_sync(). - -And here I explain how this patch stops self re-armed delayed work properly -with the above flags. - -update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning -and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling -cancel_writeback_rate_update_dwork() clear flag BCACHE_DEV_WB_RUNNING. - -Before calling cancel_delayed_work_sync() wait until flag -BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling -cancel_delayed_work_sync(), dc->writeback_rate_update must be already re- -armed, or quit by seeing BCACHE_DEV_WB_RUNNING cleared. In both cases -delayed work routine update_writeback_rate() won't be executed after -cancel_delayed_work_sync() returns. 
- -Inside update_writeback_rate() before calling schedule_delayed_work(), flag -BCACHE_DEV_WB_RUNNING is checked first. If this flag is cleared, it means -someone is about to stop the delayed work. Because flag -BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync() -has to wait for this flag to be cleared, we don't need to worry about race -condition here. - -If update_writeback_rate() is scheduled to run after checking -BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync() -in cancel_writeback_rate_update_dwork(), it is also safe, because at this -moment BCACHE_DEV_WB_RUNNING is cleared with memory barrier. As I mentioned -previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear -and quit immediately. - -Because there are more dependencies inside update_writeback_rate() on struct -cache_set memory, dc->writeback_rate_update is not a simple self re-arm -delayed work. After trying many different methods (e.g. hold dc->count, or -use locks), this is the only way I can find which works to properly stop -dc->writeback_rate_update delayed work. - -Changelog: -v2: Try to fix the race issue which is pointed out by Junhui. 
-v1: The initial version for review - -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 9 +++++---- - drivers/md/bcache/super.c | 39 +++++++++++++++++++++++++++++++++++---- - drivers/md/bcache/sysfs.c | 3 ++- - drivers/md/bcache/writeback.c | 29 ++++++++++++++++++++++++++++- - 4 files changed, 70 insertions(+), 10 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 5e2d4e80198e..88d938c8d027 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -258,10 +258,11 @@ struct bcache_device { - struct gendisk *disk; - - unsigned long flags; --#define BCACHE_DEV_CLOSING 0 --#define BCACHE_DEV_DETACHING 1 --#define BCACHE_DEV_UNLINK_DONE 2 -- -+#define BCACHE_DEV_CLOSING 0 -+#define BCACHE_DEV_DETACHING 1 -+#define BCACHE_DEV_UNLINK_DONE 2 -+#define BCACHE_DEV_WB_RUNNING 4 -+#define BCACHE_DEV_RATE_DW_RUNNING 8 - unsigned nr_stripes; - unsigned stripe_size; - atomic_t *stripe_sectors_dirty; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index d14e09cce2f6..6d888e8fea8c 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -899,6 +899,32 @@ void bch_cached_dev_run(struct cached_dev *dc) - pr_debug("error creating sysfs link"); - } - -+/* -+ * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed -+ * work dc->writeback_rate_update is running. Wait until the routine -+ * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to -+ * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out -+ * seconds, give up waiting here and continue to cancel it too. 
-+ */ -+static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) -+{ -+ int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ; -+ -+ do { -+ if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING, -+ &dc->disk.flags)) -+ break; -+ time_out--; -+ schedule_timeout_interruptible(1); -+ } while (time_out > 0); -+ -+ if (time_out == 0) -+ pr_warn("bcache: give up waiting for " -+ "dc->writeback_write_update to quit"); -+ -+ cancel_delayed_work_sync(&dc->writeback_rate_update); -+} -+ - static void cached_dev_detach_finish(struct work_struct *w) - { - struct cached_dev *dc = container_of(w, struct cached_dev, detach); -@@ -911,7 +937,9 @@ static void cached_dev_detach_finish(struct work_struct *w) - - mutex_lock(&bch_register_lock); - -- cancel_delayed_work_sync(&dc->writeback_rate_update); -+ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ cancel_writeback_rate_update_dwork(dc); -+ - if (!IS_ERR_OR_NULL(dc->writeback_thread)) { - kthread_stop(dc->writeback_thread); - dc->writeback_thread = NULL; -@@ -954,6 +982,7 @@ void bch_cached_dev_detach(struct cached_dev *dc) - closure_get(&dc->disk.cl); - - bch_writeback_queue(dc); -+ - cached_dev_put(dc); - } - -@@ -1079,14 +1108,16 @@ static void cached_dev_free(struct closure *cl) - { - struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); - -- cancel_delayed_work_sync(&dc->writeback_rate_update); -+ mutex_lock(&bch_register_lock); -+ -+ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ cancel_writeback_rate_update_dwork(dc); -+ - if (!IS_ERR_OR_NULL(dc->writeback_thread)) - kthread_stop(dc->writeback_thread); - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); - -- mutex_lock(&bch_register_lock); -- - if (atomic_read(&dc->running)) - bd_unlink_disk_holder(dc->bdev, dc->disk.disk); - bcache_device_free(&dc->disk); -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index a74a752c9e0f..b7166c504cdb 100644 ---- a/drivers/md/bcache/sysfs.c 
-+++ b/drivers/md/bcache/sysfs.c -@@ -304,7 +304,8 @@ STORE(bch_cached_dev) - bch_writeback_queue(dc); - - if (attr == &sysfs_writeback_percent) -- schedule_delayed_work(&dc->writeback_rate_update, -+ if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - - mutex_unlock(&bch_register_lock); -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 4dbeaaa575bf..8f98ef1038d3 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -115,6 +115,21 @@ static void update_writeback_rate(struct work_struct *work) - struct cached_dev, - writeback_rate_update); - -+ /* -+ * should check BCACHE_DEV_RATE_DW_RUNNING before calling -+ * cancel_delayed_work_sync(). -+ */ -+ set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); -+ -+ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); -+ return; -+ } -+ - down_read(&dc->writeback_lock); - - if (atomic_read(&dc->has_dirty) && -@@ -123,8 +138,18 @@ static void update_writeback_rate(struct work_struct *work) - - up_read(&dc->writeback_lock); - -- schedule_delayed_work(&dc->writeback_rate_update, -+ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); -+ } -+ -+ /* -+ * should check BCACHE_DEV_RATE_DW_RUNNING before calling -+ * cancel_delayed_work_sync(). 
-+ */ -+ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); - } - - static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) -@@ -675,6 +700,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) - dc->writeback_rate_p_term_inverse = 40; - dc->writeback_rate_i_term_inverse = 10000; - -+ WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); - INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); - } - -@@ -693,6 +719,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) - return PTR_ERR(dc->writeback_thread); - } - -+ WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - --- -2.15.1 - diff --git a/for-next/v3/v3-0007-bcache-set-error_limit-correctly.patch b/for-next/v3/v3-0007-bcache-set-error_limit-correctly.patch deleted file mode 100644 index 84acd7c..0000000 --- a/for-next/v3/v3-0007-bcache-set-error_limit-correctly.patch +++ /dev/null @@ -1,121 +0,0 @@ -From be7b78080d36c040af2cef65ab08a5df77122248 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 9 Jan 2018 22:46:25 +0800 -Subject: [PATCH v3 07/13] bcache: set error_limit correctly - -Struct cache uses io_errors for two purposes, -- Error decay: when cache set error_decay is set, io_errors is used to - generate a small piece of delay when I/O error happens. -- I/O errors counter: in order to generate big enough value for error - decay, I/O errors counter value is stored by left shifting 20 bits (a.k.a - IO_ERROR_SHIFT). - -In function bch_count_io_errors(), if I/O errors counter reaches cache set -error limit, bch_cache_set_error() will be called to retire the whold cache -set. 
But current code is problematic when checking the error limit, see the -following code piece from bch_count_io_errors(), - - 90 if (error) { - 91 char buf[BDEVNAME_SIZE]; - 92 unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, - 93 &ca->io_errors); - 94 errors >>= IO_ERROR_SHIFT; - 95 - 96 if (errors < ca->set->error_limit) - 97 pr_err("%s: IO error on %s, recovering", - 98 bdevname(ca->bdev, buf), m); - 99 else -100 bch_cache_set_error(ca->set, -101 "%s: too many IO errors %s", -102 bdevname(ca->bdev, buf), m); -103 } - -At line 94, errors is right shifting IO_ERROR_SHIFT bits, now it is real -errors counter to compare at line 96. But ca->set->error_limit is initia- -lized with an amplified value in bch_cache_set_alloc(), -1545 c->error_limit = 8 << IO_ERROR_SHIFT; - -It means by default, in bch_count_io_errors(), before 8<<20 errors happened -bch_cache_set_error() won't be called to retire the problematic cache -device. If the average request size is 64KB, it means bcache won't handle -failed device until 512GB data is requested. This is too large to be an I/O -threashold. So I believe the correct error limit should be much less. - -This patch sets default cache set error limit to 8, then in -bch_count_io_errors() when errors counter reaches 8 (if it is default -value), function bch_cache_set_error() will be called to retire the whole -cache set. This patch also removes bits shifting when store or show -io_error_limit value via sysfs interface. - -Nowadays most of SSDs handle internal flash failure automatically by LBA -address re-indirect mapping. If an I/O error can be observed by upper layer -code, it will be a notable error because that SSD can not re-indirect -map the problematic LBA address to an available flash block. This situation -indicates the whole SSD will be failed very soon. Therefore setting 8 as -the default io error limit value makes sense, it is enough for most of -cache devices. - -Changelog: -v2: add reviewed-by from Hannes. 
-v1: initial version for review. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 1 + - drivers/md/bcache/super.c | 2 +- - drivers/md/bcache/sysfs.c | 4 ++-- - 3 files changed, 4 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 88d938c8d027..7d7512fa4f09 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -663,6 +663,7 @@ struct cache_set { - ON_ERROR_UNREGISTER, - ON_ERROR_PANIC, - } on_error; -+#define DEFAULT_IO_ERROR_LIMIT 8 - unsigned error_limit; - unsigned error_decay; - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 6d888e8fea8c..a373648b5d4b 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1583,7 +1583,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) - - c->congested_read_threshold_us = 2000; - c->congested_write_threshold_us = 20000; -- c->error_limit = 8 << IO_ERROR_SHIFT; -+ c->error_limit = DEFAULT_IO_ERROR_LIMIT; - - return c; - err: -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index b7166c504cdb..ba62e987b503 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -560,7 +560,7 @@ SHOW(__bch_cache_set) - - /* See count_io_errors for why 88 */ - sysfs_print(io_error_halflife, c->error_decay * 88); -- sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); -+ sysfs_print(io_error_limit, c->error_limit); - - sysfs_hprint(congested, - ((uint64_t) bch_get_congested(c)) << 9); -@@ -660,7 +660,7 @@ STORE(__bch_cache_set) - } - - if (attr == &sysfs_io_error_limit) -- c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; -+ c->error_limit = strtoul_or_return(buf); - - /* See count_io_errors() for why 88 */ - if (attr == &sysfs_io_error_halflife) --- -2.15.1 - diff --git 
a/for-next/v3/v3-0008-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch b/for-next/v3/v3-0008-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch deleted file mode 100644 index 7d51f20..0000000 --- a/for-next/v3/v3-0008-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch +++ /dev/null @@ -1,489 +0,0 @@ -From 8533ead6738e3ece36d111fb0aac7e37bc7e7a7c Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 14 Jan 2018 22:15:00 +0800 -Subject: [PATCH v3 08/13] bcache: add CACHE_SET_IO_DISABLE to struct cache_set - flags - -When too many I/Os failed on cache device, bch_cache_set_error() is called -in the error handling code path to retire whole problematic cache set. If -new I/O requests continue to come and take refcount dc->count, the cache -set won't be retired immediately, this is a problem. - -Further more, there are several kernel thread and self-armed kernel work -may still running after bch_cache_set_error() is called. It needs to wait -quite a while for them to stop, or they won't stop at all. They also -prevent the cache set from being retired. - -The solution in this patch is, to add per cache set flag to disable I/O -request on this cache and all attached backing devices. Then new coming I/O -requests can be rejected in *_make_request() before taking refcount, kernel -threads and self-armed kernel worker can stop very fast when flags bit -CACHE_SET_IO_DISABLE is set. - -Because bcache also do internal I/Os for writeback, garbage collection, -bucket allocation, journaling, this kind of I/O should be disabled after -bch_cache_set_error() is called. So closure_bio_submit() is modified to -check whether CACHE_SET_IO_DISABLE is set on cache_set->flags. If set, -closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and -return, generic_make_request() won't be called. - -A sysfs interface is also added to set or clear CACHE_SET_IO_DISABLE bit -from cache_set->flags, to disable or enable cache set I/O for debugging. 
It -is helpful to trigger more corner case issues for failed cache device. - -Changelog -v2, more changes by previous review, -- Use CACHE_SET_IO_DISABLE of cache_set->flags, suggested by Junhui. -- Check CACHE_SET_IO_DISABLE in bch_btree_gc() to stop a while-loop, this - is reported and inspired from origal patch of Pavel Vazharov. -v1, initial version. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Pavel Vazharov <freakpv@gmail.com> ---- - drivers/md/bcache/alloc.c | 3 ++- - drivers/md/bcache/bcache.h | 18 ++++++++++++++++++ - drivers/md/bcache/btree.c | 10 +++++++--- - drivers/md/bcache/io.c | 2 +- - drivers/md/bcache/journal.c | 4 ++-- - drivers/md/bcache/request.c | 26 +++++++++++++++++++------- - drivers/md/bcache/super.c | 6 +++++- - drivers/md/bcache/sysfs.c | 20 ++++++++++++++++++++ - drivers/md/bcache/util.h | 6 ------ - drivers/md/bcache/writeback.c | 35 +++++++++++++++++++++++++++-------- - 10 files changed, 101 insertions(+), 29 deletions(-) - -diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c -index 458e1d38577d..004cc3cc6123 100644 ---- a/drivers/md/bcache/alloc.c -+++ b/drivers/md/bcache/alloc.c -@@ -287,7 +287,8 @@ do { \ - break; \ - \ - mutex_unlock(&(ca)->set->bucket_lock); \ -- if (kthread_should_stop()) { \ -+ if (kthread_should_stop() || \ -+ test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) { \ - set_current_state(TASK_RUNNING); \ - return 0; \ - } \ -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 7d7512fa4f09..c41736960045 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -475,10 +475,15 @@ struct gc_stat { - * - * CACHE_SET_RUNNING means all cache devices have been registered and journal - * replay is complete. 
-+ * -+ * CACHE_SET_IO_DISABLE is set when bcache is stopping the whold cache set, all -+ * external and internal I/O should be denied when this flag is set. -+ * - */ - #define CACHE_SET_UNREGISTERING 0 - #define CACHE_SET_STOPPING 1 - #define CACHE_SET_RUNNING 2 -+#define CACHE_SET_IO_DISABLE 4 - - struct cache_set { - struct closure cl; -@@ -862,6 +867,19 @@ static inline void wake_up_allocators(struct cache_set *c) - wake_up_process(ca->alloc_thread); - } - -+static inline void closure_bio_submit(struct cache_set *c, -+ struct bio *bio, -+ struct closure *cl) -+{ -+ closure_get(cl); -+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return; -+ } -+ generic_make_request(bio); -+} -+ - /* Forward declarations */ - - void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index bf3a48aa9a9a..0a0bc63011b4 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ -1744,6 +1744,7 @@ static void bch_btree_gc(struct cache_set *c) - - btree_gc_start(c); - -+ /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ - do { - ret = btree_root(gc_root, c, &op, &writes, &stats); - closure_sync(&writes); -@@ -1751,7 +1752,7 @@ static void bch_btree_gc(struct cache_set *c) - - if (ret && ret != -EAGAIN) - pr_warn("gc failed!"); -- } while (ret); -+ } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); - - bch_btree_gc_finish(c); - wake_up_allocators(c); -@@ -1789,9 +1790,12 @@ static int bch_gc_thread(void *arg) - - while (1) { - wait_event_interruptible(c->gc_wait, -- kthread_should_stop() || gc_should_run(c)); -+ kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags) || -+ gc_should_run(c)); - -- if (kthread_should_stop()) -+ if (kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) - break; - - set_gc_sectors(c); -diff --git a/drivers/md/bcache/io.c 
b/drivers/md/bcache/io.c -index a783c5a41ff1..8013ecbcdbda 100644 ---- a/drivers/md/bcache/io.c -+++ b/drivers/md/bcache/io.c -@@ -38,7 +38,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) - bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); - - b->submit_time_us = local_clock_us(); -- closure_bio_submit(bio, bio->bi_private); -+ closure_bio_submit(c, bio, bio->bi_private); - } - - void bch_submit_bbio(struct bio *bio, struct cache_set *c, -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index a87165c1d8e5..979873641030 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -62,7 +62,7 @@ reread: left = ca->sb.bucket_size - offset; - bio_set_op_attrs(bio, REQ_OP_READ, 0); - bch_bio_map(bio, data); - -- closure_bio_submit(bio, &cl); -+ closure_bio_submit(ca->set, bio, &cl); - closure_sync(&cl); - - /* This function could be simpler now since we no longer write -@@ -653,7 +653,7 @@ static void journal_write_unlocked(struct closure *cl) - spin_unlock(&c->journal.lock); - - while ((bio = bio_list_pop(&list))) -- closure_bio_submit(bio, cl); -+ closure_bio_submit(c, bio, cl); - - continue_at(cl, journal_write_done, NULL); - } -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 1a46b41dac70..02296bda6384 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -747,7 +747,7 @@ static void cached_dev_read_error(struct closure *cl) - - /* XXX: invalidate cache */ - -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } - - continue_at(cl, cached_dev_cache_miss_done, NULL); -@@ -872,7 +872,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - s->cache_miss = miss; - s->iop.bio = cache_bio; - bio_get(cache_bio); -- closure_bio_submit(cache_bio, &s->cl); -+ closure_bio_submit(s->iop.c, cache_bio, &s->cl); - - return ret; - out_put: -@@ -880,7 +880,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search 
*s, - out_submit: - miss->bi_end_io = request_endio; - miss->bi_private = &s->cl; -- closure_bio_submit(miss, &s->cl); -+ closure_bio_submit(s->iop.c, miss, &s->cl); - return ret; - } - -@@ -945,7 +945,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - - if ((bio_op(bio) != REQ_OP_DISCARD) || - blk_queue_discard(bdev_get_queue(dc->bdev))) -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } else if (s->iop.writeback) { - bch_writeback_add(dc); - s->iop.bio = bio; -@@ -960,12 +960,12 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - flush->bi_private = cl; - flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - -- closure_bio_submit(flush, cl); -+ closure_bio_submit(s->iop.c, flush, cl); - } - } else { - s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); - -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } - - closure_call(&s->iop.cl, bch_data_insert, NULL, cl); -@@ -981,7 +981,7 @@ static void cached_dev_nodata(struct closure *cl) - bch_journal_meta(s->iop.c, cl); - - /* If it's a flush, we send the flush to the backing device too */ -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - - continue_at(cl, cached_dev_bio_complete, NULL); - } -@@ -996,6 +996,12 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - int rw = bio_data_dir(bio); - -+ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return BLK_QC_T_NONE; -+ } -+ - atomic_set(&dc->backing_idle, 0); - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - -@@ -1112,6 +1118,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, - struct bcache_device *d = bio->bi_disk->private_data; - int rw = bio_data_dir(bio); - -+ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ 
bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return BLK_QC_T_NONE; -+ } -+ - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - - s = search_alloc(bio, d); -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index a373648b5d4b..4204d75aee7b 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -521,7 +521,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, - bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags); - bch_bio_map(bio, ca->disk_buckets); - -- closure_bio_submit(bio, &ca->prio); -+ closure_bio_submit(ca->set, bio, &ca->prio); - closure_sync(cl); - } - -@@ -1349,6 +1349,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) - test_bit(CACHE_SET_STOPPING, &c->flags)) - return false; - -+ if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) -+ pr_warn("bcache: CACHE_SET_IO_DISABLE already set"); -+ - /* XXX: we can be called from atomic context - acquire_console_sem(); - */ -@@ -1584,6 +1587,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) - c->congested_read_threshold_us = 2000; - c->congested_write_threshold_us = 20000; - c->error_limit = DEFAULT_IO_ERROR_LIMIT; -+ WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); - - return c; - err: -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index ba62e987b503..afb051bcfca1 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -92,6 +92,7 @@ read_attribute(partial_stripes_expensive); - - rw_attribute(synchronous); - rw_attribute(journal_delay_ms); -+rw_attribute(io_disable); - rw_attribute(discard); - rw_attribute(running); - rw_attribute(label); -@@ -577,6 +578,8 @@ SHOW(__bch_cache_set) - sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); - sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); - sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); -+ sysfs_printf(io_disable, "%i", -+ test_bit(CACHE_SET_IO_DISABLE, 
&c->flags)); - - if (attr == &sysfs_bset_tree_stats) - return bch_bset_print_stats(c, buf); -@@ -666,6 +669,22 @@ STORE(__bch_cache_set) - if (attr == &sysfs_io_error_halflife) - c->error_decay = strtoul_or_return(buf) / 88; - -+ if (attr == &sysfs_io_disable) { -+ int v = strtoul_or_return(buf); -+ -+ if (v) { -+ if (test_and_set_bit(CACHE_SET_IO_DISABLE, -+ &c->flags)) -+ pr_warn("bcache: CACHE_SET_IO_DISABLE" -+ " already set"); -+ } else { -+ if (!test_and_clear_bit(CACHE_SET_IO_DISABLE, -+ &c->flags)) -+ pr_warn("bcache: CACHE_SET_IO_DISABLE" -+ " already cleared"); -+ } -+ } -+ - sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); - sysfs_strtoul(verify, c->verify); - sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); -@@ -748,6 +767,7 @@ static struct attribute *bch_cache_set_internal_files[] = { - &sysfs_gc_always_rewrite, - &sysfs_btree_shrinker_disabled, - &sysfs_copy_gc_enabled, -+ &sysfs_io_disable, - NULL - }; - KTYPE(bch_cache_set_internal); -diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h -index 4df4c5c1cab2..7944eea54fa9 100644 ---- a/drivers/md/bcache/util.h -+++ b/drivers/md/bcache/util.h -@@ -565,12 +565,6 @@ static inline sector_t bdev_sectors(struct block_device *bdev) - return bdev->bd_inode->i_size >> 9; - } - --#define closure_bio_submit(bio, cl) \ --do { \ -- closure_get(cl); \ -- generic_make_request(bio); \ --} while (0) -- - uint64_t bch_crc64_update(uint64_t, const void *, size_t); - uint64_t bch_crc64(const void *, size_t); - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 8f98ef1038d3..3d7d8452e0de 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -114,6 +114,7 @@ static void update_writeback_rate(struct work_struct *work) - struct cached_dev *dc = container_of(to_delayed_work(work), - struct cached_dev, - writeback_rate_update); -+ struct cache_set *c = dc->disk.c; - - /* - * should check BCACHE_DEV_RATE_DW_RUNNING before calling -@@ 
-123,7 +124,12 @@ static void update_writeback_rate(struct work_struct *work) - /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); - -- if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ /* -+ * CACHE_SET_IO_DISABLE might be set via sysfs interface, -+ * check it here too. -+ */ -+ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); - /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); -@@ -138,7 +144,12 @@ static void update_writeback_rate(struct work_struct *work) - - up_read(&dc->writeback_lock); - -- if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ /* -+ * CACHE_SET_IO_DISABLE might be set via sysfs interface, -+ * check it here too. -+ */ -+ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - } -@@ -278,7 +289,7 @@ static void write_dirty(struct closure *cl) - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; - -- closure_bio_submit(&io->bio, cl); -+ closure_bio_submit(io->dc->disk.c, &io->bio, cl); - } - - atomic_set(&dc->writeback_sequence_next, next_sequence); -@@ -304,7 +315,7 @@ static void read_dirty_submit(struct closure *cl) - { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); - -- closure_bio_submit(&io->bio, cl); -+ closure_bio_submit(io->dc->disk.c, &io->bio, cl); - - continue_at(cl, write_dirty, io->dc->writeback_write_wq); - } -@@ -330,7 +341,9 @@ static void read_dirty(struct cached_dev *dc) - - next = bch_keybuf_next(&dc->writeback_keys); - -- while (!kthread_should_stop() && next) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && -+ next) { - size = 0; - nk = 0; - -@@ -427,7 +440,9 @@ static void read_dirty(struct cached_dev *dc) - } - } - -- 
while (!kthread_should_stop() && delay) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && -+ delay) { - schedule_timeout_interruptible(delay); - delay = writeback_delay(dc, 0); - } -@@ -583,11 +598,13 @@ static bool refill_dirty(struct cached_dev *dc) - static int bch_writeback_thread(void *arg) - { - struct cached_dev *dc = arg; -+ struct cache_set *c = dc->disk.c; - bool searched_full_index; - - bch_ratelimit_reset(&dc->writeback_rate); - -- while (!kthread_should_stop()) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - down_write(&dc->writeback_lock); - set_current_state(TASK_INTERRUPTIBLE); - /* -@@ -601,7 +618,8 @@ static int bch_writeback_thread(void *arg) - (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { - up_write(&dc->writeback_lock); - -- if (kthread_should_stop()) { -+ if (kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - set_current_state(TASK_RUNNING); - break; - } -@@ -637,6 +655,7 @@ static int bch_writeback_thread(void *arg) - - while (delay && - !kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags) && - !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) - delay = schedule_timeout_interruptible(delay); - --- -2.15.1 - diff --git a/for-next/v3/v3-0009-bcache-stop-all-attached-bcache-devices-for-a-ret.patch b/for-next/v3/v3-0009-bcache-stop-all-attached-bcache-devices-for-a-ret.patch deleted file mode 100644 index 0246cef..0000000 --- a/for-next/v3/v3-0009-bcache-stop-all-attached-bcache-devices-for-a-ret.patch +++ /dev/null @@ -1,67 +0,0 @@ -From c5e03551019bb14ac40adf1b9e52bc6430c8659f Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 10 Jan 2018 00:26:32 +0800 -Subject: [PATCH v3 09/13] bcache: stop all attached bcache devices for a - retired cache set - -When there are too many I/O errors on cache device, current bcache code -will retire the whole cache set, and detach all bcache devices. 
But the -detached bcache devices are not stopped, which is problematic when bcache -is in writeback mode. - -If the retired cache set has dirty data of backing devices, continue -writing to bcache device will write to backing device directly. If the -LBA of write request has a dirty version cached on cache device, next time -when the cache device is re-registered and backing device re-attached to -it again, the stale dirty data on cache device will be written to backing -device, and overwrite latest directly written data. This situation causes -a quiet data corruption. - -This patch checks whether cache_set->io_disable is true in -__cache_set_unregister(). If cache_set->io_disable is true, it means cache -set is unregistering by too many I/O errors, then all attached bcache -devices will be stopped as well. If cache_set->io_disable is not true, it -means __cache_set_unregister() is triggered by writing 1 to sysfs file -/sys/fs/bcache/<UUID>/bcache/stop. This is an exception because users do -it explicitly, this patch keeps existing behavior and does not stop any -bcache device. - -Even if the failed cache device has no dirty data, stopping bcache device is -still a desired behavior by many Ceph and database users. Then their -application will report I/O errors due to disappeared bcache device, and -operation people will know the cache device is broken or disconnected. - -Changelog: -v2: add Reviewed-by from Hannes. -v1: initial version for review. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> ---- - drivers/md/bcache/super.c | 8 ++++++++ - 1 file changed, 8 insertions(+) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 4204d75aee7b..97e3bb8e1aee 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1478,6 +1478,14 @@ static void __cache_set_unregister(struct closure *cl) - dc = container_of(c->devices[i], - struct cached_dev, disk); - bch_cached_dev_detach(dc); -+ /* -+ * If we come here by too many I/O errors, -+ * bcache device should be stopped too, to -+ * keep data consistency on cache and -+ * backing devices. -+ */ -+ if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) -+ bcache_device_stop(c->devices[i]); - } else { - bcache_device_stop(c->devices[i]); - } --- -2.15.1 - diff --git a/for-next/v3/v3-0010-bcache-fix-inaccurate-io-state-for-detached-bcach.patch b/for-next/v3/v3-0010-bcache-fix-inaccurate-io-state-for-detached-bcach.patch deleted file mode 100644 index c42c832..0000000 --- a/for-next/v3/v3-0010-bcache-fix-inaccurate-io-state-for-detached-bcach.patch +++ /dev/null @@ -1,118 +0,0 @@ -From 9d7ba78bd2b2b109ff1bf3eac21e962e183175b3 Mon Sep 17 00:00:00 2001 -From: Tang Junhui <tang.junhui@zte.com.cn> -Date: Tue, 9 Jan 2018 10:27:11 +0800 -Subject: [PATCH v3 10/13] bcache: fix inaccurate io state for detached bcache - devices - -When we run IO in a detached device, and run iostat to shows IO status, -normally it will show like bellow (Omitted some fields): -Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util -sdd ... 15.89 0.53 1.82 0.20 2.23 1.81 52.30 -bcache0 ... 15.89 115.42 0.00 0.00 0.00 2.40 69.60 -but after IO stopped, there are still very big avgqu-sz and %util -values as bellow: -Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util -bcache0 ... 
0 5326.32 0.00 0.00 0.00 0.00 100.10 - -The reason for this issue is that, only generic_start_io_acct() called -and no generic_end_io_acct() called for detached device in -cached_dev_make_request(). See the code: -//start generic_start_io_acct() -generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); -if (cached_dev_get(dc)) { - //will callback generic_end_io_acct() -} -else { - //will not call generic_end_io_acct() -} - -This patch calls generic_end_io_acct() in the end of IO for detached -devices, so we can show IO state correctly. - -(Modified to use GFP_NOIO in kzalloc() by Coly Li) - -Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> -Reviewed-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/request.c | 58 +++++++++++++++++++++++++++++++++++++++------ - 1 file changed, 51 insertions(+), 7 deletions(-) - -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 02296bda6384..e09c5ae745be 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -986,6 +986,55 @@ static void cached_dev_nodata(struct closure *cl) - continue_at(cl, cached_dev_bio_complete, NULL); - } - -+struct detached_dev_io_private { -+ struct bcache_device *d; -+ unsigned long start_time; -+ bio_end_io_t *bi_end_io; -+ void *bi_private; -+}; -+ -+static void detatched_dev_end_io(struct bio *bio) -+{ -+ struct detached_dev_io_private *ddip; -+ -+ ddip = bio->bi_private; -+ bio->bi_end_io = ddip->bi_end_io; -+ bio->bi_private = ddip->bi_private; -+ -+ generic_end_io_acct(ddip->d->disk->queue, -+ bio_data_dir(bio), -+ &ddip->d->disk->part0, ddip->start_time); -+ -+ kfree(ddip); -+ -+ bio->bi_end_io(bio); -+} -+ -+static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) -+{ -+ struct detached_dev_io_private *ddip; -+ struct cached_dev *dc = container_of(d, struct cached_dev, disk); -+ -+ /* -+ * no need to call closure_get(&dc->disk.cl), -+ * because upper layer had already opened bcache device, -+ * which would call 
closure_get(&dc->disk.cl) -+ */ -+ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); -+ ddip->d = d; -+ ddip->start_time = jiffies; -+ ddip->bi_end_io = bio->bi_end_io; -+ ddip->bi_private = bio->bi_private; -+ bio->bi_end_io = detatched_dev_end_io; -+ bio->bi_private = ddip; -+ -+ if ((bio_op(bio) == REQ_OP_DISCARD) && -+ !blk_queue_discard(bdev_get_queue(dc->bdev))) -+ bio->bi_end_io(bio); -+ else -+ generic_make_request(bio); -+} -+ - /* Cached devices - read & write stuff */ - - static blk_qc_t cached_dev_make_request(struct request_queue *q, -@@ -1028,13 +1077,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - else - cached_dev_read(dc, s); - } -- } else { -- if ((bio_op(bio) == REQ_OP_DISCARD) && -- !blk_queue_discard(bdev_get_queue(dc->bdev))) -- bio_endio(bio); -- else -- generic_make_request(bio); -- } -+ } else -+ detached_dev_do_request(d, bio); - - return BLK_QC_T_NONE; - } --- -2.15.1 - diff --git a/for-next/v3/v3-0011-bcache-add-backing_request_endio-for-bi_end_io-of.patch b/for-next/v3/v3-0011-bcache-add-backing_request_endio-for-bi_end_io-of.patch deleted file mode 100644 index a151def..0000000 --- a/for-next/v3/v3-0011-bcache-add-backing_request_endio-for-bi_end_io-of.patch +++ /dev/null @@ -1,251 +0,0 @@ -From 840af1de0d2ba8f0f6fd148574d3c0a64c63943e Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 10 Jan 2018 21:01:48 +0800 -Subject: [PATCH v3 11/13] bcache: add backing_request_endio() for bi_end_io of - attached backing device I/O - -In order to catch I/O error of backing device, a separate bi_end_io -call back is required. Then a per backing device counter can record I/O -errors number and retire the backing device if the counter reaches a -per backing device I/O error limit. - -This patch adds backing_request_endio() to bcache backing device I/O code -path, this is a preparation for further complicated backing device failure -handling. 
So far there is no real code logic change, I make this change a -separate patch to make sure it is stable and reliable for further work. - -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> ---- - drivers/md/bcache/request.c | 95 +++++++++++++++++++++++++++++++++++-------- - drivers/md/bcache/super.c | 1 + - drivers/md/bcache/writeback.c | 1 + - 3 files changed, 81 insertions(+), 16 deletions(-) - -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index e09c5ae745be..ad4cf71f7eab 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -139,6 +139,7 @@ static void bch_data_invalidate(struct closure *cl) - } - - op->insert_data_done = true; -+ /* get in bch_data_insert() */ - bio_put(bio); - out: - continue_at(cl, bch_data_insert_keys, op->wq); -@@ -630,6 +631,38 @@ static void request_endio(struct bio *bio) - closure_put(cl); - } - -+static void backing_request_endio(struct bio *bio) -+{ -+ struct closure *cl = bio->bi_private; -+ -+ if (bio->bi_status) { -+ struct search *s = container_of(cl, struct search, cl); -+ /* -+ * If a bio has REQ_PREFLUSH for writeback mode, it is -+ * speically assembled in cached_dev_write() for a non-zero -+ * write request which has REQ_PREFLUSH. we don't set -+ * s->iop.status by this failure, the status will be decided -+ * by result of bch_data_insert() operation. 
-+ */ -+ if (unlikely(s->iop.writeback && -+ bio->bi_opf & REQ_PREFLUSH)) { -+ char buf[BDEVNAME_SIZE]; -+ -+ bio_devname(bio, buf); -+ pr_err("Can't flush %s: returned bi_status %i", -+ buf, bio->bi_status); -+ } else { -+ /* set to orig_bio->bi_status in bio_complete() */ -+ s->iop.status = bio->bi_status; -+ } -+ s->recoverable = false; -+ /* should count I/O error for backing device here */ -+ } -+ -+ bio_put(bio); -+ closure_put(cl); -+} -+ - static void bio_complete(struct search *s) - { - if (s->orig_bio) { -@@ -644,13 +677,21 @@ static void bio_complete(struct search *s) - } - } - --static void do_bio_hook(struct search *s, struct bio *orig_bio) -+static void do_bio_hook(struct search *s, -+ struct bio *orig_bio, -+ bio_end_io_t *end_io_fn) - { - struct bio *bio = &s->bio.bio; - - bio_init(bio, NULL, 0); - __bio_clone_fast(bio, orig_bio); -- bio->bi_end_io = request_endio; -+ /* -+ * bi_end_io can be set separately somewhere else, e.g. the -+ * variants in, -+ * - cache_bio->bi_end_io from cached_dev_cache_miss() -+ * - n->bi_end_io from cache_lookup_fn() -+ */ -+ bio->bi_end_io = end_io_fn; - bio->bi_private = &s->cl; - - bio_cnt_set(bio, 3); -@@ -676,7 +717,7 @@ static inline struct search *search_alloc(struct bio *bio, - s = mempool_alloc(d->c->search, GFP_NOIO); - - closure_init(&s->cl, NULL); -- do_bio_hook(s, bio); -+ do_bio_hook(s, bio, request_endio); - - s->orig_bio = bio; - s->cache_miss = NULL; -@@ -743,10 +784,11 @@ static void cached_dev_read_error(struct closure *cl) - trace_bcache_read_retry(s->orig_bio); - - s->iop.status = 0; -- do_bio_hook(s, s->orig_bio); -+ do_bio_hook(s, s->orig_bio, backing_request_endio); - - /* XXX: invalidate cache */ - -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, bio, cl); - } - -@@ -859,7 +901,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - bio_copy_dev(cache_bio, miss); - cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; - -- cache_bio->bi_end_io = 
request_endio; -+ cache_bio->bi_end_io = backing_request_endio; - cache_bio->bi_private = &s->cl; - - bch_bio_map(cache_bio, NULL); -@@ -872,14 +914,16 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - s->cache_miss = miss; - s->iop.bio = cache_bio; - bio_get(cache_bio); -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, cache_bio, &s->cl); - - return ret; - out_put: - bio_put(cache_bio); - out_submit: -- miss->bi_end_io = request_endio; -+ miss->bi_end_io = backing_request_endio; - miss->bi_private = &s->cl; -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, miss, &s->cl); - return ret; - } -@@ -943,31 +987,48 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - s->iop.bio = s->orig_bio; - bio_get(s->iop.bio); - -- if ((bio_op(bio) != REQ_OP_DISCARD) || -- blk_queue_discard(bdev_get_queue(dc->bdev))) -- closure_bio_submit(s->iop.c, bio, cl); -+ if (bio_op(bio) == REQ_OP_DISCARD && -+ !blk_queue_discard(bdev_get_queue(dc->bdev))) -+ goto insert_data; -+ -+ /* I/O request sent to backing device */ -+ bio->bi_end_io = backing_request_endio; -+ closure_bio_submit(s->iop.c, bio, cl); -+ - } else if (s->iop.writeback) { - bch_writeback_add(dc); - s->iop.bio = bio; - - if (bio->bi_opf & REQ_PREFLUSH) { -- /* Also need to send a flush to the backing device */ -- struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, -- dc->disk.bio_split); -- -+ /* -+ * Also need to send a flush to the backing -+ * device, if failed on backing device. 
-+ */ -+ struct bio *flush; -+ -+ flush = bio_alloc_bioset(GFP_NOIO, 0, -+ dc->disk.bio_split); -+ if (!flush) { -+ s->iop.status = BLK_STS_RESOURCE; -+ goto insert_data; -+ } - bio_copy_dev(flush, bio); -- flush->bi_end_io = request_endio; -+ flush->bi_end_io = backing_request_endio; - flush->bi_private = cl; - flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; -- -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, flush, cl); - } -+ bch_writeback_add(dc); -+ - } else { - s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); -- -+ /* I/O request sent to backing device */ -+ bio->bi_end_io = backing_request_endio; - closure_bio_submit(s->iop.c, bio, cl); - } - -+insert_data: - closure_call(&s->iop.cl, bch_data_insert, NULL, cl); - continue_at(cl, cached_dev_write_complete, NULL); - } -@@ -981,6 +1042,7 @@ static void cached_dev_nodata(struct closure *cl) - bch_journal_meta(s->iop.c, cl); - - /* If it's a flush, we send the flush to the backing device too */ -+ bio->bi_end_io = backing_request_endio; - closure_bio_submit(s->iop.c, bio, cl); - - continue_at(cl, cached_dev_bio_complete, NULL); -@@ -1078,6 +1140,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - cached_dev_read(dc, s); - } - } else -+ /* I/O request sent to backing device */ - detached_dev_do_request(d, bio); - - return BLK_QC_T_NONE; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 97e3bb8e1aee..08a0b541a4da 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -265,6 +265,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) - bio->bi_private = dc; - - closure_get(cl); -+ /* I/O request sent to backing device */ - __write_super(&dc->sb, bio); - - closure_return_with_destructor(cl, bch_write_bdev_super_unlock); -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 3d7d8452e0de..4ebe0119ea7e 100644 ---- a/drivers/md/bcache/writeback.c -+++ 
b/drivers/md/bcache/writeback.c -@@ -289,6 +289,7 @@ static void write_dirty(struct closure *cl) - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; - -+ /* I/O request sent to backing device */ - closure_bio_submit(io->dc->disk.c, &io->bio, cl); - } - --- -2.15.1 - diff --git a/for-next/v3/v3-0012-bcache-add-io_disable-to-struct-cached_dev.patch b/for-next/v3/v3-0012-bcache-add-io_disable-to-struct-cached_dev.patch deleted file mode 100644 index bbb79ec..0000000 --- a/for-next/v3/v3-0012-bcache-add-io_disable-to-struct-cached_dev.patch +++ /dev/null @@ -1,232 +0,0 @@ -From 662e22f2afdb792c184fc82bd9f6515e4aa5eb0c Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 10 Jan 2018 21:33:45 +0800 -Subject: [PATCH v3 12/13] bcache: add io_disable to struct cached_dev - -If a bcache device is configured to writeback mode, current code does not -handle write I/O errors on backing devices properly. - -In writeback mode, write request is written to cache device, and -later flushed to backing device. If I/O failed when writing from -cache device to the backing device, bcache code just ignores the error and -upper layer code is NOT notified that the backing device is broken. - -This patch tries to handle backing device failure like how the cache device -failure is handled, -- Add an error counter 'io_errors' and error limit 'error_limit' in struct - cached_dev. Add another io_disable to struct cached_dev to disable I/Os - on the problematic backing device. -- When I/O error happens on backing device, increase io_errors counter. And - if io_errors reaches error_limit, set cached_dev->io_disable to true, and - stop the bcache device. - -The result is, if backing device is broken or disconnected, and I/O errors -reach its error limit, backing device will be disabled and the associated -bcache device will be removed from system. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 7 +++++++ - drivers/md/bcache/io.c | 14 ++++++++++++++ - drivers/md/bcache/request.c | 14 ++++++++++++-- - drivers/md/bcache/super.c | 22 ++++++++++++++++++++++ - drivers/md/bcache/sysfs.c | 15 ++++++++++++++- - 5 files changed, 69 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index c41736960045..5a811959392d 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -360,6 +360,7 @@ struct cached_dev { - unsigned sequential_cutoff; - unsigned readahead; - -+ unsigned io_disable:1; - unsigned verify:1; - unsigned bypass_torture_test:1; - -@@ -379,6 +380,10 @@ struct cached_dev { - unsigned writeback_rate_i_term_inverse; - unsigned writeback_rate_p_term_inverse; - unsigned writeback_rate_minimum; -+ -+#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 -+ atomic_t io_errors; -+ unsigned error_limit; - }; - - enum alloc_reserve { -@@ -882,6 +887,7 @@ static inline void closure_bio_submit(struct cache_set *c, - - /* Forward declarations */ - -+void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); - void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); - void bch_bbio_count_io_errors(struct cache_set *, struct bio *, - blk_status_t, const char *); -@@ -909,6 +915,7 @@ int bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, bool); - bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, - unsigned, unsigned, bool); -+bool bch_cached_dev_error(struct cached_dev *dc); - - __printf(2, 3) - bool bch_cache_set_error(struct cache_set *, const char *, ...); -diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c -index 8013ecbcdbda..7fac97ae036e 100644 ---- a/drivers/md/bcache/io.c -+++ b/drivers/md/bcache/io.c -@@ -50,6 +50,20 @@ void 
bch_submit_bbio(struct bio *bio, struct cache_set *c, - } - - /* IO errors */ -+void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) -+{ -+ char buf[BDEVNAME_SIZE]; -+ unsigned errors; -+ -+ WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); -+ -+ errors = atomic_add_return(1, &dc->io_errors); -+ if (errors < dc->error_limit) -+ pr_err("%s: IO error on backing device, unrecoverable", -+ bio_devname(bio, buf)); -+ else -+ bch_cached_dev_error(dc); -+} - - void bch_count_io_errors(struct cache *ca, - blk_status_t error, -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index ad4cf71f7eab..386b388ce296 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -637,6 +637,8 @@ static void backing_request_endio(struct bio *bio) - - if (bio->bi_status) { - struct search *s = container_of(cl, struct search, cl); -+ struct cached_dev *dc = container_of(s->d, -+ struct cached_dev, disk); - /* - * If a bio has REQ_PREFLUSH for writeback mode, it is - * speically assembled in cached_dev_write() for a non-zero -@@ -657,6 +659,7 @@ static void backing_request_endio(struct bio *bio) - } - s->recoverable = false; - /* should count I/O error for backing device here */ -+ bch_count_backing_io_errors(dc, bio); - } - - bio_put(bio); -@@ -1067,8 +1070,14 @@ static void detatched_dev_end_io(struct bio *bio) - bio_data_dir(bio), - &ddip->d->disk->part0, ddip->start_time); - -- kfree(ddip); -+ if (bio->bi_status) { -+ struct cached_dev *dc = container_of(ddip->d, -+ struct cached_dev, disk); -+ /* should count I/O error for backing device here */ -+ bch_count_backing_io_errors(dc, bio); -+ } - -+ kfree(ddip); - bio->bi_end_io(bio); - } - -@@ -1107,7 +1116,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - int rw = bio_data_dir(bio); - -- if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ if (unlikely((d->c && 
test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || -+ dc->io_disable)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - return BLK_QC_T_NONE; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 08a0b541a4da..14fce3623770 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1188,6 +1188,10 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) - max(dc->disk.disk->queue->backing_dev_info->ra_pages, - q->backing_dev_info->ra_pages); - -+ atomic_set(&dc->io_errors, 0); -+ dc->io_disable = false; -+ dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; -+ - bch_cached_dev_request_init(dc); - bch_cached_dev_writeback_init(dc); - return 0; -@@ -1339,6 +1343,24 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) - return flash_dev_run(c, u); - } - -+bool bch_cached_dev_error(struct cached_dev *dc) -+{ -+ char name[BDEVNAME_SIZE]; -+ -+ if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) -+ return false; -+ -+ dc->io_disable = true; -+ /* make others know io_disable is true earlier */ -+ smp_mb(); -+ -+ pr_err("bcache: stop %s: too many IO errors on backing device %s\n", -+ dc->disk.name, bdevname(dc->bdev, name)); -+ -+ bcache_device_stop(&dc->disk); -+ return true; -+} -+ - /* Cache set */ - - __printf(2, 3) -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index afb051bcfca1..7288927f2a47 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -131,7 +131,9 @@ SHOW(__bch_cached_dev) - var_print(writeback_delay); - var_print(writeback_percent); - sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); -- -+ sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); -+ sysfs_printf(io_error_limit, "%i", dc->error_limit); -+ sysfs_printf(io_disable, "%i", dc->io_disable); - var_print(writeback_rate_update_seconds); - var_print(writeback_rate_i_term_inverse); - var_print(writeback_rate_p_term_inverse); -@@ -223,6 +225,14 @@ STORE(__cached_dev) - 
d_strtoul(writeback_rate_i_term_inverse); - d_strtoul_nonzero(writeback_rate_p_term_inverse); - -+ sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); -+ -+ if (attr == &sysfs_io_disable) { -+ int v = strtoul_or_return(buf); -+ -+ dc->io_disable = v ? 1 : 0; -+ } -+ - d_strtoi_h(sequential_cutoff); - d_strtoi_h(readahead); - -@@ -330,6 +340,9 @@ static struct attribute *bch_cached_dev_files[] = { - &sysfs_writeback_rate_i_term_inverse, - &sysfs_writeback_rate_p_term_inverse, - &sysfs_writeback_rate_debug, -+ &sysfs_errors, -+ &sysfs_io_error_limit, -+ &sysfs_io_disable, - &sysfs_dirty_data, - &sysfs_stripe_size, - &sysfs_partial_stripes_expensive, --- -2.15.1 - diff --git a/for-next/v3/v3-0013-bcache-stop-bcache-device-when-backing-device-is-.patch b/for-next/v3/v3-0013-bcache-stop-bcache-device-when-backing-device-is-.patch deleted file mode 100644 index 6180bd1..0000000 --- a/for-next/v3/v3-0013-bcache-stop-bcache-device-when-backing-device-is-.patch +++ /dev/null @@ -1,148 +0,0 @@ -From 93be9a0e7f3112074702dd070c07818b2fe3d568 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 13 Jan 2018 17:31:44 +0800 -Subject: [PATCH v3 13/13] bcache: stop bcache device when backing device is - offline - -Currently bcache does not handle backing device failure, if backing -device is offline and disconnected from system, its bcache device can still -be accessible. If the bcache device is in writeback mode, I/O requests even -can success if the requests hit on cache device. That is to say, when and -how bcache handles offline backing device is undefined. - -This patch tries to handle backing device offline in a rather simple way, -- Add cached_dev->status_update_thread kernel thread to update backing - device status in every 1 second. -- Add cached_dev->offline_seconds to record how many seconds the backing - device is observed to be offline. 
If the backing device is offline for - BACKING_DEV_OFFLINE_TIMEOUT (5) seconds, set dc->io_disable to 1 and - call bcache_device_stop() to stop the bcache device which is linked to the - offline backing device. - -Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds, -its bcache device will be removed, then user space application writing on -it will get error immediately, and handle the device failure in time. - -This patch is quite simple, does not handle more complicated situations. -Once the bcache device is stopped, users need to recover the backing -device, register and attach it manually. - -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 2 ++ - drivers/md/bcache/super.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++ - 2 files changed, 57 insertions(+) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 5a811959392d..9eedb35d01bc 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -338,6 +338,7 @@ struct cached_dev { - - struct keybuf writeback_keys; - -+ struct task_struct *status_update_thread; - /* - * Order the write-half of writeback operations strongly in dispatch - * order. 
(Maintain LBA order; don't allow reads completing out of -@@ -384,6 +385,7 @@ struct cached_dev { - #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 - atomic_t io_errors; - unsigned error_limit; -+ unsigned offline_seconds; - }; - - enum alloc_reserve { -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 14fce3623770..85adf1e29d11 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -646,6 +646,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode, - unsigned int cmd, unsigned long arg) - { - struct bcache_device *d = b->bd_disk->private_data; -+ struct cached_dev *dc = container_of(d, struct cached_dev, disk); -+ -+ if (dc->io_disable) -+ return -EIO; -+ - return d->ioctl(d, mode, cmd, arg); - } - -@@ -856,6 +861,45 @@ static void calc_cached_dev_sectors(struct cache_set *c) - c->cached_dev_sectors = sectors; - } - -+#define BACKING_DEV_OFFLINE_TIMEOUT 5 -+static int cached_dev_status_update(void *arg) -+{ -+ struct cached_dev *dc = arg; -+ struct request_queue *q; -+ char buf[BDEVNAME_SIZE]; -+ -+ /* -+ * If this delayed worker is stopping outside, directly quit here. -+ * dc->io_disable might be set via sysfs interface, so check it -+ * here too. 
-+ */ -+ while (!kthread_should_stop() && !dc->io_disable) { -+ q = bdev_get_queue(dc->bdev); -+ if (blk_queue_dying(q)) -+ dc->offline_seconds++; -+ else -+ dc->offline_seconds = 0; -+ -+ if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) { -+ pr_err("%s: device offline for %d seconds", -+ bdevname(dc->bdev, buf), -+ BACKING_DEV_OFFLINE_TIMEOUT); -+ pr_err("%s: disable I/O request due to backing " -+ "device offline", dc->disk.name); -+ dc->io_disable = true; -+ /* let others know earlier that io_disable is true */ -+ smp_mb(); -+ bcache_device_stop(&dc->disk); -+ break; -+ } -+ -+ schedule_timeout_interruptible(HZ); -+ } -+ -+ dc->status_update_thread = NULL; -+ return 0; -+} -+ - void bch_cached_dev_run(struct cached_dev *dc) - { - struct bcache_device *d = &dc->disk; -@@ -898,6 +942,15 @@ void bch_cached_dev_run(struct cached_dev *dc) - if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || - sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) - pr_debug("error creating sysfs link"); -+ -+ dc->status_update_thread = kthread_run(cached_dev_status_update, -+ dc, -+ "bcache_status_update"); -+ if (IS_ERR(dc->status_update_thread)) { -+ pr_warn("bcache: failed to create bcache_status_update " -+ "kthread, continue to run without monitoring backing " -+ "device status"); -+ } - } - - /* -@@ -1118,6 +1171,8 @@ static void cached_dev_free(struct closure *cl) - kthread_stop(dc->writeback_thread); - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); -+ if (!IS_ERR_OR_NULL(dc->status_update_thread)) -+ kthread_stop(dc->status_update_thread); - - if (atomic_read(&dc->running)) - bd_unlink_disk_holder(dc->bdev, dc->disk.disk); --- -2.15.1 - diff --git a/for-next/v4/v4-0000-cover-letter.patch b/for-next/v4/v4-0000-cover-letter.patch deleted file mode 100644 index 0327afe..0000000 --- a/for-next/v4/v4-0000-cover-letter.patch +++ /dev/null @@ -1,91 +0,0 @@ -From 86e6c96037b81ca6d302e1e7d4342fd1decc8814 Mon Sep 17 
00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 27 Jan 2018 20:24:53 +0800 -Subject: [PATCH v4 00/13] bcache: device failure handling improvement - -Hi maintainers and folks, - -This patch set tries to improve bcache device failure handling, including -cache device and backing device failures. - -The basic idea to handle failed cache device is, -- Unregister cache set -- Detach all backing devices which are attached to this cache set -- Stop all the detached bcache devices (configurable) -- Stop all flash only volume on the cache set -The above process is named 'cache set retire' by me. The result of cache -set retire is, cache set and bcache devices are all removed, following -I/O requests will get failed immediately to notify upper layer or user -space code that the cache device is failed or disconnected. - -For failed backing device, there are two kinds of failures to handle, -- If device is disconnected, and kernel thread dc->status_update_thread - finds it is offline for BACKING_DEV_OFFLINE_TIMEOUT (5) seconds, the - kernel thread will set dc->io_disable and call bcache_device_stop() to - stop and remove the bcache device from system. -- If device is alive but returns too many I/O errors, after errors number - exceeds dc->error_limit, call bch_cached_dev_error() to set - dc->io_disable and stop bcache device. Then the broken backing device - and its bcache device will be removed from system. - -The v4 patch set combines two v3 patches into one, and adds one more patch -to permit users to explicitly avoid stopping attached bcache device from a -retiring cache set. This is a configurable option suggested by -Nix <nix@esperi.org.uk>. - -Some patches of this patch set are already in bcache-for-next and not -included here anymore. Most of the patches are reviewed by Hannes Reinecke -and Junhui Tang. 
There are still several patches that need to be reviewed, -- [PATCH v4 05/13] bcache: stop dc->writeback_rate_update properly -- [PATCH v4 13/13] bcache: add stop_attached_devs_on_fail to struct - cached_dev - -Any comment, question and review are warmly welcome. Thanks in advance. - -Changelog: -v4: add per-cached_dev option stop_attached_devs_on_fail to avoid stopping - attached bcache device from a retiring cache set. -v3: fix detach issue found in v2 patch set. -v2: fixes all problems found in v1 review. - add patches to handle backing device failure. - add one more patch to set writeback_rate_update_seconds range. - include a patch from Junhui Tang. -v1: the initial version, only handles cache device failure. - -Coly Li ---- - -Coly Li (12): - bcache: set writeback_rate_update_seconds in range [1, 60] seconds - bcache: properly set task state in bch_writeback_thread() - bcache: fix cached_dev->count usage for bch_cache_set_error() - bcache: quit dc->writeback_thread when BCACHE_DEV_DETACHING is set - bcache: stop dc->writeback_rate_update properly - bcache: set error_limit correctly - bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags - bcache: stop all attached bcache devices for a retired cache set - bcache: add backing_request_endio() for bi_end_io of attached backing - device I/O - bcache: add io_disable to struct cached_dev - bcache: stop bcache device when backing device is offline - bcache: add stop_attached_devs_on_fail to struct cached_dev - -Tang Junhui (1): - bcache: fix inaccurate io state for detached bcache devices - - drivers/md/bcache/alloc.c | 5 +- - drivers/md/bcache/bcache.h | 38 ++++++++- - drivers/md/bcache/btree.c | 10 ++- - drivers/md/bcache/io.c | 16 +++- - drivers/md/bcache/journal.c | 4 +- - drivers/md/bcache/request.c | 187 +++++++++++++++++++++++++++++++++++------- - drivers/md/bcache/super.c | 181 ++++++++++++++++++++++++++++++++++++---- - drivers/md/bcache/sysfs.c | 55 ++++++++++++- - drivers/md/bcache/util.h | 6 -- - 
drivers/md/bcache/writeback.c | 99 ++++++++++++++++++---- - drivers/md/bcache/writeback.h | 5 +- - 11 files changed, 522 insertions(+), 84 deletions(-) - --- -2.15.1 - diff --git a/for-next/v4/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch b/for-next/v4/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch deleted file mode 100644 index 51edd0b..0000000 --- a/for-next/v4/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch +++ /dev/null @@ -1,73 +0,0 @@ -From 387baf9326a1abdf2005447c5c2a24f37b6681c1 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 13 Jan 2018 15:11:03 +0800 -Subject: [PATCH v4 01/13] bcache: set writeback_rate_update_seconds in range - [1, 60] seconds - -dc->writeback_rate_update_seconds can be set via sysfs and its value can -be set to [1, ULONG_MAX]. It does not make sense to set such a large -value, 60 seconds is long enough value considering the default 5 seconds -works well for long time. - -Because dc->writeback_rate_update is a special delayed work, it re-arms -itself inside the delayed work routine update_writeback_rate(). When -stopping it by cancel_delayed_work_sync(), there should be a timeout to -wait and make sure the re-armed delayed work is stopped too. A small max -value of dc->writeback_rate_update_seconds is also helpful to decide a -reasonable small timeout. - -This patch limits sysfs interface to set dc->writeback_rate_update_seconds -in range of [1, 60] seconds, and replaces the hand-coded number by macros. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> ---- - drivers/md/bcache/sysfs.c | 3 +++ - drivers/md/bcache/writeback.c | 2 +- - drivers/md/bcache/writeback.h | 3 +++ - 3 files changed, 7 insertions(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index b4184092c727..a74a752c9e0f 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -215,6 +215,9 @@ STORE(__cached_dev) - sysfs_strtoul_clamp(writeback_rate, - dc->writeback_rate.rate, 1, INT_MAX); - -+ sysfs_strtoul_clamp(writeback_rate_update_seconds, -+ dc->writeback_rate_update_seconds, -+ 1, WRITEBACK_RATE_UPDATE_SECS_MAX); - d_strtoul_nonzero(writeback_rate_update_seconds); - d_strtoul(writeback_rate_i_term_inverse); - d_strtoul_nonzero(writeback_rate_p_term_inverse); -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 51306a19ab03..0ade883b6316 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -652,7 +652,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) - dc->writeback_rate.rate = 1024; - dc->writeback_rate_minimum = 8; - -- dc->writeback_rate_update_seconds = 5; -+ dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; - dc->writeback_rate_p_term_inverse = 40; - dc->writeback_rate_i_term_inverse = 10000; - -diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h -index 66f1c527fa24..587b25599856 100644 ---- a/drivers/md/bcache/writeback.h -+++ b/drivers/md/bcache/writeback.h -@@ -8,6 +8,9 @@ - #define MAX_WRITEBACKS_IN_PASS 5 - #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ - -+#define WRITEBACK_RATE_UPDATE_SECS_MAX 60 -+#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 -+ - /* - * 14 (16384ths) is chosen here as something that each backing device - * should be a reasonable fraction of the share, and not to blow up --- -2.15.1 - diff --git 
a/for-next/v4/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch b/for-next/v4/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch deleted file mode 100644 index 113dd97..0000000 --- a/for-next/v4/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch +++ /dev/null @@ -1,112 +0,0 @@ -From a979b8e27c45b69c2e1e2a5ef06257ca5fda4b66 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Fri, 26 Jan 2018 13:38:41 +0800 -Subject: [PATCH v4 02/13] bcache: properly set task state in - bch_writeback_thread() - -Kernel thread routine bch_writeback_thread() has the following code block, - -447 down_write(&dc->writeback_lock); -448~450 if (check conditions) { -451 up_write(&dc->writeback_lock); -452 set_current_state(TASK_INTERRUPTIBLE); -453 -454 if (kthread_should_stop()) -455 return 0; -456 -457 schedule(); -458 continue; -459 } - -If condition check is true, its task state is set to TASK_INTERRUPTIBLE -and call schedule() to wait for others to wake up it. - -There are 2 issues in current code, -1, Task state is set to TASK_INTERRUPTIBLE after the condition checks, if - another process changes the condition and call wake_up_process(dc-> - writeback_thread), then at line 452 task state is set back to - TASK_INTERRUPTIBLE, the writeback kernel thread will lose a chance to be - waken up. -2, At line 454 if kthread_should_stop() is true, writeback kernel thread - will return to kernel/kthread.c:kthread() with TASK_INTERRUPTIBLE and - call do_exit(). It is not good to enter do_exit() with task state - TASK_INTERRUPTIBLE, in following code path might_sleep() is called and a - warning message is reported by __might_sleep(): "WARNING: do not call - blocking ops when !TASK_RUNNING; state=1 set at [xxxx]". - -For the first issue, task state should be set before condition checks. 
-Indeed because dc->writeback_lock is required when modifying all the -conditions, calling set_current_state() inside code block where dc-> -writeback_lock is held is safe. But this is quite implicit, so I still move -set_current_state() before all the condition checks. - -For the second issue, frankly speaking it does not hurt when kernel thread -exits with TASK_INTERRUPTIBLE state, but this warning message scares users, -makes them feel there might be something risky with bcache and hurt their -data. Setting task state to TASK_RUNNING before returning fixes this -problem. - -In alloc.c:allocator_wait(), there is also a similar issue, and is also -fixed in this patch. - -Changelog: -v3: merge two similar fixes into one patch -v2: fix the race issue in v1 patch. -v1: initial buggy fix. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/alloc.c | 4 +++- - drivers/md/bcache/writeback.c | 7 +++++-- - 2 files changed, 8 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c -index 6cc6c0f9c3a9..458e1d38577d 100644 ---- a/drivers/md/bcache/alloc.c -+++ b/drivers/md/bcache/alloc.c -@@ -287,8 +287,10 @@ do { \ - break; \ - \ - mutex_unlock(&(ca)->set->bucket_lock); \ -- if (kthread_should_stop()) \ -+ if (kthread_should_stop()) { \ -+ set_current_state(TASK_RUNNING); \ - return 0; \ -+ } \ - \ - schedule(); \ - mutex_lock(&(ca)->set->bucket_lock); \ -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 0ade883b6316..f1d2fc15abcc 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -564,18 +564,21 @@ static int bch_writeback_thread(void *arg) - - while (!kthread_should_stop()) { - down_write(&dc->writeback_lock); -+ set_current_state(TASK_INTERRUPTIBLE); - if (!atomic_read(&dc->has_dirty) || - (!test_bit(BCACHE_DEV_DETACHING, 
&dc->disk.flags) && - !dc->writeback_running)) { - up_write(&dc->writeback_lock); -- set_current_state(TASK_INTERRUPTIBLE); - -- if (kthread_should_stop()) -+ if (kthread_should_stop()) { -+ set_current_state(TASK_RUNNING); - return 0; -+ } - - schedule(); - continue; - } -+ set_current_state(TASK_RUNNING); - - searched_full_index = refill_dirty(dc); - --- -2.15.1 - diff --git a/for-next/v4/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch b/for-next/v4/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch deleted file mode 100644 index f85123b..0000000 --- a/for-next/v4/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch +++ /dev/null @@ -1,178 +0,0 @@ -From 15d97588692d8ddd4b1d0c628494422f33dfd537 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 8 Jan 2018 23:05:58 +0800 -Subject: [PATCH v4 03/13] bcache: fix cached_dev->count usage for - bch_cache_set_error() - -When bcache metadata I/O fails, bcache will call bch_cache_set_error() -to retire the whole cache set. The expected behavior to retire a cache -set is to unregister the cache set, and unregister all backing device -attached to this cache set, then remove sysfs entries of the cache set -and all attached backing devices, finally release memory of structs -cache_set, cache, cached_dev and bcache_device. - -In my testing when journal I/O failure triggered by disconnected cache -device, sometimes the cache set cannot be retired, and its sysfs -entry /sys/fs/bcache/<uuid> still exits and the backing device also -references it. This is not expected behavior. 
- -When metadata I/O fails, the call sequence to retire whole cache set is, - bch_cache_set_error() - bch_cache_set_unregister() - bch_cache_set_stop() - __cache_set_unregister() <- called as callback by calling - closure_queue(&c->caching) - cache_set_flush() <- called as a callback when refcount - of cache_set->caching is 0 - cache_set_free() <- called as a callback when refcount - of cache_set->cl is 0 - bch_cache_set_release() <- called as a callback when refcount - of cache_set->kobj is 0 - -I find if kernel thread bch_writeback_thread() quits while-loop when -kthread_should_stop() is true and searched_full_index is false, closure -callback cache_set_flush() set by continue_at() will never be called. The -result is, bcache fails to retire whole cache set. - -cache_set_flush() will be called when refcount of closure c->caching is 0, -and in function bcache_device_detach() refcount of closure c->caching is -released to 0 by closure_put(). In metadata error code path, function -bcache_device_detach() is called by cached_dev_detach_finish(). This is a -callback routine being called when cached_dev->count is 0. This refcount -is decreased by cached_dev_put(). - -The above dependence indicates, cache_set_flush() will be called when -refcount of cache_set->cl is 0, and refcount of cache_set->cl to be 0 -when refcount of cached_dev->count is 0. - -The reason why sometimes cached_dev->count is not 0 (when metadata I/O fails -and bch_cache_set_error() called) is, in bch_writeback_thread(), refcount -of cached_dev is not decreased properly. - -In bch_writeback_thread(), cached_dev_put() is called only when -searched_full_index is true and cached_dev->writeback_keys is empty, a.k.a -there is no dirty data on cache. In most of run time it is correct, but -when bch_writeback_thread() quits the while-loop while cache is still -dirty, current code forgets to call cached_dev_put() before this kernel -thread exits. 
This is why sometimes cache_set_flush() is not executed and -cache set fails to be retired. - -The reason to call cached_dev_put() in bch_writeback_thread() is, when the -cache device changes from clean to dirty, cached_dev_get() is called, to -make sure during writeback operations both backing and cache devices -won't be released. - -Adding following code in bch_writeback_thread() does not work, - static int bch_writeback_thread(void *arg) - } - -+ if (atomic_read(&dc->has_dirty)) -+ cached_dev_put() -+ - return 0; - } -because writeback kernel thread can be woken up and start via sysfs entry: - echo 1 > /sys/block/bcache<N>/bcache/writeback_running -It is difficult to check whether backing device is dirty without race and -extra lock. So the above modification will introduce potential refcount -underflow in some conditions. - -The correct fix is, to take cached dev refcount when creating the kernel -thread, and put it before the kernel thread exits. Then bcache does not -need to take a cached dev refcount when cache turns from clean to dirty, -or to put a cached dev refcount when cache turns from dirty to clean. The -writeback kernel thread is always safe to reference data structure from -cache set, cache and cached device (because a refcount of cache device is -taken for it already), and no matter the kernel thread is stopped by I/O -errors or system reboot, cached_dev->count can always be used correctly. - -The patch is simple, but understanding how it works is quite complicated. - -Changelog: -v2: set dc->writeback_thread to NULL in this patch, as suggested by Hannes. -v1: initial version for review. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/super.c | 1 - - drivers/md/bcache/writeback.c | 11 ++++++++--- - drivers/md/bcache/writeback.h | 2 -- - 3 files changed, 8 insertions(+), 6 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 133b81225ea9..d14e09cce2f6 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1052,7 +1052,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) - if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - bch_sectors_dirty_init(&dc->disk); - atomic_set(&dc->has_dirty, 1); -- refcount_inc(&dc->count); - bch_writeback_queue(dc); - } - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index f1d2fc15abcc..b280c134dd4d 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -572,7 +572,7 @@ static int bch_writeback_thread(void *arg) - - if (kthread_should_stop()) { - set_current_state(TASK_RUNNING); -- return 0; -+ break; - } - - schedule(); -@@ -585,7 +585,6 @@ static int bch_writeback_thread(void *arg) - if (searched_full_index && - RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { - atomic_set(&dc->has_dirty, 0); -- cached_dev_put(dc); - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); - } -@@ -606,6 +605,9 @@ static int bch_writeback_thread(void *arg) - } - } - -+ dc->writeback_thread = NULL; -+ cached_dev_put(dc); -+ - return 0; - } - -@@ -669,10 +671,13 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) - if (!dc->writeback_write_wq) - return -ENOMEM; - -+ cached_dev_get(dc); - dc->writeback_thread = kthread_create(bch_writeback_thread, dc, - "bcache_writeback"); -- if (IS_ERR(dc->writeback_thread)) -+ if (IS_ERR(dc->writeback_thread)) { -+ cached_dev_put(dc); - return PTR_ERR(dc->writeback_thread); -+ } - - 
schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); -diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h -index 587b25599856..0bba8f1c6cdf 100644 ---- a/drivers/md/bcache/writeback.h -+++ b/drivers/md/bcache/writeback.h -@@ -105,8 +105,6 @@ static inline void bch_writeback_add(struct cached_dev *dc) - { - if (!atomic_read(&dc->has_dirty) && - !atomic_xchg(&dc->has_dirty, 1)) { -- refcount_inc(&dc->count); -- - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); - /* XXX: should do this synchronously */ --- -2.15.1 - diff --git a/for-next/v4/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch b/for-next/v4/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch deleted file mode 100644 index 349a3d1..0000000 --- a/for-next/v4/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch +++ /dev/null @@ -1,83 +0,0 @@ -From f958950022560d243ae2f77c76b5063a583a625c Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 14 Jan 2018 21:41:57 +0800 -Subject: [PATCH v4 04/13] bcache: quit dc->writeback_thread when - BCACHE_DEV_DETACHING is set - -In patch "bcache: fix cached_dev->count usage for bch_cache_set_error()", -cached_dev_get() is called when creating dc->writeback_thread, and -cached_dev_put() is called when exiting dc->writeback_thread. This -modification works well unless people detach the bcache device manually by - 'echo 1 > /sys/block/bcache<N>/bcache/detach' -Because this sysfs interface only calls bch_cached_dev_detach() which wakes -up dc->writeback_thread but does not stop it. The reason is, before patch -"bcache: fix cached_dev->count usage for bch_cache_set_error()", inside -bch_writeback_thread(), if cache is not dirty after writeback, -cached_dev_put() will be called here. And in cached_dev_make_request() when -a new write request makes cache from clean to dirty, cached_dev_get() will -be called there. 
Since we don't operate dc->count in these locations, -refcount d->count cannot be dropped after cache becomes clean, and -cached_dev_detach_finish() won't be called to detach bcache device. - -This patch fixes the issue by checking whether BCACHE_DEV_DETACHING is -set inside bch_writeback_thread(). If this bit is set and cache is clean -(no existing writeback_keys), break the while-loop, call cached_dev_put() -and quit the writeback thread. - -Please note if cache is still dirty, even BCACHE_DEV_DETACHING is set the -writeback thread should continue to perform writeback, this is the original -design of manually detach. - -I compose a separte patch because that patch "bcache: fix cached_dev->count -usage for bch_cache_set_error()" already gets a "Reviewed-by:" from Hannes -Reinecke. Also this fix is not trivial and good for a separate patch. - -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.com> -Cc: Huijun Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/writeback.c | 20 +++++++++++++++++--- - 1 file changed, 17 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index b280c134dd4d..4dbeaaa575bf 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -565,9 +565,15 @@ static int bch_writeback_thread(void *arg) - while (!kthread_should_stop()) { - down_write(&dc->writeback_lock); - set_current_state(TASK_INTERRUPTIBLE); -- if (!atomic_read(&dc->has_dirty) || -- (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && -- !dc->writeback_running)) { -+ /* -+ * If the bache device is detaching, skip here and continue -+ * to perform writeback. Otherwise, if no dirty data on cache, -+ * or there is dirty data on cache but writeback is disabled, -+ * the writeback thread should sleep here and wait for others -+ * to wake up it. 
-+ */ -+ if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && -+ (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { - up_write(&dc->writeback_lock); - - if (kthread_should_stop()) { -@@ -587,6 +593,14 @@ static int bch_writeback_thread(void *arg) - atomic_set(&dc->has_dirty, 0); - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); -+ /* -+ * If bcache device is detaching via sysfs interface, -+ * writeback thread should stop after there is no dirty -+ * data on cache. BCACHE_DEV_DETACHING flag is set in -+ * bch_cached_dev_detach(). -+ */ -+ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) -+ break; - } - - up_write(&dc->writeback_lock); --- -2.15.1 - diff --git a/for-next/v4/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch b/for-next/v4/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch deleted file mode 100644 index 2e6ce9b..0000000 --- a/for-next/v4/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch +++ /dev/null @@ -1,266 +0,0 @@ -From bd0fe247c2e49cb2e19edb4bf54e8670cb315eb3 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 13 Jan 2018 15:48:39 +0800 -Subject: [PATCH v4 05/13] bcache: stop dc->writeback_rate_update properly - -struct delayed_work writeback_rate_update in struct cache_dev is a delayed -worker to call function update_writeback_rate() in period (the interval is -defined by dc->writeback_rate_update_seconds). - -When a metadate I/O error happens on cache device, bcache error handling -routine bch_cache_set_error() will call bch_cache_set_unregister() to -retire whole cache set. On the unregister code path, this delayed work is -stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update). - -dc->writeback_rate_update is a special delayed work from others in bcache. -In its routine update_writeback_rate(), this delayed work is re-armed -itself. 
That means when cancel_delayed_work_sync() returns, this delayed -work can still be executed after several seconds defined by -dc->writeback_rate_update_seconds. - -The problem is, after cancel_delayed_work_sync() returns, the cache set -unregister code path will continue and release memory of struct cache_set. -Then the delayed work is scheduled to run, __update_writeback_rate() -will reference the already released cache_set memory, and trigger a NULL -pointer dereference fault. - -This patch introduces two more bcache device flags, -- BCACHE_DEV_WB_RUNNING - bit set: bcache device is in writeback mode and running, it is OK for - dc->writeback_rate_update to re-arm itself. - bit clear: bcache device is trying to stop dc->writeback_rate_update, - this delayed work should not re-arm itself and quit. -- BCACHE_DEV_RATE_DW_RUNNING - bit set: routine update_writeback_rate() is executing. - bit clear: routine update_writeback_rate() quits. - -This patch also adds a function cancel_writeback_rate_update_dwork() to -wait for dc->writeback_rate_update to quit before canceling it by calling -cancel_delayed_work_sync(). In order to avoid a deadlock by unexpected -quit dc->writeback_rate_update, after time_out seconds this function will -give up and continue to call cancel_delayed_work_sync(). - -And here I explain how this patch stops self re-armed delayed work properly -with the above stuffs. - -update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning -and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling -cancel_writeback_rate_update_dwork() clear flag BCACHE_DEV_WB_RUNNING. - -Before calling cancel_delayed_work_sync() wait until flag -BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling -cancel_delayed_work_sync(), dc->writeback_rate_update must be already re- -armed, or quit by seeing BCACHE_DEV_WB_RUNNING cleared. In both cases -delayed work routine update_writeback_rate() won't be executed after -cancel_delayed_work_sync() returns. 
- -Inside update_writeback_rate() before calling schedule_delayed_work(), flag -BCACHE_DEV_WB_RUNNING is checked before. If this flag is cleared, it means -someone is about to stop the delayed work. Because flag -BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync() -has to wait for this flag to be cleared, we don't need to worry about race -condition here. - -If update_writeback_rate() is scheduled to run after checking -BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync() -in cancel_writeback_rate_update_dwork(), it is also safe. Because at this -moment BCACHE_DEV_WB_RUNNING is cleared with memory barrier. As I mentioned -previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear -and quit immediately. - -Because there are more dependences inside update_writeback_rate() to struct -cache_set memory, dc->writeback_rate_update is not a simple self re-arm -delayed work. After trying many different methods (e.g. hold dc->count, or -use locks), this is the only way I can find which works to properly stop -dc->writeback_rate_update delayed work. - -Changelog: -v2: Try to fix the race issue which is pointed out by Junhui. 
-v1: The initial version for review - -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 9 +++++---- - drivers/md/bcache/super.c | 39 +++++++++++++++++++++++++++++++++++---- - drivers/md/bcache/sysfs.c | 3 ++- - drivers/md/bcache/writeback.c | 29 ++++++++++++++++++++++++++++- - 4 files changed, 70 insertions(+), 10 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 5e2d4e80198e..88d938c8d027 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -258,10 +258,11 @@ struct bcache_device { - struct gendisk *disk; - - unsigned long flags; --#define BCACHE_DEV_CLOSING 0 --#define BCACHE_DEV_DETACHING 1 --#define BCACHE_DEV_UNLINK_DONE 2 -- -+#define BCACHE_DEV_CLOSING 0 -+#define BCACHE_DEV_DETACHING 1 -+#define BCACHE_DEV_UNLINK_DONE 2 -+#define BCACHE_DEV_WB_RUNNING 4 -+#define BCACHE_DEV_RATE_DW_RUNNING 8 - unsigned nr_stripes; - unsigned stripe_size; - atomic_t *stripe_sectors_dirty; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index d14e09cce2f6..6d888e8fea8c 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -899,6 +899,32 @@ void bch_cached_dev_run(struct cached_dev *dc) - pr_debug("error creating sysfs link"); - } - -+/* -+ * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed -+ * work dc->writeback_rate_update is running. Wait until the routine -+ * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to -+ * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out -+ * seconds, give up waiting here and continue to cancel it too. 
-+ */ -+static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) -+{ -+ int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ; -+ -+ do { -+ if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING, -+ &dc->disk.flags)) -+ break; -+ time_out--; -+ schedule_timeout_interruptible(1); -+ } while (time_out > 0); -+ -+ if (time_out == 0) -+ pr_warn("bcache: give up waiting for " -+ "dc->writeback_write_update to quit"); -+ -+ cancel_delayed_work_sync(&dc->writeback_rate_update); -+} -+ - static void cached_dev_detach_finish(struct work_struct *w) - { - struct cached_dev *dc = container_of(w, struct cached_dev, detach); -@@ -911,7 +937,9 @@ static void cached_dev_detach_finish(struct work_struct *w) - - mutex_lock(&bch_register_lock); - -- cancel_delayed_work_sync(&dc->writeback_rate_update); -+ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ cancel_writeback_rate_update_dwork(dc); -+ - if (!IS_ERR_OR_NULL(dc->writeback_thread)) { - kthread_stop(dc->writeback_thread); - dc->writeback_thread = NULL; -@@ -954,6 +982,7 @@ void bch_cached_dev_detach(struct cached_dev *dc) - closure_get(&dc->disk.cl); - - bch_writeback_queue(dc); -+ - cached_dev_put(dc); - } - -@@ -1079,14 +1108,16 @@ static void cached_dev_free(struct closure *cl) - { - struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); - -- cancel_delayed_work_sync(&dc->writeback_rate_update); -+ mutex_lock(&bch_register_lock); -+ -+ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ cancel_writeback_rate_update_dwork(dc); -+ - if (!IS_ERR_OR_NULL(dc->writeback_thread)) - kthread_stop(dc->writeback_thread); - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); - -- mutex_lock(&bch_register_lock); -- - if (atomic_read(&dc->running)) - bd_unlink_disk_holder(dc->bdev, dc->disk.disk); - bcache_device_free(&dc->disk); -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index a74a752c9e0f..b7166c504cdb 100644 ---- a/drivers/md/bcache/sysfs.c 
-+++ b/drivers/md/bcache/sysfs.c -@@ -304,7 +304,8 @@ STORE(bch_cached_dev) - bch_writeback_queue(dc); - - if (attr == &sysfs_writeback_percent) -- schedule_delayed_work(&dc->writeback_rate_update, -+ if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - - mutex_unlock(&bch_register_lock); -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 4dbeaaa575bf..8f98ef1038d3 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -115,6 +115,21 @@ static void update_writeback_rate(struct work_struct *work) - struct cached_dev, - writeback_rate_update); - -+ /* -+ * should check BCACHE_DEV_RATE_DW_RUNNING before calling -+ * cancel_delayed_work_sync(). -+ */ -+ set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); -+ -+ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); -+ return; -+ } -+ - down_read(&dc->writeback_lock); - - if (atomic_read(&dc->has_dirty) && -@@ -123,8 +138,18 @@ static void update_writeback_rate(struct work_struct *work) - - up_read(&dc->writeback_lock); - -- schedule_delayed_work(&dc->writeback_rate_update, -+ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); -+ } -+ -+ /* -+ * should check BCACHE_DEV_RATE_DW_RUNNING before calling -+ * cancel_delayed_work_sync(). 
-+ */ -+ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); - } - - static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) -@@ -675,6 +700,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) - dc->writeback_rate_p_term_inverse = 40; - dc->writeback_rate_i_term_inverse = 10000; - -+ WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); - INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); - } - -@@ -693,6 +719,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) - return PTR_ERR(dc->writeback_thread); - } - -+ WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - --- -2.15.1 - diff --git a/for-next/v4/v4-0006-bcache-set-error_limit-correctly.patch b/for-next/v4/v4-0006-bcache-set-error_limit-correctly.patch deleted file mode 100644 index 927468d..0000000 --- a/for-next/v4/v4-0006-bcache-set-error_limit-correctly.patch +++ /dev/null @@ -1,121 +0,0 @@ -From f259f50b81b23abcd79f8e20ba479c61ef67d983 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 9 Jan 2018 22:46:25 +0800 -Subject: [PATCH v4 06/13] bcache: set error_limit correctly - -Struct cache uses io_errors for two purposes, -- Error decay: when cache set error_decay is set, io_errors is used to - generate a small piece of delay when I/O error happens. -- I/O errors counter: in order to generate big enough value for error - decay, I/O errors counter value is stored by left shifting 20 bits (a.k.a - IO_ERROR_SHIFT). - -In function bch_count_io_errors(), if I/O errors counter reaches cache set -error limit, bch_cache_set_error() will be called to retire the whold cache -set. 
But current code is problematic when checking the error limit, see the -following code piece from bch_count_io_errors(), - - 90 if (error) { - 91 char buf[BDEVNAME_SIZE]; - 92 unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, - 93 &ca->io_errors); - 94 errors >>= IO_ERROR_SHIFT; - 95 - 96 if (errors < ca->set->error_limit) - 97 pr_err("%s: IO error on %s, recovering", - 98 bdevname(ca->bdev, buf), m); - 99 else -100 bch_cache_set_error(ca->set, -101 "%s: too many IO errors %s", -102 bdevname(ca->bdev, buf), m); -103 } - -At line 94, errors is right shifting IO_ERROR_SHIFT bits, now it is real -errors counter to compare at line 96. But ca->set->error_limit is initia- -lized with an amplified value in bch_cache_set_alloc(), -1545 c->error_limit = 8 << IO_ERROR_SHIFT; - -It means by default, in bch_count_io_errors(), before 8<<20 errors happened -bch_cache_set_error() won't be called to retire the problematic cache -device. If the average request size is 64KB, it means bcache won't handle -failed device until 512GB data is requested. This is too large to be an I/O -threshold. So I believe the correct error limit should be much smaller. - -This patch sets default cache set error limit to 8, then in -bch_count_io_errors() when errors counter reaches 8 (if it is default -value), function bch_cache_set_error() will be called to retire the whole -cache set. This patch also removes bits shifting when store or show -io_error_limit value via sysfs interface. - -Nowadays most of SSDs handle internal flash failure automatically by LBA -address re-indirect mapping. If an I/O error can be observed by upper layer -code, it will be a notable error because that SSD can not re-indirect -map the problematic LBA address to an available flash block. This situation -indicates the whole SSD will fail very soon. Therefore setting 8 as -the default io error limit value makes sense, it is enough for most of -cache devices. - -Changelog: -v2: add reviewed-by from Hannes. 
-v1: initial version for review. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Reviewed-by: Tang Junhui <tang.junhui@zte.com.cn> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 1 + - drivers/md/bcache/super.c | 2 +- - drivers/md/bcache/sysfs.c | 4 ++-- - 3 files changed, 4 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 88d938c8d027..7d7512fa4f09 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -663,6 +663,7 @@ struct cache_set { - ON_ERROR_UNREGISTER, - ON_ERROR_PANIC, - } on_error; -+#define DEFAULT_IO_ERROR_LIMIT 8 - unsigned error_limit; - unsigned error_decay; - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 6d888e8fea8c..a373648b5d4b 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1583,7 +1583,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) - - c->congested_read_threshold_us = 2000; - c->congested_write_threshold_us = 20000; -- c->error_limit = 8 << IO_ERROR_SHIFT; -+ c->error_limit = DEFAULT_IO_ERROR_LIMIT; - - return c; - err: -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index b7166c504cdb..ba62e987b503 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -560,7 +560,7 @@ SHOW(__bch_cache_set) - - /* See count_io_errors for why 88 */ - sysfs_print(io_error_halflife, c->error_decay * 88); -- sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); -+ sysfs_print(io_error_limit, c->error_limit); - - sysfs_hprint(congested, - ((uint64_t) bch_get_congested(c)) << 9); -@@ -660,7 +660,7 @@ STORE(__bch_cache_set) - } - - if (attr == &sysfs_io_error_limit) -- c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; -+ c->error_limit = strtoul_or_return(buf); - - /* See count_io_errors() for why 88 */ - if (attr == &sysfs_io_error_halflife) --- -2.15.1 - diff --git 
a/for-next/v4/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch b/for-next/v4/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch deleted file mode 100644 index 849d522..0000000 --- a/for-next/v4/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch +++ /dev/null @@ -1,489 +0,0 @@ -From a7c1f04212502a6e1505bfc0917809363d988660 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 14 Jan 2018 22:15:00 +0800 -Subject: [PATCH v4 07/13] bcache: add CACHE_SET_IO_DISABLE to struct cache_set - flags - -When too many I/Os failed on cache device, bch_cache_set_error() is called -in the error handling code path to retire whole problematic cache set. If -new I/O requests continue to come and take refcount dc->count, the cache -set won't be retired immediately, this is a problem. - -Further more, there are several kernel thread and self-armed kernel work -may still running after bch_cache_set_error() is called. It needs to wait -quite a while for them to stop, or they won't stop at all. They also -prevent the cache set from being retired. - -The solution in this patch is, to add per cache set flag to disable I/O -request on this cache and all attached backing devices. Then new coming I/O -requests can be rejected in *_make_request() before taking refcount, kernel -threads and self-armed kernel worker can stop very fast when flags bit -CACHE_SET_IO_DISABLE is set. - -Because bcache also do internal I/Os for writeback, garbage collection, -bucket allocation, journaling, this kind of I/O should be disabled after -bch_cache_set_error() is called. So closure_bio_submit() is modified to -check whether CACHE_SET_IO_DISABLE is set on cache_set->flags. If set, -closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and -return, generic_make_request() won't be called. - -A sysfs interface is also added to set or clear CACHE_SET_IO_DISABLE bit -from cache_set->flags, to disable or enable cache set I/O for debugging. 
It -is helpful to trigger more corner case issues for failed cache device. - -Changelog -v2, more changes by previous review, -- Use CACHE_SET_IO_DISABLE of cache_set->flags, suggested by Junhui. -- Check CACHE_SET_IO_DISABLE in bch_btree_gc() to stop a while-loop, this - is reported and inspired from origal patch of Pavel Vazharov. -v1, initial version. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Pavel Vazharov <freakpv@gmail.com> ---- - drivers/md/bcache/alloc.c | 3 ++- - drivers/md/bcache/bcache.h | 18 ++++++++++++++++++ - drivers/md/bcache/btree.c | 10 +++++++--- - drivers/md/bcache/io.c | 2 +- - drivers/md/bcache/journal.c | 4 ++-- - drivers/md/bcache/request.c | 26 +++++++++++++++++++------- - drivers/md/bcache/super.c | 6 +++++- - drivers/md/bcache/sysfs.c | 20 ++++++++++++++++++++ - drivers/md/bcache/util.h | 6 ------ - drivers/md/bcache/writeback.c | 35 +++++++++++++++++++++++++++-------- - 10 files changed, 101 insertions(+), 29 deletions(-) - -diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c -index 458e1d38577d..004cc3cc6123 100644 ---- a/drivers/md/bcache/alloc.c -+++ b/drivers/md/bcache/alloc.c -@@ -287,7 +287,8 @@ do { \ - break; \ - \ - mutex_unlock(&(ca)->set->bucket_lock); \ -- if (kthread_should_stop()) { \ -+ if (kthread_should_stop() || \ -+ test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) { \ - set_current_state(TASK_RUNNING); \ - return 0; \ - } \ -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 7d7512fa4f09..c41736960045 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -475,10 +475,15 @@ struct gc_stat { - * - * CACHE_SET_RUNNING means all cache devices have been registered and journal - * replay is complete. 
-+ * -+ * CACHE_SET_IO_DISABLE is set when bcache is stopping the whold cache set, all -+ * external and internal I/O should be denied when this flag is set. -+ * - */ - #define CACHE_SET_UNREGISTERING 0 - #define CACHE_SET_STOPPING 1 - #define CACHE_SET_RUNNING 2 -+#define CACHE_SET_IO_DISABLE 4 - - struct cache_set { - struct closure cl; -@@ -862,6 +867,19 @@ static inline void wake_up_allocators(struct cache_set *c) - wake_up_process(ca->alloc_thread); - } - -+static inline void closure_bio_submit(struct cache_set *c, -+ struct bio *bio, -+ struct closure *cl) -+{ -+ closure_get(cl); -+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return; -+ } -+ generic_make_request(bio); -+} -+ - /* Forward declarations */ - - void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index bf3a48aa9a9a..0a0bc63011b4 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ -1744,6 +1744,7 @@ static void bch_btree_gc(struct cache_set *c) - - btree_gc_start(c); - -+ /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ - do { - ret = btree_root(gc_root, c, &op, &writes, &stats); - closure_sync(&writes); -@@ -1751,7 +1752,7 @@ static void bch_btree_gc(struct cache_set *c) - - if (ret && ret != -EAGAIN) - pr_warn("gc failed!"); -- } while (ret); -+ } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); - - bch_btree_gc_finish(c); - wake_up_allocators(c); -@@ -1789,9 +1790,12 @@ static int bch_gc_thread(void *arg) - - while (1) { - wait_event_interruptible(c->gc_wait, -- kthread_should_stop() || gc_should_run(c)); -+ kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags) || -+ gc_should_run(c)); - -- if (kthread_should_stop()) -+ if (kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) - break; - - set_gc_sectors(c); -diff --git a/drivers/md/bcache/io.c 
b/drivers/md/bcache/io.c -index a783c5a41ff1..8013ecbcdbda 100644 ---- a/drivers/md/bcache/io.c -+++ b/drivers/md/bcache/io.c -@@ -38,7 +38,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) - bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); - - b->submit_time_us = local_clock_us(); -- closure_bio_submit(bio, bio->bi_private); -+ closure_bio_submit(c, bio, bio->bi_private); - } - - void bch_submit_bbio(struct bio *bio, struct cache_set *c, -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index a87165c1d8e5..979873641030 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -62,7 +62,7 @@ reread: left = ca->sb.bucket_size - offset; - bio_set_op_attrs(bio, REQ_OP_READ, 0); - bch_bio_map(bio, data); - -- closure_bio_submit(bio, &cl); -+ closure_bio_submit(ca->set, bio, &cl); - closure_sync(&cl); - - /* This function could be simpler now since we no longer write -@@ -653,7 +653,7 @@ static void journal_write_unlocked(struct closure *cl) - spin_unlock(&c->journal.lock); - - while ((bio = bio_list_pop(&list))) -- closure_bio_submit(bio, cl); -+ closure_bio_submit(c, bio, cl); - - continue_at(cl, journal_write_done, NULL); - } -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 1a46b41dac70..02296bda6384 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -747,7 +747,7 @@ static void cached_dev_read_error(struct closure *cl) - - /* XXX: invalidate cache */ - -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } - - continue_at(cl, cached_dev_cache_miss_done, NULL); -@@ -872,7 +872,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - s->cache_miss = miss; - s->iop.bio = cache_bio; - bio_get(cache_bio); -- closure_bio_submit(cache_bio, &s->cl); -+ closure_bio_submit(s->iop.c, cache_bio, &s->cl); - - return ret; - out_put: -@@ -880,7 +880,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search 
*s, - out_submit: - miss->bi_end_io = request_endio; - miss->bi_private = &s->cl; -- closure_bio_submit(miss, &s->cl); -+ closure_bio_submit(s->iop.c, miss, &s->cl); - return ret; - } - -@@ -945,7 +945,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - - if ((bio_op(bio) != REQ_OP_DISCARD) || - blk_queue_discard(bdev_get_queue(dc->bdev))) -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } else if (s->iop.writeback) { - bch_writeback_add(dc); - s->iop.bio = bio; -@@ -960,12 +960,12 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - flush->bi_private = cl; - flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - -- closure_bio_submit(flush, cl); -+ closure_bio_submit(s->iop.c, flush, cl); - } - } else { - s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); - -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } - - closure_call(&s->iop.cl, bch_data_insert, NULL, cl); -@@ -981,7 +981,7 @@ static void cached_dev_nodata(struct closure *cl) - bch_journal_meta(s->iop.c, cl); - - /* If it's a flush, we send the flush to the backing device too */ -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - - continue_at(cl, cached_dev_bio_complete, NULL); - } -@@ -996,6 +996,12 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - int rw = bio_data_dir(bio); - -+ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return BLK_QC_T_NONE; -+ } -+ - atomic_set(&dc->backing_idle, 0); - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - -@@ -1112,6 +1118,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, - struct bcache_device *d = bio->bi_disk->private_data; - int rw = bio_data_dir(bio); - -+ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ 
bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return BLK_QC_T_NONE; -+ } -+ - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - - s = search_alloc(bio, d); -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index a373648b5d4b..4204d75aee7b 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -521,7 +521,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, - bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags); - bch_bio_map(bio, ca->disk_buckets); - -- closure_bio_submit(bio, &ca->prio); -+ closure_bio_submit(ca->set, bio, &ca->prio); - closure_sync(cl); - } - -@@ -1349,6 +1349,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) - test_bit(CACHE_SET_STOPPING, &c->flags)) - return false; - -+ if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) -+ pr_warn("bcache: CACHE_SET_IO_DISABLE already set"); -+ - /* XXX: we can be called from atomic context - acquire_console_sem(); - */ -@@ -1584,6 +1587,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) - c->congested_read_threshold_us = 2000; - c->congested_write_threshold_us = 20000; - c->error_limit = DEFAULT_IO_ERROR_LIMIT; -+ WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); - - return c; - err: -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index ba62e987b503..afb051bcfca1 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -92,6 +92,7 @@ read_attribute(partial_stripes_expensive); - - rw_attribute(synchronous); - rw_attribute(journal_delay_ms); -+rw_attribute(io_disable); - rw_attribute(discard); - rw_attribute(running); - rw_attribute(label); -@@ -577,6 +578,8 @@ SHOW(__bch_cache_set) - sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); - sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); - sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); -+ sysfs_printf(io_disable, "%i", -+ test_bit(CACHE_SET_IO_DISABLE, 
&c->flags)); - - if (attr == &sysfs_bset_tree_stats) - return bch_bset_print_stats(c, buf); -@@ -666,6 +669,22 @@ STORE(__bch_cache_set) - if (attr == &sysfs_io_error_halflife) - c->error_decay = strtoul_or_return(buf) / 88; - -+ if (attr == &sysfs_io_disable) { -+ int v = strtoul_or_return(buf); -+ -+ if (v) { -+ if (test_and_set_bit(CACHE_SET_IO_DISABLE, -+ &c->flags)) -+ pr_warn("bcache: CACHE_SET_IO_DISABLE" -+ " already set"); -+ } else { -+ if (!test_and_clear_bit(CACHE_SET_IO_DISABLE, -+ &c->flags)) -+ pr_warn("bcache: CACHE_SET_IO_DISABLE" -+ " already cleared"); -+ } -+ } -+ - sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); - sysfs_strtoul(verify, c->verify); - sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); -@@ -748,6 +767,7 @@ static struct attribute *bch_cache_set_internal_files[] = { - &sysfs_gc_always_rewrite, - &sysfs_btree_shrinker_disabled, - &sysfs_copy_gc_enabled, -+ &sysfs_io_disable, - NULL - }; - KTYPE(bch_cache_set_internal); -diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h -index 4df4c5c1cab2..7944eea54fa9 100644 ---- a/drivers/md/bcache/util.h -+++ b/drivers/md/bcache/util.h -@@ -565,12 +565,6 @@ static inline sector_t bdev_sectors(struct block_device *bdev) - return bdev->bd_inode->i_size >> 9; - } - --#define closure_bio_submit(bio, cl) \ --do { \ -- closure_get(cl); \ -- generic_make_request(bio); \ --} while (0) -- - uint64_t bch_crc64_update(uint64_t, const void *, size_t); - uint64_t bch_crc64(const void *, size_t); - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 8f98ef1038d3..3d7d8452e0de 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -114,6 +114,7 @@ static void update_writeback_rate(struct work_struct *work) - struct cached_dev *dc = container_of(to_delayed_work(work), - struct cached_dev, - writeback_rate_update); -+ struct cache_set *c = dc->disk.c; - - /* - * should check BCACHE_DEV_RATE_DW_RUNNING before calling -@@ 
-123,7 +124,12 @@ static void update_writeback_rate(struct work_struct *work) - /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); - -- if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ /* -+ * CACHE_SET_IO_DISABLE might be set via sysfs interface, -+ * check it here too. -+ */ -+ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); - /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); -@@ -138,7 +144,12 @@ static void update_writeback_rate(struct work_struct *work) - - up_read(&dc->writeback_lock); - -- if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ /* -+ * CACHE_SET_IO_DISABLE might be set via sysfs interface, -+ * check it here too. -+ */ -+ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - } -@@ -278,7 +289,7 @@ static void write_dirty(struct closure *cl) - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; - -- closure_bio_submit(&io->bio, cl); -+ closure_bio_submit(io->dc->disk.c, &io->bio, cl); - } - - atomic_set(&dc->writeback_sequence_next, next_sequence); -@@ -304,7 +315,7 @@ static void read_dirty_submit(struct closure *cl) - { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); - -- closure_bio_submit(&io->bio, cl); -+ closure_bio_submit(io->dc->disk.c, &io->bio, cl); - - continue_at(cl, write_dirty, io->dc->writeback_write_wq); - } -@@ -330,7 +341,9 @@ static void read_dirty(struct cached_dev *dc) - - next = bch_keybuf_next(&dc->writeback_keys); - -- while (!kthread_should_stop() && next) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && -+ next) { - size = 0; - nk = 0; - -@@ -427,7 +440,9 @@ static void read_dirty(struct cached_dev *dc) - } - } - -- 
while (!kthread_should_stop() && delay) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && -+ delay) { - schedule_timeout_interruptible(delay); - delay = writeback_delay(dc, 0); - } -@@ -583,11 +598,13 @@ static bool refill_dirty(struct cached_dev *dc) - static int bch_writeback_thread(void *arg) - { - struct cached_dev *dc = arg; -+ struct cache_set *c = dc->disk.c; - bool searched_full_index; - - bch_ratelimit_reset(&dc->writeback_rate); - -- while (!kthread_should_stop()) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - down_write(&dc->writeback_lock); - set_current_state(TASK_INTERRUPTIBLE); - /* -@@ -601,7 +618,8 @@ static int bch_writeback_thread(void *arg) - (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { - up_write(&dc->writeback_lock); - -- if (kthread_should_stop()) { -+ if (kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - set_current_state(TASK_RUNNING); - break; - } -@@ -637,6 +655,7 @@ static int bch_writeback_thread(void *arg) - - while (delay && - !kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags) && - !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) - delay = schedule_timeout_interruptible(delay); - --- -2.15.1 - diff --git a/for-next/v4/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch b/for-next/v4/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch deleted file mode 100644 index eab5e76..0000000 --- a/for-next/v4/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch +++ /dev/null @@ -1,67 +0,0 @@ -From 86e6ce9e732449701c0d00048b5a07c140bd2ee5 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 10 Jan 2018 00:26:32 +0800 -Subject: [PATCH v4 08/13] bcache: stop all attached bcache devices for a - retired cache set - -When there are too many I/O errors on cache device, current bcache code -will retire the whole cache set, and detach all bcache devices. 
But the -detached bcache devices are not stopped, which is problematic when bcache -is in writeback mode. - -If the retired cache set has dirty data of backing devices, continue -writing to bcache device will write to backing device directly. If the -LBA of write request has a dirty version cached on cache device, next time -when the cache device is re-registered and backing device re-attached to -it again, the stale dirty data on cache device will be written to backing -device, and overwrite latest directly written data. This situation causes -silent data corruption. - -This patch checks whether cache_set->io_disable is true in -__cache_set_unregister(). If cache_set->io_disable is true, it means cache -set is unregistering by too many I/O errors, then all attached bcache -devices will be stopped as well. If cache_set->io_disable is not true, it -means __cache_set_unregister() is triggered by writing 1 to sysfs file -/sys/fs/bcache/<UUID>/bcache/stop. This is an exception because users do -it explicitly, this patch keeps existing behavior and does not stop any -bcache device. - -Even if the failed cache device has no dirty data, stopping bcache device is -still a desired behavior by many Ceph and database users. Then their -application will report I/O errors due to disappeared bcache device, and -operation people will know the cache device is broken or disconnected. - -Changelog: -v2: add reviewed-by from Hannes. -v1: initial version for review. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> ---- - drivers/md/bcache/super.c | 8 ++++++++ - 1 file changed, 8 insertions(+) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 4204d75aee7b..97e3bb8e1aee 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1478,6 +1478,14 @@ static void __cache_set_unregister(struct closure *cl) - dc = container_of(c->devices[i], - struct cached_dev, disk); - bch_cached_dev_detach(dc); -+ /* -+ * If we come here by too many I/O errors, -+ * bcache device should be stopped too, to -+ * keep data consistency on cache and -+ * backing devices. -+ */ -+ if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) -+ bcache_device_stop(c->devices[i]); - } else { - bcache_device_stop(c->devices[i]); - } --- -2.15.1 - diff --git a/for-next/v4/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch b/for-next/v4/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch deleted file mode 100644 index 048a30a..0000000 --- a/for-next/v4/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch +++ /dev/null @@ -1,119 +0,0 @@ -From 4d6a58a04771b787578862bae770e69eee1b358e Mon Sep 17 00:00:00 2001 -From: Tang Junhui <tang.junhui@zte.com.cn> -Date: Tue, 9 Jan 2018 10:27:11 +0800 -Subject: [PATCH v4 09/13] bcache: fix inaccurate io state for detached bcache - devices - -When we run IO in a detached device, and run iostat to show IO status, -normally it will show like below (Omitted some fields): -Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util -sdd ... 15.89 0.53 1.82 0.20 2.23 1.81 52.30 -bcache0 ... 15.89 115.42 0.00 0.00 0.00 2.40 69.60 -but after IO stopped, there are still very big avgqu-sz and %util -values as below: -Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util -bcache0 ... 
0 5326.32 0.00 0.00 0.00 0.00 100.10 - -The reason for this issue is that, only generic_start_io_acct() called -and no generic_end_io_acct() called for detached device in -cached_dev_make_request(). See the code: -//start generic_start_io_acct() -generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); -if (cached_dev_get(dc)) { - //will callback generic_end_io_acct() -} -else { - //will not call generic_end_io_acct() -} - -This patch calls generic_end_io_acct() in the end of IO for detached -devices, so we can show IO state correctly. - -(Modified to use GFP_NOIO in kzalloc() by Coly Li) - -Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> -Reviewed-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> ---- - drivers/md/bcache/request.c | 58 +++++++++++++++++++++++++++++++++++++++------ - 1 file changed, 51 insertions(+), 7 deletions(-) - -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 02296bda6384..e09c5ae745be 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -986,6 +986,55 @@ static void cached_dev_nodata(struct closure *cl) - continue_at(cl, cached_dev_bio_complete, NULL); - } - -+struct detached_dev_io_private { -+ struct bcache_device *d; -+ unsigned long start_time; -+ bio_end_io_t *bi_end_io; -+ void *bi_private; -+}; -+ -+static void detatched_dev_end_io(struct bio *bio) -+{ -+ struct detached_dev_io_private *ddip; -+ -+ ddip = bio->bi_private; -+ bio->bi_end_io = ddip->bi_end_io; -+ bio->bi_private = ddip->bi_private; -+ -+ generic_end_io_acct(ddip->d->disk->queue, -+ bio_data_dir(bio), -+ &ddip->d->disk->part0, ddip->start_time); -+ -+ kfree(ddip); -+ -+ bio->bi_end_io(bio); -+} -+ -+static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) -+{ -+ struct detached_dev_io_private *ddip; -+ struct cached_dev *dc = container_of(d, struct cached_dev, disk); -+ -+ /* -+ * no need to call closure_get(&dc->disk.cl), -+ * because upper layer had 
already opened bcache device, -+ * which would call closure_get(&dc->disk.cl) -+ */ -+ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); -+ ddip->d = d; -+ ddip->start_time = jiffies; -+ ddip->bi_end_io = bio->bi_end_io; -+ ddip->bi_private = bio->bi_private; -+ bio->bi_end_io = detatched_dev_end_io; -+ bio->bi_private = ddip; -+ -+ if ((bio_op(bio) == REQ_OP_DISCARD) && -+ !blk_queue_discard(bdev_get_queue(dc->bdev))) -+ bio->bi_end_io(bio); -+ else -+ generic_make_request(bio); -+} -+ - /* Cached devices - read & write stuff */ - - static blk_qc_t cached_dev_make_request(struct request_queue *q, -@@ -1028,13 +1077,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - else - cached_dev_read(dc, s); - } -- } else { -- if ((bio_op(bio) == REQ_OP_DISCARD) && -- !blk_queue_discard(bdev_get_queue(dc->bdev))) -- bio_endio(bio); -- else -- generic_make_request(bio); -- } -+ } else -+ detached_dev_do_request(d, bio); - - return BLK_QC_T_NONE; - } --- -2.15.1 - diff --git a/for-next/v4/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch b/for-next/v4/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch deleted file mode 100644 index 80f6dc8..0000000 --- a/for-next/v4/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch +++ /dev/null @@ -1,255 +0,0 @@ -From 1e8e6958888300f4b50ccc6798d4ce17b0e92afe Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 10 Jan 2018 21:01:48 +0800 -Subject: [PATCH v4 10/13] bcache: add backing_request_endio() for bi_end_io of - attached backing device I/O - -In order to catch I/O error of backing device, a separate bi_end_io -call back is required. Then a per backing device counter can record I/O -errors number and retire the backing device if the counter reaches a -per backing device I/O error limit. - -This patch adds backing_request_endio() to bcache backing device I/O code -path, this is a preparation for further complicated backing device failure -handling. 
So far there is no real code logic change, I make this change a -separate patch to make sure it is stable and reliable for further work. - -Changelog: -v2: indeed this is new added in this patch set. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> ---- - drivers/md/bcache/request.c | 95 +++++++++++++++++++++++++++++++++++-------- - drivers/md/bcache/super.c | 1 + - drivers/md/bcache/writeback.c | 1 + - 3 files changed, 81 insertions(+), 16 deletions(-) - -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index e09c5ae745be..ad4cf71f7eab 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -139,6 +139,7 @@ static void bch_data_invalidate(struct closure *cl) - } - - op->insert_data_done = true; -+ /* get in bch_data_insert() */ - bio_put(bio); - out: - continue_at(cl, bch_data_insert_keys, op->wq); -@@ -630,6 +631,38 @@ static void request_endio(struct bio *bio) - closure_put(cl); - } - -+static void backing_request_endio(struct bio *bio) -+{ -+ struct closure *cl = bio->bi_private; -+ -+ if (bio->bi_status) { -+ struct search *s = container_of(cl, struct search, cl); -+ /* -+ * If a bio has REQ_PREFLUSH for writeback mode, it is -+ * speically assembled in cached_dev_write() for a non-zero -+ * write request which has REQ_PREFLUSH. we don't set -+ * s->iop.status by this failure, the status will be decided -+ * by result of bch_data_insert() operation. 
-+ */ -+ if (unlikely(s->iop.writeback && -+ bio->bi_opf & REQ_PREFLUSH)) { -+ char buf[BDEVNAME_SIZE]; -+ -+ bio_devname(bio, buf); -+ pr_err("Can't flush %s: returned bi_status %i", -+ buf, bio->bi_status); -+ } else { -+ /* set to orig_bio->bi_status in bio_complete() */ -+ s->iop.status = bio->bi_status; -+ } -+ s->recoverable = false; -+ /* should count I/O error for backing device here */ -+ } -+ -+ bio_put(bio); -+ closure_put(cl); -+} -+ - static void bio_complete(struct search *s) - { - if (s->orig_bio) { -@@ -644,13 +677,21 @@ static void bio_complete(struct search *s) - } - } - --static void do_bio_hook(struct search *s, struct bio *orig_bio) -+static void do_bio_hook(struct search *s, -+ struct bio *orig_bio, -+ bio_end_io_t *end_io_fn) - { - struct bio *bio = &s->bio.bio; - - bio_init(bio, NULL, 0); - __bio_clone_fast(bio, orig_bio); -- bio->bi_end_io = request_endio; -+ /* -+ * bi_end_io can be set separately somewhere else, e.g. the -+ * variants in, -+ * - cache_bio->bi_end_io from cached_dev_cache_miss() -+ * - n->bi_end_io from cache_lookup_fn() -+ */ -+ bio->bi_end_io = end_io_fn; - bio->bi_private = &s->cl; - - bio_cnt_set(bio, 3); -@@ -676,7 +717,7 @@ static inline struct search *search_alloc(struct bio *bio, - s = mempool_alloc(d->c->search, GFP_NOIO); - - closure_init(&s->cl, NULL); -- do_bio_hook(s, bio); -+ do_bio_hook(s, bio, request_endio); - - s->orig_bio = bio; - s->cache_miss = NULL; -@@ -743,10 +784,11 @@ static void cached_dev_read_error(struct closure *cl) - trace_bcache_read_retry(s->orig_bio); - - s->iop.status = 0; -- do_bio_hook(s, s->orig_bio); -+ do_bio_hook(s, s->orig_bio, backing_request_endio); - - /* XXX: invalidate cache */ - -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, bio, cl); - } - -@@ -859,7 +901,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - bio_copy_dev(cache_bio, miss); - cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; - -- cache_bio->bi_end_io = 
request_endio; -+ cache_bio->bi_end_io = backing_request_endio; - cache_bio->bi_private = &s->cl; - - bch_bio_map(cache_bio, NULL); -@@ -872,14 +914,16 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - s->cache_miss = miss; - s->iop.bio = cache_bio; - bio_get(cache_bio); -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, cache_bio, &s->cl); - - return ret; - out_put: - bio_put(cache_bio); - out_submit: -- miss->bi_end_io = request_endio; -+ miss->bi_end_io = backing_request_endio; - miss->bi_private = &s->cl; -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, miss, &s->cl); - return ret; - } -@@ -943,31 +987,48 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - s->iop.bio = s->orig_bio; - bio_get(s->iop.bio); - -- if ((bio_op(bio) != REQ_OP_DISCARD) || -- blk_queue_discard(bdev_get_queue(dc->bdev))) -- closure_bio_submit(s->iop.c, bio, cl); -+ if (bio_op(bio) == REQ_OP_DISCARD && -+ !blk_queue_discard(bdev_get_queue(dc->bdev))) -+ goto insert_data; -+ -+ /* I/O request sent to backing device */ -+ bio->bi_end_io = backing_request_endio; -+ closure_bio_submit(s->iop.c, bio, cl); -+ - } else if (s->iop.writeback) { - bch_writeback_add(dc); - s->iop.bio = bio; - - if (bio->bi_opf & REQ_PREFLUSH) { -- /* Also need to send a flush to the backing device */ -- struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, -- dc->disk.bio_split); -- -+ /* -+ * Also need to send a flush to the backing -+ * device, if failed on backing device. 
-+ */ -+ struct bio *flush; -+ -+ flush = bio_alloc_bioset(GFP_NOIO, 0, -+ dc->disk.bio_split); -+ if (!flush) { -+ s->iop.status = BLK_STS_RESOURCE; -+ goto insert_data; -+ } - bio_copy_dev(flush, bio); -- flush->bi_end_io = request_endio; -+ flush->bi_end_io = backing_request_endio; - flush->bi_private = cl; - flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; -- -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, flush, cl); - } -+ bch_writeback_add(dc); -+ - } else { - s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); -- -+ /* I/O request sent to backing device */ -+ bio->bi_end_io = backing_request_endio; - closure_bio_submit(s->iop.c, bio, cl); - } - -+insert_data: - closure_call(&s->iop.cl, bch_data_insert, NULL, cl); - continue_at(cl, cached_dev_write_complete, NULL); - } -@@ -981,6 +1042,7 @@ static void cached_dev_nodata(struct closure *cl) - bch_journal_meta(s->iop.c, cl); - - /* If it's a flush, we send the flush to the backing device too */ -+ bio->bi_end_io = backing_request_endio; - closure_bio_submit(s->iop.c, bio, cl); - - continue_at(cl, cached_dev_bio_complete, NULL); -@@ -1078,6 +1140,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - cached_dev_read(dc, s); - } - } else -+ /* I/O request sent to backing device */ - detached_dev_do_request(d, bio); - - return BLK_QC_T_NONE; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 97e3bb8e1aee..08a0b541a4da 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -265,6 +265,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) - bio->bi_private = dc; - - closure_get(cl); -+ /* I/O request sent to backing device */ - __write_super(&dc->sb, bio); - - closure_return_with_destructor(cl, bch_write_bdev_super_unlock); -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 3d7d8452e0de..4ebe0119ea7e 100644 ---- a/drivers/md/bcache/writeback.c -+++ 
b/drivers/md/bcache/writeback.c -@@ -289,6 +289,7 @@ static void write_dirty(struct closure *cl) - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; - -+ /* I/O request sent to backing device */ - closure_bio_submit(io->dc->disk.c, &io->bio, cl); - } - --- -2.15.1 - diff --git a/for-next/v4/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch b/for-next/v4/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch deleted file mode 100644 index 6b4ae2a..0000000 --- a/for-next/v4/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch +++ /dev/null @@ -1,235 +0,0 @@ -From 63d3df27ffc3a82d15f3f7f428194988f410197a Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 10 Jan 2018 21:33:45 +0800 -Subject: [PATCH v4 11/13] bcache: add io_disable to struct cached_dev - -If a bcache device is configured to writeback mode, current code does not -handle write I/O errors on backing devices properly. - -In writeback mode, write request is written to cache device, and -latter being flushed to backing device. If I/O failed when writing from -cache device to the backing device, bcache code just ignores the error and -upper layer code is NOT noticed that the backing device is broken. - -This patch tries to handle backing device failure like how the cache device -failure is handled, -- Add a error counter 'io_errors' and error limit 'error_limit' in struct - cached_dev. Add another io_disable to struct cached_dev to disable I/Os - on the problematic backing device. -- When I/O error happens on backing device, increase io_errors counter. And - if io_errors reaches error_limit, set cache_dev->io_disable to true, and - stop the bcache device. - -The result is, if backing device is broken of disconnected, and I/O errors -reach its error limit, backing device will be disabled and the associated -bcache device will be removed from system. - -Changelog: -v2: indeed this is new added in v2 patch set. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 7 +++++++ - drivers/md/bcache/io.c | 14 ++++++++++++++ - drivers/md/bcache/request.c | 14 ++++++++++++-- - drivers/md/bcache/super.c | 22 ++++++++++++++++++++++ - drivers/md/bcache/sysfs.c | 15 ++++++++++++++- - 5 files changed, 69 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index c41736960045..5a811959392d 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -360,6 +360,7 @@ struct cached_dev { - unsigned sequential_cutoff; - unsigned readahead; - -+ unsigned io_disable:1; - unsigned verify:1; - unsigned bypass_torture_test:1; - -@@ -379,6 +380,10 @@ struct cached_dev { - unsigned writeback_rate_i_term_inverse; - unsigned writeback_rate_p_term_inverse; - unsigned writeback_rate_minimum; -+ -+#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 -+ atomic_t io_errors; -+ unsigned error_limit; - }; - - enum alloc_reserve { -@@ -882,6 +887,7 @@ static inline void closure_bio_submit(struct cache_set *c, - - /* Forward declarations */ - -+void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); - void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); - void bch_bbio_count_io_errors(struct cache_set *, struct bio *, - blk_status_t, const char *); -@@ -909,6 +915,7 @@ int bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, bool); - bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, - unsigned, unsigned, bool); -+bool bch_cached_dev_error(struct cached_dev *dc); - - __printf(2, 3) - bool bch_cache_set_error(struct cache_set *, const char *, ...); -diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c -index 8013ecbcdbda..7fac97ae036e 100644 ---- a/drivers/md/bcache/io.c -+++ b/drivers/md/bcache/io.c -@@ -50,6 +50,20 @@ 
void bch_submit_bbio(struct bio *bio, struct cache_set *c, - } - - /* IO errors */ -+void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) -+{ -+ char buf[BDEVNAME_SIZE]; -+ unsigned errors; -+ -+ WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); -+ -+ errors = atomic_add_return(1, &dc->io_errors); -+ if (errors < dc->error_limit) -+ pr_err("%s: IO error on backing device, unrecoverable", -+ bio_devname(bio, buf)); -+ else -+ bch_cached_dev_error(dc); -+} - - void bch_count_io_errors(struct cache *ca, - blk_status_t error, -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index ad4cf71f7eab..386b388ce296 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -637,6 +637,8 @@ static void backing_request_endio(struct bio *bio) - - if (bio->bi_status) { - struct search *s = container_of(cl, struct search, cl); -+ struct cached_dev *dc = container_of(s->d, -+ struct cached_dev, disk); - /* - * If a bio has REQ_PREFLUSH for writeback mode, it is - * speically assembled in cached_dev_write() for a non-zero -@@ -657,6 +659,7 @@ static void backing_request_endio(struct bio *bio) - } - s->recoverable = false; - /* should count I/O error for backing device here */ -+ bch_count_backing_io_errors(dc, bio); - } - - bio_put(bio); -@@ -1067,8 +1070,14 @@ static void detatched_dev_end_io(struct bio *bio) - bio_data_dir(bio), - &ddip->d->disk->part0, ddip->start_time); - -- kfree(ddip); -+ if (bio->bi_status) { -+ struct cached_dev *dc = container_of(ddip->d, -+ struct cached_dev, disk); -+ /* should count I/O error for backing device here */ -+ bch_count_backing_io_errors(dc, bio); -+ } - -+ kfree(ddip); - bio->bi_end_io(bio); - } - -@@ -1107,7 +1116,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - int rw = bio_data_dir(bio); - -- if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ if (unlikely((d->c 
&& test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || -+ dc->io_disable)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - return BLK_QC_T_NONE; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 08a0b541a4da..14fce3623770 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1188,6 +1188,10 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) - max(dc->disk.disk->queue->backing_dev_info->ra_pages, - q->backing_dev_info->ra_pages); - -+ atomic_set(&dc->io_errors, 0); -+ dc->io_disable = false; -+ dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; -+ - bch_cached_dev_request_init(dc); - bch_cached_dev_writeback_init(dc); - return 0; -@@ -1339,6 +1343,24 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) - return flash_dev_run(c, u); - } - -+bool bch_cached_dev_error(struct cached_dev *dc) -+{ -+ char name[BDEVNAME_SIZE]; -+ -+ if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) -+ return false; -+ -+ dc->io_disable = true; -+ /* make others know io_disable is true earlier */ -+ smp_mb(); -+ -+ pr_err("bcache: stop %s: too many IO errors on backing device %s\n", -+ dc->disk.name, bdevname(dc->bdev, name)); -+ -+ bcache_device_stop(&dc->disk); -+ return true; -+} -+ - /* Cache set */ - - __printf(2, 3) -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index afb051bcfca1..7288927f2a47 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -131,7 +131,9 @@ SHOW(__bch_cached_dev) - var_print(writeback_delay); - var_print(writeback_percent); - sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); -- -+ sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); -+ sysfs_printf(io_error_limit, "%i", dc->error_limit); -+ sysfs_printf(io_disable, "%i", dc->io_disable); - var_print(writeback_rate_update_seconds); - var_print(writeback_rate_i_term_inverse); - var_print(writeback_rate_p_term_inverse); -@@ -223,6 +225,14 @@ STORE(__cached_dev) - 
d_strtoul(writeback_rate_i_term_inverse); - d_strtoul_nonzero(writeback_rate_p_term_inverse); - -+ sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); -+ -+ if (attr == &sysfs_io_disable) { -+ int v = strtoul_or_return(buf); -+ -+ dc->io_disable = v ? 1 : 0; -+ } -+ - d_strtoi_h(sequential_cutoff); - d_strtoi_h(readahead); - -@@ -330,6 +340,9 @@ static struct attribute *bch_cached_dev_files[] = { - &sysfs_writeback_rate_i_term_inverse, - &sysfs_writeback_rate_p_term_inverse, - &sysfs_writeback_rate_debug, -+ &sysfs_errors, -+ &sysfs_io_error_limit, -+ &sysfs_io_disable, - &sysfs_dirty_data, - &sysfs_stripe_size, - &sysfs_partial_stripes_expensive, --- -2.15.1 - diff --git a/for-next/v4/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch b/for-next/v4/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch deleted file mode 100644 index e73bf4f..0000000 --- a/for-next/v4/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch +++ /dev/null @@ -1,148 +0,0 @@ -From fd9bb15c3ac093f087401ed275184e2a54eadbb6 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 13 Jan 2018 17:31:44 +0800 -Subject: [PATCH v4 12/13] bcache: stop bcache device when backing device is - offline - -Currently bcache does not handle backing device failure, if backing -device is offline and disconnected from system, its bcache device can still -be accessible. If the bcache device is in writeback mode, I/O requests even -can success if the requests hit on cache device. That is to say, when and -how bcache handles offline backing device is undefined. - -This patch tries to handle backing device offline in a rather simple way, -- Add cached_dev->status_update_thread kernel thread to update backing - device status in every 1 second. -- Add cached_dev->offline_seconds to record how many seconds the backing - device is observed to be offline. 
If the backing device is offline for - BACKING_DEV_OFFLINE_TIMEOUT (30) seconds, set dc->io_disable to 1 and - call bcache_device_stop() to stop the bache device which linked to the - offline backing device. - -Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds, -its bcache device will be removed, then user space application writing on -it will get error immediately, and handler the device failure in time. - -This patch is quite simple, does not handle more complicated situations. -Once the bcache device is stopped, users need to recovery the backing -device, register and attach it manually. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 2 ++ - drivers/md/bcache/super.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++ - 2 files changed, 57 insertions(+) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 5a811959392d..9eedb35d01bc 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -338,6 +338,7 @@ struct cached_dev { - - struct keybuf writeback_keys; - -+ struct task_struct *status_update_thread; - /* - * Order the write-half of writeback operations strongly in dispatch - * order. 
(Maintain LBA order; don't allow reads completing out of -@@ -384,6 +385,7 @@ struct cached_dev { - #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 - atomic_t io_errors; - unsigned error_limit; -+ unsigned offline_seconds; - }; - - enum alloc_reserve { -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 14fce3623770..85adf1e29d11 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -646,6 +646,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode, - unsigned int cmd, unsigned long arg) - { - struct bcache_device *d = b->bd_disk->private_data; -+ struct cached_dev *dc = container_of(d, struct cached_dev, disk); -+ -+ if (dc->io_disable) -+ return -EIO; -+ - return d->ioctl(d, mode, cmd, arg); - } - -@@ -856,6 +861,45 @@ static void calc_cached_dev_sectors(struct cache_set *c) - c->cached_dev_sectors = sectors; - } - -+#define BACKING_DEV_OFFLINE_TIMEOUT 5 -+static int cached_dev_status_update(void *arg) -+{ -+ struct cached_dev *dc = arg; -+ struct request_queue *q; -+ char buf[BDEVNAME_SIZE]; -+ -+ /* -+ * If this delayed worker is stopping outside, directly quit here. -+ * dc->io_disable might be set via sysfs interface, so check it -+ * here too. 
-+ */ -+ while (!kthread_should_stop() && !dc->io_disable) { -+ q = bdev_get_queue(dc->bdev); -+ if (blk_queue_dying(q)) -+ dc->offline_seconds++; -+ else -+ dc->offline_seconds = 0; -+ -+ if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) { -+ pr_err("%s: device offline for %d seconds", -+ bdevname(dc->bdev, buf), -+ BACKING_DEV_OFFLINE_TIMEOUT); -+ pr_err("%s: disable I/O request due to backing " -+ "device offline", dc->disk.name); -+ dc->io_disable = true; -+ /* let others know earlier that io_disable is true */ -+ smp_mb(); -+ bcache_device_stop(&dc->disk); -+ break; -+ } -+ -+ schedule_timeout_interruptible(HZ); -+ } -+ -+ dc->status_update_thread = NULL; -+ return 0; -+} -+ - void bch_cached_dev_run(struct cached_dev *dc) - { - struct bcache_device *d = &dc->disk; -@@ -898,6 +942,15 @@ void bch_cached_dev_run(struct cached_dev *dc) - if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || - sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) - pr_debug("error creating sysfs link"); -+ -+ dc->status_update_thread = kthread_run(cached_dev_status_update, -+ dc, -+ "bcache_status_update"); -+ if (IS_ERR(dc->status_update_thread)) { -+ pr_warn("bcache: failed to create bcache_status_update " -+ "kthread, continue to run without monitoring backing " -+ "device status"); -+ } - } - - /* -@@ -1118,6 +1171,8 @@ static void cached_dev_free(struct closure *cl) - kthread_stop(dc->writeback_thread); - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); -+ if (!IS_ERR_OR_NULL(dc->status_update_thread)) -+ kthread_stop(dc->status_update_thread); - - if (atomic_read(&dc->running)) - bd_unlink_disk_holder(dc->bdev, dc->disk.disk); --- -2.15.1 - diff --git a/for-next/v4/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch b/for-next/v4/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch deleted file mode 100644 index d9edf10..0000000 --- 
a/for-next/v4/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch +++ /dev/null @@ -1,180 +0,0 @@ -From 86e6c96037b81ca6d302e1e7d4342fd1decc8814 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 27 Jan 2018 20:06:15 +0800 -Subject: [PATCH v4 13/13] bcache: add stop_when_cache_set_failed to struct - cached_dev - -Current bcache failure handling code will stop all attached bcache devices -when the cache set is broken or disconnected. This is desired behavior for -most of enterprise or cloud use cases, but maybe not for low end -configuration. Nix <nix@esperi.org.uk> points out, users may still want to -access the bcache device after cache device failed, for example on laptops. - -This patch adds a per-cached_dev option stop_when_cache_set_failed, which -is enabled (1) by default. Its value can be set via sysfs, when it is set -to 0, the corresponding bcache device won't be stopped when a broken -or disconnected cache set is retiring. - -When the cached device has dirty data on retiring cache set, if bcache -device is not stopped, following I/O request on the bcache device may -result data corruption on backing device. This patch also prints out warn- -ing information in kernel message. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Nix <nix@esperi.org.uk> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Hannes Reinecke <hare@suse.com> ---- - drivers/md/bcache/bcache.h | 1 + - drivers/md/bcache/super.c | 63 +++++++++++++++++++++++++++++++++------------- - drivers/md/bcache/sysfs.c | 10 ++++++++ - 3 files changed, 56 insertions(+), 18 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 9eedb35d01bc..3756a196916f 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -362,6 +362,7 @@ struct cached_dev { - unsigned readahead; - - unsigned io_disable:1; -+ unsigned stop_when_cache_set_failed:1; - unsigned verify:1; - unsigned bypass_torture_test:1; - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 85adf1e29d11..93f720433b40 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1246,6 +1246,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) - atomic_set(&dc->io_errors, 0); - dc->io_disable = false; - dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; -+ dc->stop_when_cache_set_failed = 1; - - bch_cached_dev_request_init(dc); - bch_cached_dev_writeback_init(dc); -@@ -1541,33 +1542,59 @@ static void cache_set_flush(struct closure *cl) - closure_return(cl); - } - -+/* -+ * dc->stop_when_cache_set_failed is default to true. If it is explicitly -+ * set to false by user, the bcache device won't be stopped when cache set -+ * is broken or disconnected. If there is dirty data on failed cache set, -+ * not stopping bcache device may result data corruption on backing device, -+ * pr_warn() notices the protential risk in kernel message. 
-+ */ -+static void try_stop_bcache_device(struct cache_set *c, -+ struct bcache_device *d, -+ struct cached_dev *dc) -+{ -+ if (dc->stop_when_cache_set_failed) -+ bcache_device_stop(d); -+ else if (!dc->stop_when_cache_set_failed && -+ atomic_read(&dc->has_dirty)) -+ pr_warn("bcache: device %s won't be stopped while unregistering" -+ " broken dirty cache set %pU, your data has potential " -+ "risk to be corrupted. To disable this warning message," -+ " please set /sys/block/%s/bcache/stop_when_" -+ "cache_set_failed to 1.", -+ d->name, c->sb.set_uuid, d->name); -+} -+ - static void __cache_set_unregister(struct closure *cl) - { - struct cache_set *c = container_of(cl, struct cache_set, caching); - struct cached_dev *dc; -+ struct bcache_device *d; - size_t i; - - mutex_lock(&bch_register_lock); - -- for (i = 0; i < c->devices_max_used; i++) -- if (c->devices[i]) { -- if (!UUID_FLASH_ONLY(&c->uuids[i]) && -- test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { -- dc = container_of(c->devices[i], -- struct cached_dev, disk); -- bch_cached_dev_detach(dc); -- /* -- * If we come here by too many I/O errors, -- * bcache device should be stopped too, to -- * keep data consistency on cache and -- * backing devices. -- */ -- if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) -- bcache_device_stop(c->devices[i]); -- } else { -- bcache_device_stop(c->devices[i]); -- } -+ for (i = 0; i < c->devices_max_used; i++) { -+ d = c->devices[i]; -+ if (!d) -+ continue; -+ -+ if (!UUID_FLASH_ONLY(&c->uuids[i]) && -+ test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { -+ dc = container_of(d, struct cached_dev, disk); -+ bch_cached_dev_detach(dc); -+ /* -+ * If we come here by too many I/O errors, -+ * bcache device should be stopped too, to -+ * keep data consistency on cache and -+ * backing devices. 
-+ */ -+ if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) -+ try_stop_bcache_device(c, d, dc); -+ } else { -+ bcache_device_stop(d); - } -+ } - - mutex_unlock(&bch_register_lock); - -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index 7288927f2a47..b096d4c37c9b 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -93,6 +93,7 @@ read_attribute(partial_stripes_expensive); - rw_attribute(synchronous); - rw_attribute(journal_delay_ms); - rw_attribute(io_disable); -+rw_attribute(stop_when_cache_set_failed); - rw_attribute(discard); - rw_attribute(running); - rw_attribute(label); -@@ -134,6 +135,8 @@ SHOW(__bch_cached_dev) - sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); - sysfs_printf(io_error_limit, "%i", dc->error_limit); - sysfs_printf(io_disable, "%i", dc->io_disable); -+ sysfs_printf(stop_when_cache_set_failed, "%i", -+ dc->stop_when_cache_set_failed); - var_print(writeback_rate_update_seconds); - var_print(writeback_rate_i_term_inverse); - var_print(writeback_rate_p_term_inverse); -@@ -233,6 +236,12 @@ STORE(__cached_dev) - dc->io_disable = v ? 1 : 0; - } - -+ if (attr == &sysfs_stop_when_cache_set_failed) { -+ int v = strtoul_or_return(buf); -+ -+ dc->stop_when_cache_set_failed = v ? 
1 : 0; -+ } -+ - d_strtoi_h(sequential_cutoff); - d_strtoi_h(readahead); - -@@ -343,6 +352,7 @@ static struct attribute *bch_cached_dev_files[] = { - &sysfs_errors, - &sysfs_io_error_limit, - &sysfs_io_disable, -+ &sysfs_stop_when_cache_set_failed, - &sysfs_dirty_data, - &sysfs_stripe_size, - &sysfs_partial_stripes_expensive, --- -2.15.1 - diff --git a/for-next/v5/v5-0000-cover-letter.patch b/for-next/v5/v5-0000-cover-letter.patch deleted file mode 100644 index f643463..0000000 --- a/for-next/v5/v5-0000-cover-letter.patch +++ /dev/null @@ -1,95 +0,0 @@ -From e8f72263c0f4f20b85f42a617fa4998115f797af Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 5 Feb 2018 18:26:45 +0800 -Subject: [PATCH v5 00/10] bcache: device failure handling improvement - -Hi maintainers and folks, - -This patch set tries to improve bcache device failure handling, includes -cache device and backing device failures. - -The basic idea to handle failed cache device is, -- Unregister cache set -- Detach all backing devices which are attached to this cache set -- Stop all the detached bcache devices (configurable) -- Stop all flash only volume on the cache set -The above process is named 'cache set retire' by me. The result of cache -set retire is, cache set and bcache devices are all removed, following -I/O requests will get failed immediately to notift upper layer or user -space coce that the cache device is failed or disconnected. -- Stop all the detached bcache devices (configurable) -- Stop all flash only volume on the cache set -The above process is named 'cache set retire' by me. The result of cache -set retire is, cache set and bcache devices are all removed -(configurable), following I/O requests will get failed immediately to -notify upper layer or user space coce that the cache device is failed or -disconnected. - -There are 2 patches from v4 patch set is merged into bcache-for-next, they -are not in v5 patch set any more. 
- -V5 patch set adds a new patch "bcache: add stop_when_cache_set_failed -option to backing device", which provides "auto"/"always" options to -configure whether or not to stop bcache device for a broken cache set. The -patch "bcache: stop all attached bcache devices for a retired cache set" -from v4 patch set is replaced by the above newly added patch. - -Most of the patches are reviewed by Hannes Reinecke and Junhui Tang. There -are still several patches that need to be reviewed, -- [PATCH v5 03/10] bcache: quit dc->writeback_thread when - BCACHE_DEV_DETACHING is set -- [PATCH v5 06/10] bcache: add stop_when_cache_set_failed option to - backing device - -Any comment, question and review are warmly welcome. Thanks in advance. - -Changelog: -v5: replace patch "bcache: stop all attached bcache devices for a retired - cache set" from v4 patch set by "bcache: add stop_when_cache_set_failed - option to backing device" from v5 patch set. - fix issues from v4 patch set. - improve kernel message format, remove redundant prefix string. -v4: add per-cached_dev option stop_attached_devs_on_fail to avoid stopping - attached bcache device from a retiring cache set. -v3: fix detach issue found in v2 patch set. -v2: fixes all problems found in v1 review. - add patches to handle backing device failure. - add one more patch to set writeback_rate_update_seconds range. - include a patch from Junhui Tang. -v1: the initial version, only handles cache device failure.
- -Coly Li - - -Coly Li (10): - bcache: set writeback_rate_update_seconds in range [1, 60] seconds - bcache: fix cached_dev->count usage for bch_cache_set_error() - bcache: quit dc->writeback_thread when BCACHE_DEV_DETACHING is set - bcache: stop dc->writeback_rate_update properly - bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags - bcache: stop all attached bcache devices for a retired cache set - bcache: add backing_request_endio() for bi_end_io of attached backing - device I/O - bcache: add io_disable to struct cached_dev - bcache: stop bcache device when backing device is offline - bcache: add stop_when_cache_set_failed option to backing device - -Tang Junhui (1): - bcache: fix inaccurate io state for detached bcache devices - - drivers/md/bcache/alloc.c | 3 +- - drivers/md/bcache/bcache.h | 44 ++++++++- - drivers/md/bcache/btree.c | 10 +- - drivers/md/bcache/io.c | 16 +++- - drivers/md/bcache/journal.c | 4 +- - drivers/md/bcache/request.c | 185 +++++++++++++++++++++++++++++++------ - drivers/md/bcache/super.c | 206 ++++++++++++++++++++++++++++++++++++++---- - drivers/md/bcache/sysfs.c | 59 +++++++++++- - drivers/md/bcache/util.h | 6 -- - drivers/md/bcache/writeback.c | 94 ++++++++++++++++--- - drivers/md/bcache/writeback.h | 5 +- - 11 files changed, 551 insertions(+), 81 deletions(-) - --- -2.16.1 - diff --git a/for-next/v5/v5-0001-bcache-set-writeback_rate_update_seconds-in-range.patch b/for-next/v5/v5-0001-bcache-set-writeback_rate_update_seconds-in-range.patch deleted file mode 100644 index 4a6c147..0000000 --- a/for-next/v5/v5-0001-bcache-set-writeback_rate_update_seconds-in-range.patch +++ /dev/null @@ -1,79 +0,0 @@ -From 71066c410c4f50bb1803a634dff17fd0ecb90860 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 13 Jan 2018 15:11:03 +0800 -Subject: [PATCH v5 01/10] bcache: set writeback_rate_update_seconds in range - [1, 60] seconds - -dc->writeback_rate_update_seconds can be set via sysfs and its value can -be set to [1, 
ULONG_MAX]. It does not make sense to set such a large -value, 60 seconds is long enough value considering the default 5 seconds -works well for long time. - -Because dc->writeback_rate_update is a special delayed work, it re-arms -itself inside the delayed work routine update_writeback_rate(). When -stopping it by cancel_delayed_work_sync(), there should be a timeout to -wait and make sure the re-armed delayed work is stopped too. A small max -value of dc->writeback_rate_update_seconds is also helpful to decide a -reasonable small timeout. - -This patch limits sysfs interface to set dc->writeback_rate_update_seconds -in range of [1, 60] seconds, and replaces the hand-coded number by macros. - -Changelog: -v2: fix a rebase typo in v4, which is pointed out by Michael Lyle. -v1: initial version. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Michael Lyle <mlyle@lyle.org> ---- - drivers/md/bcache/sysfs.c | 4 +++- - drivers/md/bcache/writeback.c | 2 +- - drivers/md/bcache/writeback.h | 3 +++ - 3 files changed, 7 insertions(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index c524305cc9a7..4a6a697e1680 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -218,7 +218,9 @@ STORE(__cached_dev) - sysfs_strtoul_clamp(writeback_rate, - dc->writeback_rate.rate, 1, INT_MAX); - -- d_strtoul_nonzero(writeback_rate_update_seconds); -+ sysfs_strtoul_clamp(writeback_rate_update_seconds, -+ dc->writeback_rate_update_seconds, -+ 1, WRITEBACK_RATE_UPDATE_SECS_MAX); - d_strtoul(writeback_rate_i_term_inverse); - d_strtoul_nonzero(writeback_rate_p_term_inverse); - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 58218f7e77c3..f1d2fc15abcc 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -655,7 +655,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) - dc->writeback_rate.rate = 1024; - 
dc->writeback_rate_minimum = 8; - -- dc->writeback_rate_update_seconds = 5; -+ dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; - dc->writeback_rate_p_term_inverse = 40; - dc->writeback_rate_i_term_inverse = 10000; - -diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h -index 66f1c527fa24..587b25599856 100644 ---- a/drivers/md/bcache/writeback.h -+++ b/drivers/md/bcache/writeback.h -@@ -8,6 +8,9 @@ - #define MAX_WRITEBACKS_IN_PASS 5 - #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ - -+#define WRITEBACK_RATE_UPDATE_SECS_MAX 60 -+#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 -+ - /* - * 14 (16384ths) is chosen here as something that each backing device - * should be a reasonable fraction of the share, and not to blow up --- -2.16.1 - diff --git a/for-next/v5/v5-0002-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch b/for-next/v5/v5-0002-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch deleted file mode 100644 index 1ff898a..0000000 --- a/for-next/v5/v5-0002-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch +++ /dev/null @@ -1,178 +0,0 @@ -From 8d90ae56c8b859dbd3b4360c8e011f5fee7b3540 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 8 Jan 2018 23:05:58 +0800 -Subject: [PATCH v5 02/10] bcache: fix cached_dev->count usage for - bch_cache_set_error() - -When bcache metadata I/O fails, bcache will call bch_cache_set_error() -to retire the whole cache set. The expected behavior to retire a cache -set is to unregister the cache set, and unregister all backing device -attached to this cache set, then remove sysfs entries of the cache set -and all attached backing devices, finally release memory of structs -cache_set, cache, cached_dev and bcache_device. - -In my testing when journal I/O failure triggered by disconnected cache -device, sometimes the cache set cannot be retired, and its sysfs -entry /sys/fs/bcache/<uuid> still exits and the backing device also -references it. 
This is not expected behavior. - -When metadata I/O fails, the call sequence to retire whole cache set is, - bch_cache_set_error() - bch_cache_set_unregister() - bch_cache_set_stop() - __cache_set_unregister() <- called as callback by calling - closure_queue(&c->caching) - cache_set_flush() <- called as a callback when refcount - of cache_set->caching is 0 - cache_set_free() <- called as a callback when refcount - of cache_set->cl is 0 - bch_cache_set_release() <- called as a callback when refcount - of cache_set->kobj is 0 - -I find if kernel thread bch_writeback_thread() quits while-loop when -kthread_should_stop() is true and searched_full_index is false, closure -callback cache_set_flush() set by continue_at() will never be called. The -result is, bcache fails to retire whole cache set. - -cache_set_flush() will be called when refcount of closure c->caching is 0, -and in function bcache_device_detach() refcount of closure c->caching is -released to 0 by closure_put(). In metadata error code path, function -bcache_device_detach() is called by cached_dev_detach_finish(). This is a -callback routine being called when cached_dev->count is 0. This refcount -is decreased by cached_dev_put(). - -The above dependence indicates, cache_set_flush() will be called when -refcount of cache_set->cl is 0, and refcount of cache_set->cl to be 0 -when refcount of cached_dev->count is 0. - -The reason why sometimes cached_dev->count is not 0 (when metadata I/O fails -and bch_cache_set_error() called) is, in bch_writeback_thread(), refcount -of cached_dev is not decreased properly. - -In bch_writeback_thread(), cached_dev_put() is called only when -searched_full_index is true and cached_dev->writeback_keys is empty, a.k.a -there is no dirty data on cache. In most of run time it is correct, but -when bch_writeback_thread() quits the while-loop while cache is still -dirty, current code forgets to call cached_dev_put() before this kernel -thread exits.
This is why sometimes cache_set_flush() is not executed and -cache set fails to be retired. - -The reason to call cached_dev_put() in bch_writeback_thread() is, when the -cache device changes from clean to dirty, cached_dev_get() is called, to -make sure during writeback operations both backing and cache devices -won't be released. - -Adding following code in bch_writeback_thread() does not work, - static int bch_writeback_thread(void *arg) - } - -+ if (atomic_read(&dc->has_dirty)) -+ cached_dev_put() -+ - return 0; - } -because writeback kernel thread can be woken up and start via sysfs entry: - echo 1 > /sys/block/bcache<N>/bcache/writeback_running -It is difficult to check whether backing device is dirty without race and -extra lock. So the above modification will introduce potential refcount -underflow in some conditions. - -The correct fix is, to take cached dev refcount when creating the kernel -thread, and put it before the kernel thread exits. Then bcache does not -need to take a cached dev refcount when cache turns from clean to dirty, -or to put a cached dev refcount when cache turns from dirty to clean. The -writeback kernel thread is always safe to reference data structure from -cache set, cache and cached device (because a refcount of cached device is -taken for it already), and no matter the kernel thread is stopped by I/O -errors or system reboot, cached_dev->count can always be used correctly. - -The patch is simple, but understanding how it works is quite complicated. - -Changelog: -v2: set dc->writeback_thread to NULL in this patch, as suggested by Hannes. -v1: initial version for review.
- -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/super.c | 1 - - drivers/md/bcache/writeback.c | 11 ++++++++--- - drivers/md/bcache/writeback.h | 2 -- - 3 files changed, 8 insertions(+), 6 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index a2ad37a8afc0..7d96dc6860fa 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1052,7 +1052,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) - if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - bch_sectors_dirty_init(&dc->disk); - atomic_set(&dc->has_dirty, 1); -- refcount_inc(&dc->count); - bch_writeback_queue(dc); - } - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index f1d2fc15abcc..b280c134dd4d 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -572,7 +572,7 @@ static int bch_writeback_thread(void *arg) - - if (kthread_should_stop()) { - set_current_state(TASK_RUNNING); -- return 0; -+ break; - } - - schedule(); -@@ -585,7 +585,6 @@ static int bch_writeback_thread(void *arg) - if (searched_full_index && - RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { - atomic_set(&dc->has_dirty, 0); -- cached_dev_put(dc); - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); - } -@@ -606,6 +605,9 @@ static int bch_writeback_thread(void *arg) - } - } - -+ dc->writeback_thread = NULL; -+ cached_dev_put(dc); -+ - return 0; - } - -@@ -669,10 +671,13 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) - if (!dc->writeback_write_wq) - return -ENOMEM; - -+ cached_dev_get(dc); - dc->writeback_thread = kthread_create(bch_writeback_thread, dc, - "bcache_writeback"); -- if (IS_ERR(dc->writeback_thread)) -+ if (IS_ERR(dc->writeback_thread)) { -+ cached_dev_put(dc); - return PTR_ERR(dc->writeback_thread); -+ } - - 
schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); -diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h -index 587b25599856..0bba8f1c6cdf 100644 ---- a/drivers/md/bcache/writeback.h -+++ b/drivers/md/bcache/writeback.h -@@ -105,8 +105,6 @@ static inline void bch_writeback_add(struct cached_dev *dc) - { - if (!atomic_read(&dc->has_dirty) && - !atomic_xchg(&dc->has_dirty, 1)) { -- refcount_inc(&dc->count); -- - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); - /* XXX: should do this synchronously */ --- -2.16.1 - diff --git a/for-next/v5/v5-0003-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch b/for-next/v5/v5-0003-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch deleted file mode 100644 index cd0b2e6..0000000 --- a/for-next/v5/v5-0003-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch +++ /dev/null @@ -1,130 +0,0 @@ -From 26562d0421bf1fa18492e4089fead5b1f97616e2 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 14 Jan 2018 21:41:57 +0800 -Subject: [PATCH v5 03/10] bcache: quit dc->writeback_thread when - BCACHE_DEV_DETACHING is set - -In patch "bcache: fix cached_dev->count usage for bch_cache_set_error()", -cached_dev_get() is called when creating dc->writeback_thread, and -cached_dev_put() is called when exiting dc->writeback_thread. This -modification works well unless people detach the bcache device manually by - 'echo 1 > /sys/block/bcache<N>/bcache/detach' -Because this sysfs interface only calls bch_cached_dev_detach() which wakes -up dc->writeback_thread but does not stop it. The reason is, before patch -"bcache: fix cached_dev->count usage for bch_cache_set_error()", inside -bch_writeback_thread(), if cache is not dirty after writeback, -cached_dev_put() will be called here. And in cached_dev_make_request() when -a new write request makes cache from clean to dirty, cached_dev_get() will -be called there. 
Since we don't operate dc->count in these locations, -refcount dc->count cannot be dropped after cache becomes clean, and -cached_dev_detach_finish() won't be called to detach bcache device. - -This patch fixes the issue by checking whether BCACHE_DEV_DETACHING is -set inside bch_writeback_thread(). If this bit is set and cache is clean -(no existing writeback_keys), break the while-loop, call cached_dev_put() -and quit the writeback thread. - -Please note if cache is still dirty, even BCACHE_DEV_DETACHING is set the -writeback thread should continue to perform writeback, this is the original -design of manually detach. - -It is safe to do the following check without locking, let me explain why, -+ if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && -+ (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { - -If the kernel thread does not sleep and continues to run due to conditions -are not updated in time on the running CPU core, it just consumes more CPU -cycles and does no harm. This should-sleep-but-run is safe here. We just -focus on the should-run-but-sleep condition, which means the writeback -thread goes to sleep by mistake while it should continue to run. -1, First of all, no matter the writeback thread is hung or not, kthread_stop() from - cached_dev_detach_finish() will wake it up and terminate by making - kthread_should_stop() return true. And in normal run time, bit on index - BCACHE_DEV_DETACHING is always cleared, the condition - !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) - is always true and can be ignored as constant value. -2, If one of the following conditions is true, the writeback thread should - go to sleep, - "!atomic_read(&dc->has_dirty)" or "!dc->writeback_running" - each of them independently controls the writeback thread should sleep or - not, let's analyse them one by one.
-2.1 condition "!atomic_read(&dc->has_dirty)" - If dc->has_dirty is set from 0 to 1 on another CPU core, bcache will - call bch_writeback_queue() immediately or call bch_writeback_add() which - indirectly calls bch_writeback_queue() too. In bch_writeback_queue(), - wake_up_process(dc->writeback_thread) is called. It sets writeback - thread's task state to TASK_RUNNING and following an implicit memory - barrier, then tries to wake up the writeback thread. - In writeback thread, its task state is set to TASK_INTERRUPTIBLE before - doing the condition check. If other CPU core sets the TASK_RUNNING state - after writeback thread setting TASK_INTERRUPTIBLE, the writeback thread - will be scheduled to run very soon because its state is not - TASK_INTERRUPTIBLE. If other CPU core sets the TASK_RUNNING state before - writeback thread setting TASK_INTERRUPTIBLE, the implicit memory barrier - of wake_up_process() will make sure modification of dc->has_dirty on - other CPU core is updated and observed on the CPU core of writeback - thread. Therefore the condition check will correctly be false, and - continue writeback code without sleeping. -2.2 condition "!dc->writeback_running" - dc->writeback_running can be changed via sysfs file, every time it is - modified, a following bch_writeback_queue() is always called. So the - change is always observed on the CPU core of writeback thread. If - dc->writeback_running is changed from 0 to 1 on other CPU core, this - condition check will observe the modification and allow writeback - thread to continue to run without sleeping. -Now we can see, even without a locking protection, multiple conditions -check is safe here, no deadlock or process hang up will happen. - -I compose a separate patch because that patch "bcache: fix cached_dev->count -usage for bch_cache_set_error()" already gets a "Reviewed-by:" from Hannes -Reinecke. Also this fix is not trivial and good for a separate patch.
- -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.com> -Cc: Huijun Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/writeback.c | 20 +++++++++++++++++--- - 1 file changed, 17 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index b280c134dd4d..4dbeaaa575bf 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -565,9 +565,15 @@ static int bch_writeback_thread(void *arg) - while (!kthread_should_stop()) { - down_write(&dc->writeback_lock); - set_current_state(TASK_INTERRUPTIBLE); -- if (!atomic_read(&dc->has_dirty) || -- (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && -- !dc->writeback_running)) { -+ /* -+ * If the bache device is detaching, skip here and continue -+ * to perform writeback. Otherwise, if no dirty data on cache, -+ * or there is dirty data on cache but writeback is disabled, -+ * the writeback thread should sleep here and wait for others -+ * to wake up it. -+ */ -+ if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && -+ (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { - up_write(&dc->writeback_lock); - - if (kthread_should_stop()) { -@@ -587,6 +593,14 @@ static int bch_writeback_thread(void *arg) - atomic_set(&dc->has_dirty, 0); - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); -+ /* -+ * If bcache device is detaching via sysfs interface, -+ * writeback thread should stop after there is no dirty -+ * data on cache. BCACHE_DEV_DETACHING flag is set in -+ * bch_cached_dev_detach(). 
-+ */ -+ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) -+ break; - } - - up_write(&dc->writeback_lock); --- -2.16.1 - diff --git a/for-next/v5/v5-0004-bcache-stop-dc-writeback_rate_update-properly.patch b/for-next/v5/v5-0004-bcache-stop-dc-writeback_rate_update-properly.patch deleted file mode 100644 index 909a381..0000000 --- a/for-next/v5/v5-0004-bcache-stop-dc-writeback_rate_update-properly.patch +++ /dev/null @@ -1,268 +0,0 @@ -From 0661a1f418c8efe59d19f952218c2faca0044275 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 13 Jan 2018 15:48:39 +0800 -Subject: [PATCH v5 04/10] bcache: stop dc->writeback_rate_update properly - -struct delayed_work writeback_rate_update in struct cache_dev is a delayed -worker to call function update_writeback_rate() in period (the interval is -defined by dc->writeback_rate_update_seconds). - -When a metadate I/O error happens on cache device, bcache error handling -routine bch_cache_set_error() will call bch_cache_set_unregister() to -retire whole cache set. On the unregister code path, this delayed work is -stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update). - -dc->writeback_rate_update is a special delayed work from others in bcache. -In its routine update_writeback_rate(), this delayed work is re-armed -itself. That means when cancel_delayed_work_sync() returns, this delayed -work can still be executed after several seconds defined by -dc->writeback_rate_update_seconds. - -The problem is, after cancel_delayed_work_sync() returns, the cache set -unregister code path will continue and release memory of struct cache set. -Then the delayed work is scheduled to run, __update_writeback_rate() -will reference the already released cache_set memory, and trigger a NULL -pointer deference fault. - -This patch introduces two more bcache device flags, -- BCACHE_DEV_WB_RUNNING - bit set: bcache device is in writeback mode and running, it is OK for - dc->writeback_rate_update to re-arm itself. 
- bit clear: bcache device is trying to stop dc->writeback_rate_update, - this delayed work should not re-arm itself and quit. -- BCACHE_DEV_RATE_DW_RUNNING - bit set: routine update_writeback_rate() is executing. - bit clear: routine update_writeback_rate() quits. - -This patch also adds a function cancel_writeback_rate_update_dwork() to -wait for dc->writeback_rate_update quits before cancel it by calling -cancel_delayed_work_sync(). In order to avoid a deadlock by unexpected -quit dc->writeback_rate_update, after time_out seconds this function will -give up and continue to call cancel_delayed_work_sync(). - -And here I explain how this patch stops self re-armed delayed work properly -with the above stuff. - -update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning -and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling -cancel_writeback_rate_update_dwork() clear flag BCACHE_DEV_WB_RUNNING. - -Before calling cancel_delayed_work_sync() wait until flag -BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling -cancel_delayed_work_sync(), dc->writeback_rate_update must be already re- -armed, or quit on seeing BCACHE_DEV_WB_RUNNING cleared. In both cases -delayed work routine update_writeback_rate() won't be executed after -cancel_delayed_work_sync() returns. - -Inside update_writeback_rate() before calling schedule_delayed_work(), flag -BCACHE_DEV_WB_RUNNING is checked first. If this flag is cleared, it means -someone is about to stop the delayed work. Because flag -BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync() -has to wait for this flag to be cleared, we don't need to worry about race -condition here. - -If update_writeback_rate() is scheduled to run after checking -BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync() -in cancel_writeback_rate_update_dwork(), it is also safe. Because at this -moment BCACHE_DEV_WB_RUNNING is cleared with memory barrier.
As I mentioned -previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear -and quit immediately. - -Because there are more dependences inside update_writeback_rate() to struct -cache_set memory, dc->writeback_rate_update is not a simple self re-arm -delayed work. After trying many different methods (e.g. hold dc->count, or -use locks), this is the only way I can find which works to properly stop -dc->writeback_rate_update delayed work. - -Changelog: -v3: change values of BCACHE_DEV_WB_RUNNING and BCACHE_DEV_RATE_DW_RUNNING - to bit index, for test_bit(). -v2: Try to fix the race issue which is pointed out by Junhui. -v1: The initial version for review - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.com> ---- - drivers/md/bcache/bcache.h | 9 +++++---- - drivers/md/bcache/super.c | 39 +++++++++++++++++++++++++++++++++++---- - drivers/md/bcache/sysfs.c | 3 ++- - drivers/md/bcache/writeback.c | 29 ++++++++++++++++++++++++++++- - 4 files changed, 70 insertions(+), 10 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index b8c2e1bef1f1..0380626bf525 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -258,10 +258,11 @@ struct bcache_device { - struct gendisk *disk; - - unsigned long flags; --#define BCACHE_DEV_CLOSING 0 --#define BCACHE_DEV_DETACHING 1 --#define BCACHE_DEV_UNLINK_DONE 2 -- -+#define BCACHE_DEV_CLOSING 0 -+#define BCACHE_DEV_DETACHING 1 -+#define BCACHE_DEV_UNLINK_DONE 2 -+#define BCACHE_DEV_WB_RUNNING 3 -+#define BCACHE_DEV_RATE_DW_RUNNING 4 - unsigned nr_stripes; - unsigned stripe_size; - atomic_t *stripe_sectors_dirty; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 7d96dc6860fa..e15cacecf078 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -899,6 +899,32 @@ void bch_cached_dev_run(struct cached_dev *dc) - 
pr_debug("error creating sysfs link"); - } - -+/* -+ * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed -+ * work dc->writeback_rate_update is running. Wait until the routine -+ * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to -+ * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out -+ * seconds, give up waiting here and continue to cancel it too. -+ */ -+static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) -+{ -+ int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ; -+ -+ do { -+ if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING, -+ &dc->disk.flags)) -+ break; -+ time_out--; -+ schedule_timeout_interruptible(1); -+ } while (time_out > 0); -+ -+ if (time_out == 0) -+ pr_warn("give up waiting for dc->writeback_write_update" -+ " to quit"); -+ -+ cancel_delayed_work_sync(&dc->writeback_rate_update); -+} -+ - static void cached_dev_detach_finish(struct work_struct *w) - { - struct cached_dev *dc = container_of(w, struct cached_dev, detach); -@@ -911,7 +937,9 @@ static void cached_dev_detach_finish(struct work_struct *w) - - mutex_lock(&bch_register_lock); - -- cancel_delayed_work_sync(&dc->writeback_rate_update); -+ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ cancel_writeback_rate_update_dwork(dc); -+ - if (!IS_ERR_OR_NULL(dc->writeback_thread)) { - kthread_stop(dc->writeback_thread); - dc->writeback_thread = NULL; -@@ -954,6 +982,7 @@ void bch_cached_dev_detach(struct cached_dev *dc) - closure_get(&dc->disk.cl); - - bch_writeback_queue(dc); -+ - cached_dev_put(dc); - } - -@@ -1079,14 +1108,16 @@ static void cached_dev_free(struct closure *cl) - { - struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); - -- cancel_delayed_work_sync(&dc->writeback_rate_update); -+ mutex_lock(&bch_register_lock); -+ -+ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ cancel_writeback_rate_update_dwork(dc); -+ - if (!IS_ERR_OR_NULL(dc->writeback_thread)) - 
kthread_stop(dc->writeback_thread); - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); - -- mutex_lock(&bch_register_lock); -- - if (atomic_read(&dc->running)) - bd_unlink_disk_holder(dc->bdev, dc->disk.disk); - bcache_device_free(&dc->disk); -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index 4a6a697e1680..399e91cbf714 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -306,7 +306,8 @@ STORE(bch_cached_dev) - bch_writeback_queue(dc); - - if (attr == &sysfs_writeback_percent) -- schedule_delayed_work(&dc->writeback_rate_update, -+ if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - - mutex_unlock(&bch_register_lock); -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 4dbeaaa575bf..8f98ef1038d3 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -115,6 +115,21 @@ static void update_writeback_rate(struct work_struct *work) - struct cached_dev, - writeback_rate_update); - -+ /* -+ * should check BCACHE_DEV_RATE_DW_RUNNING before calling -+ * cancel_delayed_work_sync(). 
-+ */ -+ set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); -+ -+ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); -+ return; -+ } -+ - down_read(&dc->writeback_lock); - - if (atomic_read(&dc->has_dirty) && -@@ -123,8 +138,18 @@ static void update_writeback_rate(struct work_struct *work) - - up_read(&dc->writeback_lock); - -- schedule_delayed_work(&dc->writeback_rate_update, -+ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); -+ } -+ -+ /* -+ * should check BCACHE_DEV_RATE_DW_RUNNING before calling -+ * cancel_delayed_work_sync(). -+ */ -+ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); - } - - static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) -@@ -675,6 +700,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) - dc->writeback_rate_p_term_inverse = 40; - dc->writeback_rate_i_term_inverse = 10000; - -+ WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); - INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); - } - -@@ -693,6 +719,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) - return PTR_ERR(dc->writeback_thread); - } - -+ WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - --- -2.16.1 - diff --git a/for-next/v5/v5-0005-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch b/for-next/v5/v5-0005-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch deleted file mode 100644 index 7444b9a..0000000 --- 
a/for-next/v5/v5-0005-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch +++ /dev/null @@ -1,491 +0,0 @@ -From f9371b6b9d66ff73942770360cce17a72ca7625a Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 14 Jan 2018 22:15:00 +0800 -Subject: [PATCH v5 05/10] bcache: add CACHE_SET_IO_DISABLE to struct cache_set - flags - -When too many I/Os failed on cache device, bch_cache_set_error() is called -in the error handling code path to retire whole problematic cache set. If -new I/O requests continue to come and take refcount dc->count, the cache -set won't be retired immediately, this is a problem. - -Further more, there are several kernel thread and self-armed kernel work -may still running after bch_cache_set_error() is called. It needs to wait -quite a while for them to stop, or they won't stop at all. They also -prevent the cache set from being retired. - -The solution in this patch is, to add per cache set flag to disable I/O -request on this cache and all attached backing devices. Then new coming I/O -requests can be rejected in *_make_request() before taking refcount, kernel -threads and self-armed kernel worker can stop very fast when flags bit -CACHE_SET_IO_DISABLE is set. - -Because bcache also do internal I/Os for writeback, garbage collection, -bucket allocation, journaling, this kind of I/O should be disabled after -bch_cache_set_error() is called. So closure_bio_submit() is modified to -check whether CACHE_SET_IO_DISABLE is set on cache_set->flags. If set, -closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and -return, generic_make_request() won't be called. - -A sysfs interface is also added to set or clear CACHE_SET_IO_DISABLE bit -from cache_set->flags, to disable or enable cache set I/O for debugging. It -is helpful to trigger more corner case issues for failed cache device. - -Changelog -v3, change CACHE_SET_IO_DISABLE from 4 to 3, since it is bit index. - remove "bcache: " prefix when printing out kernel message. 
-v2, more changes by previous review, -- Use CACHE_SET_IO_DISABLE of cache_set->flags, suggested by Junhui. -- Check CACHE_SET_IO_DISABLE in bch_btree_gc() to stop a while-loop, this - is reported and inspired from origal patch of Pavel Vazharov. -v1, initial version. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Pavel Vazharov <freakpv@gmail.com> ---- - drivers/md/bcache/alloc.c | 3 ++- - drivers/md/bcache/bcache.h | 18 ++++++++++++++++++ - drivers/md/bcache/btree.c | 10 +++++++--- - drivers/md/bcache/io.c | 2 +- - drivers/md/bcache/journal.c | 4 ++-- - drivers/md/bcache/request.c | 26 +++++++++++++++++++------- - drivers/md/bcache/super.c | 6 +++++- - drivers/md/bcache/sysfs.c | 20 ++++++++++++++++++++ - drivers/md/bcache/util.h | 6 ------ - drivers/md/bcache/writeback.c | 35 +++++++++++++++++++++++++++-------- - 10 files changed, 101 insertions(+), 29 deletions(-) - -diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c -index 458e1d38577d..004cc3cc6123 100644 ---- a/drivers/md/bcache/alloc.c -+++ b/drivers/md/bcache/alloc.c -@@ -287,7 +287,8 @@ do { \ - break; \ - \ - mutex_unlock(&(ca)->set->bucket_lock); \ -- if (kthread_should_stop()) { \ -+ if (kthread_should_stop() || \ -+ test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) { \ - set_current_state(TASK_RUNNING); \ - return 0; \ - } \ -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 0380626bf525..7917b3820dd5 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -475,10 +475,15 @@ struct gc_stat { - * - * CACHE_SET_RUNNING means all cache devices have been registered and journal - * replay is complete. -+ * -+ * CACHE_SET_IO_DISABLE is set when bcache is stopping the whold cache set, all -+ * external and internal I/O should be denied when this flag is set. 
-+ * - */ - #define CACHE_SET_UNREGISTERING 0 - #define CACHE_SET_STOPPING 1 - #define CACHE_SET_RUNNING 2 -+#define CACHE_SET_IO_DISABLE 3 - - struct cache_set { - struct closure cl; -@@ -868,6 +873,19 @@ static inline void wake_up_allocators(struct cache_set *c) - wake_up_process(ca->alloc_thread); - } - -+static inline void closure_bio_submit(struct cache_set *c, -+ struct bio *bio, -+ struct closure *cl) -+{ -+ closure_get(cl); -+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return; -+ } -+ generic_make_request(bio); -+} -+ - /* Forward declarations */ - - void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index fad9fe8817eb..8ca50f387a1d 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ -1744,6 +1744,7 @@ static void bch_btree_gc(struct cache_set *c) - - btree_gc_start(c); - -+ /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ - do { - ret = btree_root(gc_root, c, &op, &writes, &stats); - closure_sync(&writes); -@@ -1751,7 +1752,7 @@ static void bch_btree_gc(struct cache_set *c) - - if (ret && ret != -EAGAIN) - pr_warn("gc failed!"); -- } while (ret); -+ } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); - - bch_btree_gc_finish(c); - wake_up_allocators(c); -@@ -1789,9 +1790,12 @@ static int bch_gc_thread(void *arg) - - while (1) { - wait_event_interruptible(c->gc_wait, -- kthread_should_stop() || gc_should_run(c)); -+ kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags) || -+ gc_should_run(c)); - -- if (kthread_should_stop()) -+ if (kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) - break; - - set_gc_sectors(c); -diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c -index a783c5a41ff1..8013ecbcdbda 100644 ---- a/drivers/md/bcache/io.c -+++ b/drivers/md/bcache/io.c -@@ -38,7 +38,7 @@ void 
__bch_submit_bbio(struct bio *bio, struct cache_set *c) - bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); - - b->submit_time_us = local_clock_us(); -- closure_bio_submit(bio, bio->bi_private); -+ closure_bio_submit(c, bio, bio->bi_private); - } - - void bch_submit_bbio(struct bio *bio, struct cache_set *c, -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 1b736b860739..c94085f400a4 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -62,7 +62,7 @@ reread: left = ca->sb.bucket_size - offset; - bio_set_op_attrs(bio, REQ_OP_READ, 0); - bch_bio_map(bio, data); - -- closure_bio_submit(bio, &cl); -+ closure_bio_submit(ca->set, bio, &cl); - closure_sync(&cl); - - /* This function could be simpler now since we no longer write -@@ -674,7 +674,7 @@ static void journal_write_unlocked(struct closure *cl) - spin_unlock(&c->journal.lock); - - while ((bio = bio_list_pop(&list))) -- closure_bio_submit(bio, cl); -+ closure_bio_submit(c, bio, cl); - - continue_at(cl, journal_write_done, NULL); - } -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 1a46b41dac70..02296bda6384 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -747,7 +747,7 @@ static void cached_dev_read_error(struct closure *cl) - - /* XXX: invalidate cache */ - -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } - - continue_at(cl, cached_dev_cache_miss_done, NULL); -@@ -872,7 +872,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - s->cache_miss = miss; - s->iop.bio = cache_bio; - bio_get(cache_bio); -- closure_bio_submit(cache_bio, &s->cl); -+ closure_bio_submit(s->iop.c, cache_bio, &s->cl); - - return ret; - out_put: -@@ -880,7 +880,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - out_submit: - miss->bi_end_io = request_endio; - miss->bi_private = &s->cl; -- closure_bio_submit(miss, &s->cl); -+ closure_bio_submit(s->iop.c, 
miss, &s->cl); - return ret; - } - -@@ -945,7 +945,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - - if ((bio_op(bio) != REQ_OP_DISCARD) || - blk_queue_discard(bdev_get_queue(dc->bdev))) -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } else if (s->iop.writeback) { - bch_writeback_add(dc); - s->iop.bio = bio; -@@ -960,12 +960,12 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - flush->bi_private = cl; - flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - -- closure_bio_submit(flush, cl); -+ closure_bio_submit(s->iop.c, flush, cl); - } - } else { - s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); - -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } - - closure_call(&s->iop.cl, bch_data_insert, NULL, cl); -@@ -981,7 +981,7 @@ static void cached_dev_nodata(struct closure *cl) - bch_journal_meta(s->iop.c, cl); - - /* If it's a flush, we send the flush to the backing device too */ -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - - continue_at(cl, cached_dev_bio_complete, NULL); - } -@@ -996,6 +996,12 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - int rw = bio_data_dir(bio); - -+ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return BLK_QC_T_NONE; -+ } -+ - atomic_set(&dc->backing_idle, 0); - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - -@@ -1112,6 +1118,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, - struct bcache_device *d = bio->bi_disk->private_data; - int rw = bio_data_dir(bio); - -+ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return BLK_QC_T_NONE; -+ } -+ - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - - s 
= search_alloc(bio, d); -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index e15cacecf078..f8b0d1196c12 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -521,7 +521,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, - bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags); - bch_bio_map(bio, ca->disk_buckets); - -- closure_bio_submit(bio, &ca->prio); -+ closure_bio_submit(ca->set, bio, &ca->prio); - closure_sync(cl); - } - -@@ -1349,6 +1349,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) - test_bit(CACHE_SET_STOPPING, &c->flags)) - return false; - -+ if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) -+ pr_warn("CACHE_SET_IO_DISABLE already set"); -+ - /* XXX: we can be called from atomic context - acquire_console_sem(); - */ -@@ -1584,6 +1587,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) - c->congested_read_threshold_us = 2000; - c->congested_write_threshold_us = 20000; - c->error_limit = DEFAULT_IO_ERROR_LIMIT; -+ WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); - - return c; - err: -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index 399e91cbf714..cf973c07c856 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -95,6 +95,7 @@ read_attribute(partial_stripes_expensive); - - rw_attribute(synchronous); - rw_attribute(journal_delay_ms); -+rw_attribute(io_disable); - rw_attribute(discard); - rw_attribute(running); - rw_attribute(label); -@@ -588,6 +589,8 @@ SHOW(__bch_cache_set) - sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); - sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); - sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); -+ sysfs_printf(io_disable, "%i", -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)); - - if (attr == &sysfs_bset_tree_stats) - return bch_bset_print_stats(c, buf); -@@ -677,6 +680,22 @@ STORE(__bch_cache_set) - if (attr == 
&sysfs_io_error_halflife) - c->error_decay = strtoul_or_return(buf) / 88; - -+ if (attr == &sysfs_io_disable) { -+ int v = strtoul_or_return(buf); -+ -+ if (v) { -+ if (test_and_set_bit(CACHE_SET_IO_DISABLE, -+ &c->flags)) -+ pr_warn("CACHE_SET_IO_DISABLE" -+ " already set"); -+ } else { -+ if (!test_and_clear_bit(CACHE_SET_IO_DISABLE, -+ &c->flags)) -+ pr_warn("CACHE_SET_IO_DISABLE" -+ " already cleared"); -+ } -+ } -+ - sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); - sysfs_strtoul(verify, c->verify); - sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); -@@ -762,6 +781,7 @@ static struct attribute *bch_cache_set_internal_files[] = { - &sysfs_gc_always_rewrite, - &sysfs_btree_shrinker_disabled, - &sysfs_copy_gc_enabled, -+ &sysfs_io_disable, - NULL - }; - KTYPE(bch_cache_set_internal); -diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h -index a6763db7f061..268024529edd 100644 ---- a/drivers/md/bcache/util.h -+++ b/drivers/md/bcache/util.h -@@ -567,12 +567,6 @@ static inline sector_t bdev_sectors(struct block_device *bdev) - return bdev->bd_inode->i_size >> 9; - } - --#define closure_bio_submit(bio, cl) \ --do { \ -- closure_get(cl); \ -- generic_make_request(bio); \ --} while (0) -- - uint64_t bch_crc64_update(uint64_t, const void *, size_t); - uint64_t bch_crc64(const void *, size_t); - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 8f98ef1038d3..3d7d8452e0de 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -114,6 +114,7 @@ static void update_writeback_rate(struct work_struct *work) - struct cached_dev *dc = container_of(to_delayed_work(work), - struct cached_dev, - writeback_rate_update); -+ struct cache_set *c = dc->disk.c; - - /* - * should check BCACHE_DEV_RATE_DW_RUNNING before calling -@@ -123,7 +124,12 @@ static void update_writeback_rate(struct work_struct *work) - /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); - -- if 
(!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ /* -+ * CACHE_SET_IO_DISABLE might be set via sysfs interface, -+ * check it here too. -+ */ -+ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); - /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); -@@ -138,7 +144,12 @@ static void update_writeback_rate(struct work_struct *work) - - up_read(&dc->writeback_lock); - -- if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ /* -+ * CACHE_SET_IO_DISABLE might be set via sysfs interface, -+ * check it here too. -+ */ -+ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - } -@@ -278,7 +289,7 @@ static void write_dirty(struct closure *cl) - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; - -- closure_bio_submit(&io->bio, cl); -+ closure_bio_submit(io->dc->disk.c, &io->bio, cl); - } - - atomic_set(&dc->writeback_sequence_next, next_sequence); -@@ -304,7 +315,7 @@ static void read_dirty_submit(struct closure *cl) - { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); - -- closure_bio_submit(&io->bio, cl); -+ closure_bio_submit(io->dc->disk.c, &io->bio, cl); - - continue_at(cl, write_dirty, io->dc->writeback_write_wq); - } -@@ -330,7 +341,9 @@ static void read_dirty(struct cached_dev *dc) - - next = bch_keybuf_next(&dc->writeback_keys); - -- while (!kthread_should_stop() && next) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && -+ next) { - size = 0; - nk = 0; - -@@ -427,7 +440,9 @@ static void read_dirty(struct cached_dev *dc) - } - } - -- while (!kthread_should_stop() && delay) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && -+ delay) { - 
schedule_timeout_interruptible(delay); - delay = writeback_delay(dc, 0); - } -@@ -583,11 +598,13 @@ static bool refill_dirty(struct cached_dev *dc) - static int bch_writeback_thread(void *arg) - { - struct cached_dev *dc = arg; -+ struct cache_set *c = dc->disk.c; - bool searched_full_index; - - bch_ratelimit_reset(&dc->writeback_rate); - -- while (!kthread_should_stop()) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - down_write(&dc->writeback_lock); - set_current_state(TASK_INTERRUPTIBLE); - /* -@@ -601,7 +618,8 @@ static int bch_writeback_thread(void *arg) - (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { - up_write(&dc->writeback_lock); - -- if (kthread_should_stop()) { -+ if (kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - set_current_state(TASK_RUNNING); - break; - } -@@ -637,6 +655,7 @@ static int bch_writeback_thread(void *arg) - - while (delay && - !kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags) && - !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) - delay = schedule_timeout_interruptible(delay); - --- -2.16.1 - diff --git a/for-next/v5/v5-0006-bcache-add-stop_when_cache_set_failed-option-to-b.patch b/for-next/v5/v5-0006-bcache-add-stop_when_cache_set_failed-option-to-b.patch deleted file mode 100644 index 3952ba1..0000000 --- a/for-next/v5/v5-0006-bcache-add-stop_when_cache_set_failed-option-to-b.patch +++ /dev/null @@ -1,258 +0,0 @@ -From fc5aa1aa4157619dc56f794419405b64a31a1312 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 5 Feb 2018 23:44:28 +0800 -Subject: [PATCH v5 06/10] bcache: add stop_when_cache_set_failed option to - backing device - -When there are too many I/O errors on cache device, current bcache code -will retire the whole cache set, and detach all bcache devices. But the -detached bcache devices are not stopped, which is problematic when bcache -is in writeback mode. 
- -If the retired cache set has dirty data of backing devices, continuing -to write to the bcache device will write to the backing device directly. If the -LBA of write request has a dirty version cached on cache device, next time -when the cache device is re-registered and backing device re-attached to -it again, the stale dirty data on cache device will be written to backing -device, and overwrite the latest directly written data. This situation causes -silent data corruption. - -But we cannot simply stop all attached bcache devices when the cache set is -broken or disconnected. For example, consider using bcache to accelerate performance -of an email service. In such a workload, if cache device is broken but no -dirty data is lost, keeping the bcache device alive and permitting the email service -to continue accessing user data might be a better solution for the cache -device failure. - -Nix <nix@esperi.org.uk> points out the issue and provides the above example -to explain why it might be necessary to not stop bcache device for broken -cache device. Pavel Goran <via-bcache@pvgoran.name> provides a brilliant -suggestion to provide "always" and "auto" options to per-cached device -sysfs file stop_when_cache_set_failed. If cache set is retiring and the -backing device has no dirty data on cache, it should be safe to keep the -bcache device alive. In this case, if stop_when_cache_set_failed is set to -"auto", the device failure handling code will not stop this bcache device -and will permit applications to access the backing device with an unattached -bcache device. - -Changelog: -v3: fix typos pointed out by Nix. -v2: change option values of stop_when_cache_set_failed from 1/0 to - "auto"/"always". -v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1 - (always stop). 
- -Signed-off-by: Coly Li <colyli@suse.de> -Cc: Nix <nix@esperi.org.uk> -Cc: Pavel Goran <via-bcache@pvgoran.name> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Hannes Reinecke <hare@suse.com> ---- - drivers/md/bcache/bcache.h | 9 +++++ - drivers/md/bcache/super.c | 82 ++++++++++++++++++++++++++++++++++++++++------ - drivers/md/bcache/sysfs.c | 17 ++++++++++ - 3 files changed, 98 insertions(+), 10 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 7917b3820dd5..263164490833 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -287,6 +287,12 @@ struct io { - sector_t last; - }; - -+enum stop_on_failure { -+ BCH_CACHED_DEV_STOP_AUTO = 0, -+ BCH_CACHED_DEV_STOP_ALWAYS, -+ BCH_CACHED_DEV_STOP_MODE_MAX, -+}; -+ - struct cached_dev { - struct list_head list; - struct bcache_device disk; -@@ -379,6 +385,8 @@ struct cached_dev { - unsigned writeback_rate_i_term_inverse; - unsigned writeback_rate_p_term_inverse; - unsigned writeback_rate_minimum; -+ -+ enum stop_on_failure stop_when_cache_set_failed; - }; - - enum alloc_reserve { -@@ -924,6 +932,7 @@ void bch_write_bdev_super(struct cached_dev *, struct closure *); - - extern struct workqueue_struct *bcache_wq; - extern const char * const bch_cache_modes[]; -+extern const char * const bch_stop_on_failure_modes[]; - extern struct mutex bch_register_lock; - extern struct list_head bch_cache_sets; - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index f8b0d1196c12..e335433bdfb7 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -47,6 +47,14 @@ const char * const bch_cache_modes[] = { - NULL - }; - -+/* Default is -1; we skip past it for stop_when_cache_set_failed */ -+const char * const bch_stop_on_failure_modes[] = { -+ "default", -+ "auto", -+ "always", -+ NULL -+}; -+ - static struct kobject *bcache_kobj; - struct mutex bch_register_lock; - LIST_HEAD(bch_cache_sets); -@@ 
-1187,6 +1195,9 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) - max(dc->disk.disk->queue->backing_dev_info->ra_pages, - q->backing_dev_info->ra_pages); - -+ /* default to auto */ -+ dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO; -+ - bch_cached_dev_request_init(dc); - bch_cached_dev_writeback_init(dc); - return 0; -@@ -1463,25 +1474,76 @@ static void cache_set_flush(struct closure *cl) - closure_return(cl); - } - -+/* -+ * This function is only called when CACHE_SET_IO_DISABLE is set, which means -+ * cache set is unregistering due to too many I/O errors. In this condition, -+ * the bcache device might be stopped, it depends on stop_when_cache_set_failed -+ * value and whether the broken cache has dirty data: -+ * -+ * dc->stop_when_cache_set_failed dc->has_dirty stop bcache device -+ * BCH_CACHED_STOP_ATUO 0 NO -+ * BCH_CACHED_STOP_ATUO 1 YES -+ * BCH_CACHED_DEV_STOP_ALWAYS 0 YES -+ * BCH_CACHED_DEV_STOP_ALWAYS 1 YES -+ * -+ * The expected behavior is, if stop_when_cache_set_failed is configured to -+ * "auto" via sysfs interface, the bcache device will not be stopped if the -+ * backing device is clean on the broken cache device. 
-+ */ -+static void conditional_stop_bcache_device(struct cache_set *c, -+ struct bcache_device *d, -+ struct cached_dev *dc) -+{ -+ if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) { -+ pr_warn("stop_when_cache_set_failed of %s is \"always\", stop" -+ " it for failed cache set %pU.", -+ d->disk->disk_name, c->sb.set_uuid); -+ bcache_device_stop(d); -+ } else if (atomic_read(&dc->has_dirty)) { -+ /* -+ * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_ATUO -+ * and dc->has_dirty == 1 -+ */ -+ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and " -+ "cache is dirty, stop it to avoid potential data " -+ "corruption.", -+ d->disk->disk_name); -+ bcache_device_stop(d); -+ } else { -+ /* -+ * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_ATUO -+ * and dc->has_dirty == 0 -+ */ -+ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and " -+ "cache is clean, keep it alive.", -+ d->disk->disk_name); -+ } -+} -+ - static void __cache_set_unregister(struct closure *cl) - { - struct cache_set *c = container_of(cl, struct cache_set, caching); - struct cached_dev *dc; -+ struct bcache_device *d; - size_t i; - - mutex_lock(&bch_register_lock); - -- for (i = 0; i < c->devices_max_used; i++) -- if (c->devices[i]) { -- if (!UUID_FLASH_ONLY(&c->uuids[i]) && -- test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { -- dc = container_of(c->devices[i], -- struct cached_dev, disk); -- bch_cached_dev_detach(dc); -- } else { -- bcache_device_stop(c->devices[i]); -- } -+ for (i = 0; i < c->devices_max_used; i++) { -+ d = c->devices[i]; -+ if (!d) -+ continue; -+ -+ if (!UUID_FLASH_ONLY(&c->uuids[i]) && -+ test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { -+ dc = container_of(d, struct cached_dev, disk); -+ bch_cached_dev_detach(dc); -+ if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) -+ conditional_stop_bcache_device(c, d, dc); -+ } else { -+ bcache_device_stop(d); - } -+ } - - mutex_unlock(&bch_register_lock); - -diff --git a/drivers/md/bcache/sysfs.c 
b/drivers/md/bcache/sysfs.c -index cf973c07c856..91d859a54575 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -78,6 +78,7 @@ rw_attribute(congested_write_threshold_us); - rw_attribute(sequential_cutoff); - rw_attribute(data_csum); - rw_attribute(cache_mode); -+rw_attribute(stop_when_cache_set_failed); - rw_attribute(writeback_metadata); - rw_attribute(writeback_running); - rw_attribute(writeback_percent); -@@ -126,6 +127,12 @@ SHOW(__bch_cached_dev) - bch_cache_modes + 1, - BDEV_CACHE_MODE(&dc->sb)); - -+ if (attr == &sysfs_stop_when_cache_set_failed) -+ return bch_snprint_string_list(buf, PAGE_SIZE, -+ bch_stop_on_failure_modes + 1, -+ dc->stop_when_cache_set_failed); -+ -+ - sysfs_printf(data_csum, "%i", dc->disk.data_csum); - var_printf(verify, "%i"); - var_printf(bypass_torture_test, "%i"); -@@ -247,6 +254,15 @@ STORE(__cached_dev) - } - } - -+ if (attr == &sysfs_stop_when_cache_set_failed) { -+ v = bch_read_string_list(buf, bch_stop_on_failure_modes + 1); -+ -+ if (v < 0) -+ return v; -+ -+ dc->stop_when_cache_set_failed = v; -+ } -+ - if (attr == &sysfs_label) { - if (size > SB_LABEL_SIZE) - return -EINVAL; -@@ -323,6 +339,7 @@ static struct attribute *bch_cached_dev_files[] = { - &sysfs_data_csum, - #endif - &sysfs_cache_mode, -+ &sysfs_stop_when_cache_set_failed, - &sysfs_writeback_metadata, - &sysfs_writeback_running, - &sysfs_writeback_delay, --- -2.16.1 - diff --git a/for-next/v5/v5-0007-bcache-fix-inaccurate-io-state-for-detached-bcach.patch b/for-next/v5/v5-0007-bcache-fix-inaccurate-io-state-for-detached-bcach.patch deleted file mode 100644 index 79707be..0000000 --- a/for-next/v5/v5-0007-bcache-fix-inaccurate-io-state-for-detached-bcach.patch +++ /dev/null @@ -1,119 +0,0 @@ -From 64c41825d56a1a0a7f7b468606a08bf6a86c21ba Mon Sep 17 00:00:00 2001 -From: Tang Junhui <tang.junhui@zte.com.cn> -Date: Tue, 9 Jan 2018 10:27:11 +0800 -Subject: [PATCH v5 07/10] bcache: fix inaccurate io state for detached bcache - devices - -When 
we run IO on a detached device, and run iostat to show IO status, -normally it will show output like below (some fields omitted): -Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util -sdd ... 15.89 0.53 1.82 0.20 2.23 1.81 52.30 -bcache0 ... 15.89 115.42 0.00 0.00 0.00 2.40 69.60 -but after IO has stopped, there are still very big avgqu-sz and %util -values as below: -Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util -bcache0 ... 0 5326.32 0.00 0.00 0.00 0.00 100.10 - -The reason for this issue is that only generic_start_io_acct() is called -and generic_end_io_acct() is not called for a detached device in -cached_dev_make_request(). See the code: -//start generic_start_io_acct() -generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); -if (cached_dev_get(dc)) { - //will callback generic_end_io_acct() -} -else { - //will not call generic_end_io_acct() -} - -This patch calls generic_end_io_acct() at the end of IO for detached -devices, so we can show IO state correctly. 
- -(Modified to use GFP_NOIO in kzalloc() by Coly Li) - -Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> -Reviewed-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> ---- - drivers/md/bcache/request.c | 58 +++++++++++++++++++++++++++++++++++++++------ - 1 file changed, 51 insertions(+), 7 deletions(-) - -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 02296bda6384..e09c5ae745be 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -986,6 +986,55 @@ static void cached_dev_nodata(struct closure *cl) - continue_at(cl, cached_dev_bio_complete, NULL); - } - -+struct detached_dev_io_private { -+ struct bcache_device *d; -+ unsigned long start_time; -+ bio_end_io_t *bi_end_io; -+ void *bi_private; -+}; -+ -+static void detatched_dev_end_io(struct bio *bio) -+{ -+ struct detached_dev_io_private *ddip; -+ -+ ddip = bio->bi_private; -+ bio->bi_end_io = ddip->bi_end_io; -+ bio->bi_private = ddip->bi_private; -+ -+ generic_end_io_acct(ddip->d->disk->queue, -+ bio_data_dir(bio), -+ &ddip->d->disk->part0, ddip->start_time); -+ -+ kfree(ddip); -+ -+ bio->bi_end_io(bio); -+} -+ -+static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) -+{ -+ struct detached_dev_io_private *ddip; -+ struct cached_dev *dc = container_of(d, struct cached_dev, disk); -+ -+ /* -+ * no need to call closure_get(&dc->disk.cl), -+ * because upper layer had already opened bcache device, -+ * which would call closure_get(&dc->disk.cl) -+ */ -+ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); -+ ddip->d = d; -+ ddip->start_time = jiffies; -+ ddip->bi_end_io = bio->bi_end_io; -+ ddip->bi_private = bio->bi_private; -+ bio->bi_end_io = detatched_dev_end_io; -+ bio->bi_private = ddip; -+ -+ if ((bio_op(bio) == REQ_OP_DISCARD) && -+ !blk_queue_discard(bdev_get_queue(dc->bdev))) -+ bio->bi_end_io(bio); -+ else -+ generic_make_request(bio); -+} -+ - /* Cached devices - read & write stuff */ - - 
static blk_qc_t cached_dev_make_request(struct request_queue *q, -@@ -1028,13 +1077,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - else - cached_dev_read(dc, s); - } -- } else { -- if ((bio_op(bio) == REQ_OP_DISCARD) && -- !blk_queue_discard(bdev_get_queue(dc->bdev))) -- bio_endio(bio); -- else -- generic_make_request(bio); -- } -+ } else -+ detached_dev_do_request(d, bio); - - return BLK_QC_T_NONE; - } --- -2.16.1 - diff --git a/for-next/v5/v5-0008-bcache-add-backing_request_endio-for-bi_end_io-of.patch b/for-next/v5/v5-0008-bcache-add-backing_request_endio-for-bi_end_io-of.patch deleted file mode 100644 index 18f7651..0000000 --- a/for-next/v5/v5-0008-bcache-add-backing_request_endio-for-bi_end_io-of.patch +++ /dev/null @@ -1,255 +0,0 @@ -From 38cfbb08de26e4e16d9f87307f132f4c7572e7bf Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 10 Jan 2018 21:01:48 +0800 -Subject: [PATCH v5 08/10] bcache: add backing_request_endio() for bi_end_io of - attached backing device I/O - -In order to catch I/O error of backing device, a separate bi_end_io -call back is required. Then a per backing device counter can record I/O -errors number and retire the backing device if the counter reaches a -per backing device I/O error limit. - -This patch adds backing_request_endio() to bcache backing device I/O code -path, this is a preparation for further complicated backing device failure -handling. So far there is no real code logic change, I make this change a -separate patch to make sure it is stable and reliable for further work. - -Changelog: -v2: Fix code comments typo, remove a redundant bch_writeback_add() line - added in v4 patch set. -v1: indeed this is new added in this patch set. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> ---- - drivers/md/bcache/request.c | 93 +++++++++++++++++++++++++++++++++++-------- - drivers/md/bcache/super.c | 1 + - drivers/md/bcache/writeback.c | 1 + - 3 files changed, 79 insertions(+), 16 deletions(-) - -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index e09c5ae745be..9c6dda3b0068 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -139,6 +139,7 @@ static void bch_data_invalidate(struct closure *cl) - } - - op->insert_data_done = true; -+ /* get in bch_data_insert() */ - bio_put(bio); - out: - continue_at(cl, bch_data_insert_keys, op->wq); -@@ -630,6 +631,38 @@ static void request_endio(struct bio *bio) - closure_put(cl); - } - -+static void backing_request_endio(struct bio *bio) -+{ -+ struct closure *cl = bio->bi_private; -+ -+ if (bio->bi_status) { -+ struct search *s = container_of(cl, struct search, cl); -+ /* -+ * If a bio has REQ_PREFLUSH for writeback mode, it is -+ * speically assembled in cached_dev_write() for a non-zero -+ * write request which has REQ_PREFLUSH. we don't set -+ * s->iop.status by this failure, the status will be decided -+ * by result of bch_data_insert() operation. 
-+ */ -+ if (unlikely(s->iop.writeback && -+ bio->bi_opf & REQ_PREFLUSH)) { -+ char buf[BDEVNAME_SIZE]; -+ -+ bio_devname(bio, buf); -+ pr_err("Can't flush %s: returned bi_status %i", -+ buf, bio->bi_status); -+ } else { -+ /* set to orig_bio->bi_status in bio_complete() */ -+ s->iop.status = bio->bi_status; -+ } -+ s->recoverable = false; -+ /* should count I/O error for backing device here */ -+ } -+ -+ bio_put(bio); -+ closure_put(cl); -+} -+ - static void bio_complete(struct search *s) - { - if (s->orig_bio) { -@@ -644,13 +677,21 @@ static void bio_complete(struct search *s) - } - } - --static void do_bio_hook(struct search *s, struct bio *orig_bio) -+static void do_bio_hook(struct search *s, -+ struct bio *orig_bio, -+ bio_end_io_t *end_io_fn) - { - struct bio *bio = &s->bio.bio; - - bio_init(bio, NULL, 0); - __bio_clone_fast(bio, orig_bio); -- bio->bi_end_io = request_endio; -+ /* -+ * bi_end_io can be set separately somewhere else, e.g. the -+ * variants in, -+ * - cache_bio->bi_end_io from cached_dev_cache_miss() -+ * - n->bi_end_io from cache_lookup_fn() -+ */ -+ bio->bi_end_io = end_io_fn; - bio->bi_private = &s->cl; - - bio_cnt_set(bio, 3); -@@ -676,7 +717,7 @@ static inline struct search *search_alloc(struct bio *bio, - s = mempool_alloc(d->c->search, GFP_NOIO); - - closure_init(&s->cl, NULL); -- do_bio_hook(s, bio); -+ do_bio_hook(s, bio, request_endio); - - s->orig_bio = bio; - s->cache_miss = NULL; -@@ -743,10 +784,11 @@ static void cached_dev_read_error(struct closure *cl) - trace_bcache_read_retry(s->orig_bio); - - s->iop.status = 0; -- do_bio_hook(s, s->orig_bio); -+ do_bio_hook(s, s->orig_bio, backing_request_endio); - - /* XXX: invalidate cache */ - -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, bio, cl); - } - -@@ -859,7 +901,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - bio_copy_dev(cache_bio, miss); - cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; - -- cache_bio->bi_end_io = 
request_endio; -+ cache_bio->bi_end_io = backing_request_endio; - cache_bio->bi_private = &s->cl; - - bch_bio_map(cache_bio, NULL); -@@ -872,14 +914,16 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - s->cache_miss = miss; - s->iop.bio = cache_bio; - bio_get(cache_bio); -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, cache_bio, &s->cl); - - return ret; - out_put: - bio_put(cache_bio); - out_submit: -- miss->bi_end_io = request_endio; -+ miss->bi_end_io = backing_request_endio; - miss->bi_private = &s->cl; -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, miss, &s->cl); - return ret; - } -@@ -943,31 +987,46 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - s->iop.bio = s->orig_bio; - bio_get(s->iop.bio); - -- if ((bio_op(bio) != REQ_OP_DISCARD) || -- blk_queue_discard(bdev_get_queue(dc->bdev))) -- closure_bio_submit(s->iop.c, bio, cl); -+ if (bio_op(bio) == REQ_OP_DISCARD && -+ !blk_queue_discard(bdev_get_queue(dc->bdev))) -+ goto insert_data; -+ -+ /* I/O request sent to backing device */ -+ bio->bi_end_io = backing_request_endio; -+ closure_bio_submit(s->iop.c, bio, cl); -+ - } else if (s->iop.writeback) { - bch_writeback_add(dc); - s->iop.bio = bio; - - if (bio->bi_opf & REQ_PREFLUSH) { -- /* Also need to send a flush to the backing device */ -- struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, -- dc->disk.bio_split); -- -+ /* -+ * Also need to send a flush to the backing -+ * device. 
-+ */ -+ struct bio *flush; -+ -+ flush = bio_alloc_bioset(GFP_NOIO, 0, -+ dc->disk.bio_split); -+ if (!flush) { -+ s->iop.status = BLK_STS_RESOURCE; -+ goto insert_data; -+ } - bio_copy_dev(flush, bio); -- flush->bi_end_io = request_endio; -+ flush->bi_end_io = backing_request_endio; - flush->bi_private = cl; - flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; -- -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, flush, cl); - } - } else { - s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); -- -+ /* I/O request sent to backing device */ -+ bio->bi_end_io = backing_request_endio; - closure_bio_submit(s->iop.c, bio, cl); - } - -+insert_data: - closure_call(&s->iop.cl, bch_data_insert, NULL, cl); - continue_at(cl, cached_dev_write_complete, NULL); - } -@@ -981,6 +1040,7 @@ static void cached_dev_nodata(struct closure *cl) - bch_journal_meta(s->iop.c, cl); - - /* If it's a flush, we send the flush to the backing device too */ -+ bio->bi_end_io = backing_request_endio; - closure_bio_submit(s->iop.c, bio, cl); - - continue_at(cl, cached_dev_bio_complete, NULL); -@@ -1078,6 +1138,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - cached_dev_read(dc, s); - } - } else -+ /* I/O request sent to backing device */ - detached_dev_do_request(d, bio); - - return BLK_QC_T_NONE; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index e335433bdfb7..4f1a14b99415 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -273,6 +273,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) - bio->bi_private = dc; - - closure_get(cl); -+ /* I/O request sent to backing device */ - __write_super(&dc->sb, bio); - - closure_return_with_destructor(cl, bch_write_bdev_super_unlock); -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 3d7d8452e0de..4ebe0119ea7e 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -289,6 
+289,7 @@ static void write_dirty(struct closure *cl) - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; - -+ /* I/O request sent to backing device */ - closure_bio_submit(io->dc->disk.c, &io->bio, cl); - } - --- -2.16.1 - diff --git a/for-next/v5/v5-0009-bcache-add-io_disable-to-struct-cached_dev.patch b/for-next/v5/v5-0009-bcache-add-io_disable-to-struct-cached_dev.patch deleted file mode 100644 index 2ff139d..0000000 --- a/for-next/v5/v5-0009-bcache-add-io_disable-to-struct-cached_dev.patch +++ /dev/null @@ -1,237 +0,0 @@ -From 677f078827ce5ebde0a4aad6dfc0dc269433d622 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 5 Feb 2018 23:49:47 +0800 -Subject: [PATCH v5 09/10] bcache: add io_disable to struct cached_dev - -If a bcache device is configured to writeback mode, current code does not -handle write I/O errors on backing devices properly. - -In writeback mode, write request is written to cache device, and -later flushed to backing device. If I/O failed when writing from -cache device to the backing device, bcache code just ignores the error and -upper layer code is NOT notified that the backing device is broken. - -This patch tries to handle backing device failure like how the cache device -failure is handled, -- Add an error counter 'io_errors' and error limit 'error_limit' in struct - cached_dev. Add another io_disable to struct cached_dev to disable I/Os - on the problematic backing device. -- When I/O error happens on backing device, increase io_errors counter. And - if io_errors reaches error_limit, set cached_dev->io_disable to true, and - stop the bcache device. - -The result is, if backing device is broken or disconnected, and I/O errors -reach its error limit, backing device will be disabled and the associated -bcache device will be removed from system. - -Changelog: -v2: remove "bcache: " prefix in pr_err(), and use correct name string to - print out bcache device gendisk name. 
-v1: indeed this is new added in v2 patch set. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 6 ++++++ - drivers/md/bcache/io.c | 14 ++++++++++++++ - drivers/md/bcache/request.c | 14 ++++++++++++-- - drivers/md/bcache/super.c | 23 ++++++++++++++++++++++- - drivers/md/bcache/sysfs.c | 15 ++++++++++++++- - 5 files changed, 68 insertions(+), 4 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 263164490833..c59ce168bd82 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -366,6 +366,7 @@ struct cached_dev { - unsigned sequential_cutoff; - unsigned readahead; - -+ unsigned io_disable:1; - unsigned verify:1; - unsigned bypass_torture_test:1; - -@@ -387,6 +388,9 @@ struct cached_dev { - unsigned writeback_rate_minimum; - - enum stop_on_faliure stop_when_cache_set_failed; -+#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 -+ atomic_t io_errors; -+ unsigned error_limit; - }; - - enum alloc_reserve { -@@ -896,6 +900,7 @@ static inline void closure_bio_submit(struct cache_set *c, - - /* Forward declarations */ - -+void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); - void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); - void bch_bbio_count_io_errors(struct cache_set *, struct bio *, - blk_status_t, const char *); -@@ -923,6 +928,7 @@ int bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, bool); - bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, - unsigned, unsigned, bool); -+bool bch_cached_dev_error(struct cached_dev *dc); - - __printf(2, 3) - bool bch_cache_set_error(struct cache_set *, const char *, ...); -diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c -index 8013ecbcdbda..7fac97ae036e 100644 ---- a/drivers/md/bcache/io.c -+++ b/drivers/md/bcache/io.c -@@ -50,6 
+50,20 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, - } - - /* IO errors */ -+void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) -+{ -+ char buf[BDEVNAME_SIZE]; -+ unsigned errors; -+ -+ WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); -+ -+ errors = atomic_add_return(1, &dc->io_errors); -+ if (errors < dc->error_limit) -+ pr_err("%s: IO error on backing device, unrecoverable", -+ bio_devname(bio, buf)); -+ else -+ bch_cached_dev_error(dc); -+} - - void bch_count_io_errors(struct cache *ca, - blk_status_t error, -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 9c6dda3b0068..03245e6980a6 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -637,6 +637,8 @@ static void backing_request_endio(struct bio *bio) - - if (bio->bi_status) { - struct search *s = container_of(cl, struct search, cl); -+ struct cached_dev *dc = container_of(s->d, -+ struct cached_dev, disk); - /* - * If a bio has REQ_PREFLUSH for writeback mode, it is - * speically assembled in cached_dev_write() for a non-zero -@@ -657,6 +659,7 @@ static void backing_request_endio(struct bio *bio) - } - s->recoverable = false; - /* should count I/O error for backing device here */ -+ bch_count_backing_io_errors(dc, bio); - } - - bio_put(bio); -@@ -1065,8 +1068,14 @@ static void detatched_dev_end_io(struct bio *bio) - bio_data_dir(bio), - &ddip->d->disk->part0, ddip->start_time); - -- kfree(ddip); -+ if (bio->bi_status) { -+ struct cached_dev *dc = container_of(ddip->d, -+ struct cached_dev, disk); -+ /* should count I/O error for backing device here */ -+ bch_count_backing_io_errors(dc, bio); -+ } - -+ kfree(ddip); - bio->bi_end_io(bio); - } - -@@ -1105,7 +1114,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - int rw = bio_data_dir(bio); - -- if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ if 
(unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || -+ dc->io_disable)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - return BLK_QC_T_NONE; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 4f1a14b99415..40b07d980a20 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1196,7 +1196,10 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) - max(dc->disk.disk->queue->backing_dev_info->ra_pages, - q->backing_dev_info->ra_pages); - -- /* default to auto */ -+ atomic_set(&dc->io_errors, 0); -+ dc->io_disable = false; -+ dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; -+ /* default to "auto" */ - dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_ATUO; - - bch_cached_dev_request_init(dc); -@@ -1350,6 +1353,24 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) - return flash_dev_run(c, u); - } - -+bool bch_cached_dev_error(struct cached_dev *dc) -+{ -+ char name[BDEVNAME_SIZE]; -+ -+ if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) -+ return false; -+ -+ dc->io_disable = true; -+ /* make others know io_disable is true earlier */ -+ smp_mb(); -+ -+ pr_err("stop %s: too many IO errors on backing device %s\n", -+ dc->disk.disk->disk_name, bdevname(dc->bdev, name)); -+ -+ bcache_device_stop(&dc->disk); -+ return true; -+} -+ - /* Cache set */ - - __printf(2, 3) -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index 91d859a54575..e88fdcc549cd 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -141,7 +141,9 @@ SHOW(__bch_cached_dev) - var_print(writeback_delay); - var_print(writeback_percent); - sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); -- -+ sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); -+ sysfs_printf(io_error_limit, "%i", dc->error_limit); -+ sysfs_printf(io_disable, "%i", dc->io_disable); - var_print(writeback_rate_update_seconds); - var_print(writeback_rate_i_term_inverse); - 
var_print(writeback_rate_p_term_inverse); -@@ -232,6 +234,14 @@ STORE(__cached_dev) - d_strtoul(writeback_rate_i_term_inverse); - d_strtoul_nonzero(writeback_rate_p_term_inverse); - -+ sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); -+ -+ if (attr == &sysfs_io_disable) { -+ int v = strtoul_or_return(buf); -+ -+ dc->io_disable = v ? 1 : 0; -+ } -+ - d_strtoi_h(sequential_cutoff); - d_strtoi_h(readahead); - -@@ -349,6 +359,9 @@ static struct attribute *bch_cached_dev_files[] = { - &sysfs_writeback_rate_i_term_inverse, - &sysfs_writeback_rate_p_term_inverse, - &sysfs_writeback_rate_debug, -+ &sysfs_errors, -+ &sysfs_io_error_limit, -+ &sysfs_io_disable, - &sysfs_dirty_data, - &sysfs_stripe_size, - &sysfs_partial_stripes_expensive, --- -2.16.1 - diff --git a/for-next/v5/v5-0010-bcache-stop-bcache-device-when-backing-device-is-.patch b/for-next/v5/v5-0010-bcache-stop-bcache-device-when-backing-device-is-.patch deleted file mode 100644 index 5ebd851..0000000 --- a/for-next/v5/v5-0010-bcache-stop-bcache-device-when-backing-device-is-.patch +++ /dev/null @@ -1,152 +0,0 @@ -From 88e4b7378283d942fe281f1b246be4a427a88511 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 5 Feb 2018 23:52:40 +0800 -Subject: [PATCH v5 10/10] bcache: stop bcache device when backing device is - offline - -Currently bcache does not handle backing device failure, if backing -device is offline and disconnected from system, its bcache device can still -be accessible. If the bcache device is in writeback mode, I/O requests even -can success if the requests hit on cache device. That is to say, when and -how bcache handles offline backing device is undefined. - -This patch tries to handle backing device offline in a rather simple way, -- Add cached_dev->status_update_thread kernel thread to update backing - device status in every 1 second. -- Add cached_dev->offline_seconds to record how many seconds the backing - device is observed to be offline. 
If the backing device is offline for - BACKING_DEV_OFFLINE_TIMEOUT (30) seconds, set dc->io_disable to 1 and - call bcache_device_stop() to stop the bache device which linked to the - offline backing device. - -Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds, -its bcache device will be removed, then user space application writing on -it will get error immediately, and handler the device failure in time. - -This patch is quite simple, does not handle more complicated situations. -Once the bcache device is stopped, users need to recovery the backing -device, register and attach it manually. - -Changelog: -v2: remove "bcache: " prefix when calling pr_warn(). -v1: initial version. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 2 ++ - drivers/md/bcache/super.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++ - 2 files changed, 57 insertions(+) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index c59ce168bd82..aa83dd0f682f 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -344,6 +344,7 @@ struct cached_dev { - - struct keybuf writeback_keys; - -+ struct task_struct *status_update_thread; - /* - * Order the write-half of writeback operations strongly in dispatch - * order. 
(Maintain LBA order; don't allow reads completing out of -@@ -391,6 +392,7 @@ struct cached_dev { - #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 - atomic_t io_errors; - unsigned error_limit; -+ unsigned offline_seconds; - }; - - enum alloc_reserve { -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 40b07d980a20..6d672329efce 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -654,6 +654,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode, - unsigned int cmd, unsigned long arg) - { - struct bcache_device *d = b->bd_disk->private_data; -+ struct cached_dev *dc = container_of(d, struct cached_dev, disk); -+ -+ if (dc->io_disable) -+ return -EIO; -+ - return d->ioctl(d, mode, cmd, arg); - } - -@@ -864,6 +869,45 @@ static void calc_cached_dev_sectors(struct cache_set *c) - c->cached_dev_sectors = sectors; - } - -+#define BACKING_DEV_OFFLINE_TIMEOUT 5 -+static int cached_dev_status_update(void *arg) -+{ -+ struct cached_dev *dc = arg; -+ struct request_queue *q; -+ char buf[BDEVNAME_SIZE]; -+ -+ /* -+ * If this delayed worker is stopping outside, directly quit here. -+ * dc->io_disable might be set via sysfs interface, so check it -+ * here too. 
-+ */ -+ while (!kthread_should_stop() && !dc->io_disable) { -+ q = bdev_get_queue(dc->bdev); -+ if (blk_queue_dying(q)) -+ dc->offline_seconds++; -+ else -+ dc->offline_seconds = 0; -+ -+ if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) { -+ pr_err("%s: device offline for %d seconds", -+ bdevname(dc->bdev, buf), -+ BACKING_DEV_OFFLINE_TIMEOUT); -+ pr_err("%s: disable I/O request due to backing " -+ "device offline", dc->disk.name); -+ dc->io_disable = true; -+ /* let others know earlier that io_disable is true */ -+ smp_mb(); -+ bcache_device_stop(&dc->disk); -+ break; -+ } -+ -+ schedule_timeout_interruptible(HZ); -+ } -+ -+ dc->status_update_thread = NULL; -+ return 0; -+} -+ - void bch_cached_dev_run(struct cached_dev *dc) - { - struct bcache_device *d = &dc->disk; -@@ -906,6 +950,15 @@ void bch_cached_dev_run(struct cached_dev *dc) - if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || - sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) - pr_debug("error creating sysfs link"); -+ -+ dc->status_update_thread = kthread_run(cached_dev_status_update, -+ dc, -+ "bcache_status_update"); -+ if (IS_ERR(dc->status_update_thread)) { -+ pr_warn("failed to create bcache_status_update kthread, " -+ "continue to run without monitoring backing " -+ "device status"); -+ } - } - - /* -@@ -1126,6 +1179,8 @@ static void cached_dev_free(struct closure *cl) - kthread_stop(dc->writeback_thread); - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); -+ if (!IS_ERR_OR_NULL(dc->status_update_thread)) -+ kthread_stop(dc->status_update_thread); - - if (atomic_read(&dc->running)) - bd_unlink_disk_holder(dc->bdev, dc->disk.disk); --- -2.16.1 - diff --git a/for-next/v6/v6-0000-cover-letter.patch b/for-next/v6/v6-0000-cover-letter.patch deleted file mode 100644 index a78da69..0000000 --- a/for-next/v6/v6-0000-cover-letter.patch +++ /dev/null @@ -1,82 +0,0 @@ -From 15a23aafbcfd8e92fc7e1740c8e53d9c64c9fde1 Mon Sep 17 00:00:00 
2001 -From: Coly Li <colyli@suse.de> -Date: Thu, 8 Feb 2018 23:13:49 +0800 -Subject: [PATCH v6 0/9] bcache: device failure handling improvement - -Hi maintainers and folks, - -This patch set tries to improve bcache device failure handling, includes -cache device and backing device failures. - -The basic idea to handle failed cache device is, -- Unregister cache set -- Detach all backing devices which are attached to this cache set -- Stop all the detached bcache devices (configurable) -- Stop all flash only volume on the cache set -The above process is named 'cache set retire' by me. The result of cache -set retire is, cache set and bcache devices are all removed, following -I/O requests will get failed immediately to notift upper layer or user -space coce that the cache device is failed or disconnected. -- Stop all the detached bcache devices (configurable) -- Stop all flash only volume on the cache set -The above process is named 'cache set retire' by me. The result of cache -set retire is, cache set and bcache devices are all removed -(configurable), following I/O requests will get failed immediately to -notify upper layer or user space coce that the cache device is failed or -disconnected. - -one patch from v5 patch set is merged into bcache-for-next, which is not -in v6 patch set any longer. The changes of v6 patch set are only for typo -fix, which were pointed out by Nix, Michael and other developers. - -So far all patches have peer review, thank you all, bcache developers! - -Changelog: -v6: fix typo and mistaken spelling. -v5: replace patch "bcache: stop all attached bcache devices for a retired - cache set" from v4 patch set by "bcache: add stop_when_cache_set_failed - option to backing device" from v5 patch set. - fix issues from v4 patch set. - improve kernel message format, remove redundant prefix string. -v4: add per-cached_dev option stop_attached_devs_on_fail to avoid stopping - attached bcache device from a retiring cache set. 
-v3: fix detach issue find in v2 patch set. -v2: fixes all problems found in v1 review. - add patches to handle backing device failure. - add one more patch to set writeback_rate_update_seconds range. - include a patch from Junhui Tang. -v1: the initial version, only handles cache device failure. - -Coly Li ---- - -Coly Li (8): - bcache: fix cached_dev->count usage for bch_cache_set_error() - bcache: quit dc->writeback_thread when BCACHE_DEV_DETACHING is set - bcache: stop dc->writeback_rate_update properly - bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags - bcache: add stop_when_cache_set_failed option to backing device - bcache: add backing_request_endio() for bi_end_io of attached backing - device I/O - bcache: add io_disable to struct cached_dev - bcache: stop bcache device when backing device is offline - -Tang Junhui (1): - bcache: fix inaccurate io state for detached bcache devices - - drivers/md/bcache/alloc.c | 3 +- - drivers/md/bcache/bcache.h | 44 ++++++++- - drivers/md/bcache/btree.c | 10 ++- - drivers/md/bcache/io.c | 16 +++- - drivers/md/bcache/journal.c | 4 +- - drivers/md/bcache/request.c | 185 ++++++++++++++++++++++++++++++++------ - drivers/md/bcache/super.c | 205 ++++++++++++++++++++++++++++++++++++++---- - drivers/md/bcache/sysfs.c | 55 +++++++++++- - drivers/md/bcache/util.h | 6 -- - drivers/md/bcache/writeback.c | 92 ++++++++++++++++--- - drivers/md/bcache/writeback.h | 2 - - 11 files changed, 543 insertions(+), 79 deletions(-) - --- -2.16.1 - diff --git a/for-next/v6/v6-0001-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch b/for-next/v6/v6-0001-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch deleted file mode 100644 index c12cf09..0000000 --- a/for-next/v6/v6-0001-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch +++ /dev/null @@ -1,178 +0,0 @@ -From af3ba20f8ddf139828f6b26e0dfeeea71aa5b6c9 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 8 Jan 2018 23:05:58 +0800 -Subject: [PATCH v6 1/9] 
bcache: fix cached_dev->count usage for - bch_cache_set_error() - -When bcache metadata I/O fails, bcache will call bch_cache_set_error() -to retire the whole cache set. The expected behavior to retire a cache -set is to unregister the cache set, and unregister all backing device -attached to this cache set, then remove sysfs entries of the cache set -and all attached backing devices, finally release memory of structs -cache_set, cache, cached_dev and bcache_device. - -In my testing when journal I/O failure triggered by disconnected cache -device, sometimes the cache set cannot be retired, and its sysfs -entry /sys/fs/bcache/<uuid> still exits and the backing device also -references it. This is not expected behavior. - -When metadata I/O failes, the call senquence to retire whole cache set is, - bch_cache_set_error() - bch_cache_set_unregister() - bch_cache_set_stop() - __cache_set_unregister() <- called as callback by calling - clousre_queue(&c->caching) - cache_set_flush() <- called as a callback when refcount - of cache_set->caching is 0 - cache_set_free() <- called as a callback when refcount - of catch_set->cl is 0 - bch_cache_set_release() <- called as a callback when refcount - of catch_set->kobj is 0 - -I find if kernel thread bch_writeback_thread() quits while-loop when -kthread_should_stop() is true and searched_full_index is false, clousre -callback cache_set_flush() set by continue_at() will never be called. The -result is, bcache fails to retire whole cache set. - -cache_set_flush() will be called when refcount of closure c->caching is 0, -and in function bcache_device_detach() refcount of closure c->caching is -released to 0 by clousre_put(). In metadata error code path, function -bcache_device_detach() is called by cached_dev_detach_finish(). This is a -callback routine being called when cached_dev->count is 0. This refcount -is decreased by cached_dev_put(). 
- -The above dependence indicates, cache_set_flush() will be called when -refcount of cache_set->cl is 0, and refcount of cache_set->cl to be 0 -when refcount of cached_dev->count is 0. - -The reason why sometimes cached_dev->count is not 0 (when metadata I/O fails -and bch_cache_set_error() called) is, in bch_writeback_thread(), refcount -of cached_dev is not decreased properly. - -In bch_writeback_thread(), cached_dev_put() is called only when -searched_full_index is true and cached_dev->writeback_keys is empty, a.k.a -there is no dirty data on cache. In most of run time it is correct, but -when bch_writeback_thread() quits the while-loop while cache is still -dirty, current code forgets to call cached_dev_put() before this kernel -thread exits. This is why sometimes cache_set_flush() is not executed and -cache set fails to be retired. - -The reason to call cached_dev_put() in bch_writeback_thread() is, when the -cache device changes from clean to dirty, cached_dev_get() is called, to -make sure during writeback operations both backing and cache devices -won't be released. - -Adding following code in bch_writeback_thread() does not work, - static int bch_writeback_thread(void *arg) - } - -+ if (atomic_read(&dc->has_dirty)) -+ cached_dev_put() -+ - return 0; - } -because writeback kernel thread can be woken up and start via sysfs entry: - echo 1 > /sys/block/bcache<N>/bcache/writeback_running -It is difficult to check whether backing device is dirty without race and -extra lock. So the above modification will introduce potential refcount -underflow in some conditions. - -The correct fix is, to take cached dev refcount when creating the kernel -thread, and put it before the kernel thread exits. Then bcache does not -need to take a cached dev refcount when cache turns from clean to dirty, -or to put a cached dev refcount when cache turns from dirty to clean.
The -writeback kernel thread is alwasy safe to reference data structure from -cache set, cache and cached device (because a refcount of cache device is -taken for it already), and no matter the kernel thread is stopped by I/O -errors or system reboot, cached_dev->count can always be used correctly. - -The patch is simple, but understanding how it works is quite complicated. - -Changelog: -v2: set dc->writeback_thread to NULL in this patch, as suggested by Hannes. -v1: initial version for review. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/super.c | 1 - - drivers/md/bcache/writeback.c | 11 ++++++++--- - drivers/md/bcache/writeback.h | 2 -- - 3 files changed, 8 insertions(+), 6 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 312895788036..9b745c5c1980 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1054,7 +1054,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, - if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - bch_sectors_dirty_init(&dc->disk); - atomic_set(&dc->has_dirty, 1); -- refcount_inc(&dc->count); - bch_writeback_queue(dc); - } - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index f1d2fc15abcc..b280c134dd4d 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -572,7 +572,7 @@ static int bch_writeback_thread(void *arg) - - if (kthread_should_stop()) { - set_current_state(TASK_RUNNING); -- return 0; -+ break; - } - - schedule(); -@@ -585,7 +585,6 @@ static int bch_writeback_thread(void *arg) - if (searched_full_index && - RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { - atomic_set(&dc->has_dirty, 0); -- cached_dev_put(dc); - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); - } -@@ -606,6 +605,9 @@ static int bch_writeback_thread(void 
*arg) - } - } - -+ dc->writeback_thread = NULL; -+ cached_dev_put(dc); -+ - return 0; - } - -@@ -669,10 +671,13 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) - if (!dc->writeback_write_wq) - return -ENOMEM; - -+ cached_dev_get(dc); - dc->writeback_thread = kthread_create(bch_writeback_thread, dc, - "bcache_writeback"); -- if (IS_ERR(dc->writeback_thread)) -+ if (IS_ERR(dc->writeback_thread)) { -+ cached_dev_put(dc); - return PTR_ERR(dc->writeback_thread); -+ } - - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); -diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h -index 587b25599856..0bba8f1c6cdf 100644 ---- a/drivers/md/bcache/writeback.h -+++ b/drivers/md/bcache/writeback.h -@@ -105,8 +105,6 @@ static inline void bch_writeback_add(struct cached_dev *dc) - { - if (!atomic_read(&dc->has_dirty) && - !atomic_xchg(&dc->has_dirty, 1)) { -- refcount_inc(&dc->count); -- - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); - /* XXX: should do this synchronously */ --- -2.16.1 - diff --git a/for-next/v6/v6-0002-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch b/for-next/v6/v6-0002-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch deleted file mode 100644 index 4dd2c66..0000000 --- a/for-next/v6/v6-0002-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch +++ /dev/null @@ -1,130 +0,0 @@ -From dd599ee72b29e153026448f1ff0b7147027a892a Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 14 Jan 2018 21:41:57 +0800 -Subject: [PATCH v6 2/9] bcache: quit dc->writeback_thread when - BCACHE_DEV_DETACHING is set - -In patch "bcache: fix cached_dev->count usage for bch_cache_set_error()", -cached_dev_get() is called when creating dc->writeback_thread, and -cached_dev_put() is called when exiting dc->writeback_thread. 
This -modification works well unless people detach the bcache device manually by - 'echo 1 > /sys/block/bcache<N>/bcache/detach' -Because this sysfs interface only calls bch_cached_dev_detach() which wakes -up dc->writeback_thread but does not stop it. The reason is, before patch -"bcache: fix cached_dev->count usage for bch_cache_set_error()", inside -bch_writeback_thread(), if cache is not dirty after writeback, -cached_dev_put() will be called here. And in cached_dev_make_request() when -a new write request makes cache from clean to dirty, cached_dev_get() will -be called there. Since we don't operate dc->count in these locations, -refcount dc->count cannot be dropped after cache becomes clean, and -cached_dev_detach_finish() won't be called to detach bcache device. - -This patch fixes the issue by checking whether BCACHE_DEV_DETACHING is -set inside bch_writeback_thread(). If this bit is set and cache is clean -(no existing writeback_keys), break the while-loop, call cached_dev_put() -and quit the writeback thread. - -Please note if cache is still dirty, even BCACHE_DEV_DETACHING is set the -writeback thread should continue to perform writeback, this is the original -design of manually detach. - -It is safe to do the following check without locking, let me explain why, -+ if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && -+ (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { - -If the kernel thread does not sleep and continues to run because conditions -are not updated in time on the running CPU core, it just consumes more CPU -cycles and does no harm. This should-sleep-but-run is safe here. We just -focus on the should-run-but-sleep condition, which means the writeback -thread goes to sleep by mistake while it should continue to run. -1, First of all, no matter the writeback thread is hung or not, kthread_stop() from - cached_dev_detach_finish() will wake it up and terminate by making - kthread_should_stop() return true.
And in normal run time, bit on index - BCACHE_DEV_DETACHING is always cleared, the condition - !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) - is always true and can be ignored as constant value. -2, If one of the following conditions is true, the writeback thread should - go to sleep, - "!atomic_read(&dc->has_dirty)" or "!dc->writeback_running" - each of them independently controls the writeback thread should sleep or - not, let's analyse them one by one. -2.1 condition "!atomic_read(&dc->has_dirty)" - If dc->has_dirty is set from 0 to 1 on another CPU core, bcache will - call bch_writeback_queue() immediately or call bch_writeback_add() which - indirectly calls bch_writeback_queue() too. In bch_writeback_queue(), - wake_up_process(dc->writeback_thread) is called. It sets writeback - thread's task state to TASK_RUNNING and following an implicit memory - barrier, then tries to wake up the writeback thread. - In writeback thread, its task state is set to TASK_INTERRUPTIBLE before - doing the condition check. If other CPU core sets the TASK_RUNNING state - after writeback thread setting TASK_INTERRUPTIBLE, the writeback thread - will be scheduled to run very soon because its state is not - TASK_INTERRUPTIBLE. If other CPU core sets the TASK_RUNNING state before - writeback thread setting TASK_INTERRUPTIBLE, the implicit memory barrier - of wake_up_process() will make sure modification of dc->has_dirty on - other CPU core is updated and observed on the CPU core of writeback - thread. Therefore the condition check will correctly be false, and - continue writeback code without sleeping. -2.2 condition "!dc->writeback_running" - dc->writeback_running can be changed via sysfs file, every time it is - modified, a following bch_writeback_queue() is always called. So the - change is always observed on the CPU core of writeback thread.
If - dc->writeback_running is changed from 0 to 1 on other CPU core, this - condition check will observe the modification and allow writeback - thread to continue to run without sleeping. -Now we can see, even without a locking protection, multiple conditions -check is safe here, no deadlock or process hang up will happen. - -I compose a separte patch because that patch "bcache: fix cached_dev->count -usage for bch_cache_set_error()" already gets a "Reviewed-by:" from Hannes -Reinecke. Also this fix is not trivial and good for a separate patch. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.com> -Cc: Huijun Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/writeback.c | 20 +++++++++++++++++--- - 1 file changed, 17 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index b280c134dd4d..4dbeaaa575bf 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -565,9 +565,15 @@ static int bch_writeback_thread(void *arg) - while (!kthread_should_stop()) { - down_write(&dc->writeback_lock); - set_current_state(TASK_INTERRUPTIBLE); -- if (!atomic_read(&dc->has_dirty) || -- (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && -- !dc->writeback_running)) { -+ /* -+ * If the bache device is detaching, skip here and continue -+ * to perform writeback. Otherwise, if no dirty data on cache, -+ * or there is dirty data on cache but writeback is disabled, -+ * the writeback thread should sleep here and wait for others -+ * to wake up it. 
-+ */ -+ if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && -+ (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { - up_write(&dc->writeback_lock); - - if (kthread_should_stop()) { -@@ -587,6 +593,14 @@ static int bch_writeback_thread(void *arg) - atomic_set(&dc->has_dirty, 0); - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); -+ /* -+ * If bcache device is detaching via sysfs interface, -+ * writeback thread should stop after there is no dirty -+ * data on cache. BCACHE_DEV_DETACHING flag is set in -+ * bch_cached_dev_detach(). -+ */ -+ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) -+ break; - } - - up_write(&dc->writeback_lock); --- -2.16.1 - diff --git a/for-next/v6/v6-0003-bcache-stop-dc-writeback_rate_update-properly.patch b/for-next/v6/v6-0003-bcache-stop-dc-writeback_rate_update-properly.patch deleted file mode 100644 index c3035a0..0000000 --- a/for-next/v6/v6-0003-bcache-stop-dc-writeback_rate_update-properly.patch +++ /dev/null @@ -1,268 +0,0 @@ -From 437a0616a6c5fc0da56798a9e60375e3a2b22683 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 13 Jan 2018 15:48:39 +0800 -Subject: [PATCH v6 3/9] bcache: stop dc->writeback_rate_update properly - -struct delayed_work writeback_rate_update in struct cache_dev is a delayed -worker to call function update_writeback_rate() in period (the interval is -defined by dc->writeback_rate_update_seconds). - -When a metadate I/O error happens on cache device, bcache error handling -routine bch_cache_set_error() will call bch_cache_set_unregister() to -retire whole cache set. On the unregister code path, this delayed work is -stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update). - -dc->writeback_rate_update is a special delayed work from others in bcache. -In its routine update_writeback_rate(), this delayed work is re-armed -itself. 
That means when cancel_delayed_work_sync() returns, this delayed -work can still be executed after several seconds defined by -dc->writeback_rate_update_seconds. - -The problem is, after cancel_delayed_work_sync() returns, the cache set -unregister code path will continue and release memory of struct cache_set. -Then the delayed work is scheduled to run, __update_writeback_rate() -will reference the already released cache_set memory, and trigger a NULL -pointer dereference fault. - -This patch introduces two more bcache device flags, -- BCACHE_DEV_WB_RUNNING - bit set: bcache device is in writeback mode and running, it is OK for - dc->writeback_rate_update to re-arm itself. - bit clear: bcache device is trying to stop dc->writeback_rate_update, - this delayed work should not re-arm itself and quit. -- BCACHE_DEV_RATE_DW_RUNNING - bit set: routine update_writeback_rate() is executing. - bit clear: routine update_writeback_rate() quits. - -This patch also adds a function cancel_writeback_rate_update_dwork() to -wait for dc->writeback_rate_update quits before cancel it by calling -cancel_delayed_work_sync(). In order to avoid a deadlock by unexpected -quit dc->writeback_rate_update, after time_out seconds this function will -give up and continue to call cancel_delayed_work_sync(). - -And here I explain how this patch stops self re-armed delayed work properly -with the above. - -update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning -and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling -cancel_writeback_rate_update_dwork() clear flag BCACHE_DEV_WB_RUNNING. - -Before calling cancel_delayed_work_sync() wait until flag -BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling -cancel_delayed_work_sync(), dc->writeback_rate_update must be already re- -armed, or quit by seeing BCACHE_DEV_WB_RUNNING cleared. In both cases -delayed work routine update_writeback_rate() won't be executed after -cancel_delayed_work_sync() returns.
- -Inside update_writeback_rate() before calling schedule_delayed_work(), flag -BCACHE_DEV_WB_RUNNING is checked before. If this flag is cleared, it means -someone is about to stop the delayed work. Because flag -BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync() -has to wait for this flag to be cleared, we don't need to worry about race -condition here. - -If update_writeback_rate() is scheduled to run after checking -BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync() -in cancel_writeback_rate_update_dwork(), it is also safe. Because at this -moment BCACHE_DEV_WB_RUNNING is cleared with memory barrier. As I mentioned -previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear -and quit immediately. - -Because there are more dependences inside update_writeback_rate() to struct -cache_set memory, dc->writeback_rate_update is not a simple self re-arm -delayed work. After trying many different methods (e.g. hold dc->count, or -use locks), this is the only way I can find which works to properly stop -dc->writeback_rate_update delayed work. - -Changelog: -v3: change values of BCACHE_DEV_WB_RUNNING and BCACHE_DEV_RATE_DW_RUNNING - to bit index, for test_bit(). -v2: Try to fix the race issue which is pointed out by Junhui. 
-v1: The initial version for review - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Hannes Reinecke <hare@suse.com> ---- - drivers/md/bcache/bcache.h | 9 +++++---- - drivers/md/bcache/super.c | 39 +++++++++++++++++++++++++++++++++++---- - drivers/md/bcache/sysfs.c | 3 ++- - drivers/md/bcache/writeback.c | 29 ++++++++++++++++++++++++++++- - 4 files changed, 70 insertions(+), 10 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 12e5197f186c..b5ddb848cd31 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -258,10 +258,11 @@ struct bcache_device { - struct gendisk *disk; - - unsigned long flags; --#define BCACHE_DEV_CLOSING 0 --#define BCACHE_DEV_DETACHING 1 --#define BCACHE_DEV_UNLINK_DONE 2 -- -+#define BCACHE_DEV_CLOSING 0 -+#define BCACHE_DEV_DETACHING 1 -+#define BCACHE_DEV_UNLINK_DONE 2 -+#define BCACHE_DEV_WB_RUNNING 3 -+#define BCACHE_DEV_RATE_DW_RUNNING 4 - unsigned nr_stripes; - unsigned stripe_size; - atomic_t *stripe_sectors_dirty; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 9b745c5c1980..531cd967c05f 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -899,6 +899,32 @@ void bch_cached_dev_run(struct cached_dev *dc) - pr_debug("error creating sysfs link"); - } - -+/* -+ * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed -+ * work dc->writeback_rate_update is running. Wait until the routine -+ * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to -+ * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out -+ * seconds, give up waiting here and continue to cancel it too. 
-+ */ -+static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) -+{ -+ int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ; -+ -+ do { -+ if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING, -+ &dc->disk.flags)) -+ break; -+ time_out--; -+ schedule_timeout_interruptible(1); -+ } while (time_out > 0); -+ -+ if (time_out == 0) -+ pr_warn("give up waiting for dc->writeback_write_update" -+ " to quit"); -+ -+ cancel_delayed_work_sync(&dc->writeback_rate_update); -+} -+ - static void cached_dev_detach_finish(struct work_struct *w) - { - struct cached_dev *dc = container_of(w, struct cached_dev, detach); -@@ -911,7 +937,9 @@ static void cached_dev_detach_finish(struct work_struct *w) - - mutex_lock(&bch_register_lock); - -- cancel_delayed_work_sync(&dc->writeback_rate_update); -+ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ cancel_writeback_rate_update_dwork(dc); -+ - if (!IS_ERR_OR_NULL(dc->writeback_thread)) { - kthread_stop(dc->writeback_thread); - dc->writeback_thread = NULL; -@@ -954,6 +982,7 @@ void bch_cached_dev_detach(struct cached_dev *dc) - closure_get(&dc->disk.cl); - - bch_writeback_queue(dc); -+ - cached_dev_put(dc); - } - -@@ -1081,14 +1110,16 @@ static void cached_dev_free(struct closure *cl) - { - struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); - -- cancel_delayed_work_sync(&dc->writeback_rate_update); -+ mutex_lock(&bch_register_lock); -+ -+ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ cancel_writeback_rate_update_dwork(dc); -+ - if (!IS_ERR_OR_NULL(dc->writeback_thread)) - kthread_stop(dc->writeback_thread); - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); - -- mutex_lock(&bch_register_lock); -- - if (atomic_read(&dc->running)) - bd_unlink_disk_holder(dc->bdev, dc->disk.disk); - bcache_device_free(&dc->disk); -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index 78cd7bd50fdd..55673508628f 100644 ---- a/drivers/md/bcache/sysfs.c -+++ 
b/drivers/md/bcache/sysfs.c -@@ -309,7 +309,8 @@ STORE(bch_cached_dev) - bch_writeback_queue(dc); - - if (attr == &sysfs_writeback_percent) -- schedule_delayed_work(&dc->writeback_rate_update, -+ if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -+ schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - - mutex_unlock(&bch_register_lock); -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 4dbeaaa575bf..8f98ef1038d3 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -115,6 +115,21 @@ static void update_writeback_rate(struct work_struct *work) - struct cached_dev, - writeback_rate_update); - -+ /* -+ * should check BCACHE_DEV_RATE_DW_RUNNING before calling -+ * cancel_delayed_work_sync(). -+ */ -+ set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); -+ -+ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); -+ return; -+ } -+ - down_read(&dc->writeback_lock); - - if (atomic_read(&dc->has_dirty) && -@@ -123,8 +138,18 @@ static void update_writeback_rate(struct work_struct *work) - - up_read(&dc->writeback_lock); - -- schedule_delayed_work(&dc->writeback_rate_update, -+ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); -+ } -+ -+ /* -+ * should check BCACHE_DEV_RATE_DW_RUNNING before calling -+ * cancel_delayed_work_sync(). 
-+ */ -+ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); -+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ -+ smp_mb(); - } - - static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) -@@ -675,6 +700,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) - dc->writeback_rate_p_term_inverse = 40; - dc->writeback_rate_i_term_inverse = 10000; - -+ WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); - INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); - } - -@@ -693,6 +719,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) - return PTR_ERR(dc->writeback_thread); - } - -+ WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - --- -2.16.1 - diff --git a/for-next/v6/v6-0004-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch b/for-next/v6/v6-0004-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch deleted file mode 100644 index 9321e09..0000000 --- a/for-next/v6/v6-0004-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch +++ /dev/null @@ -1,491 +0,0 @@ -From 40bdef8329171e783ee21bc04670cea7fca8aa8f Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 14 Jan 2018 22:15:00 +0800 -Subject: [PATCH v6 4/9] bcache: add CACHE_SET_IO_DISABLE to struct cache_set - flags - -When too many I/Os failed on cache device, bch_cache_set_error() is called -in the error handling code path to retire whole problematic cache set. If -new I/O requests continue to come and take refcount dc->count, the cache -set won't be retired immediately, this is a problem. - -Further more, there are several kernel thread and self-armed kernel work -may still running after bch_cache_set_error() is called. It needs to wait -quite a while for them to stop, or they won't stop at all. They also -prevent the cache set from being retired. 
- -The solution in this patch is, to add a per cache set flag to disable I/O -request on this cache and all attached backing devices. Then new incoming I/O -requests can be rejected in *_make_request() before taking refcount, kernel -threads and self-armed kernel worker can stop very fast when flags bit -CACHE_SET_IO_DISABLE is set. - -Because bcache also does internal I/Os for writeback, garbage collection, -bucket allocation, journaling, this kind of I/O should be disabled after -bch_cache_set_error() is called. So closure_bio_submit() is modified to -check whether CACHE_SET_IO_DISABLE is set on cache_set->flags. If set, -closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and -return, generic_make_request() won't be called. - -A sysfs interface is also added to set or clear CACHE_SET_IO_DISABLE bit -from cache_set->flags, to disable or enable cache set I/O for debugging. It -is helpful to trigger more corner case issues for failed cache device. - -Changelog -v3, change CACHE_SET_IO_DISABLE from 4 to 3, since it is bit index. - remove "bcache: " prefix when printing out kernel message. -v2, more changes by previous review, -- Use CACHE_SET_IO_DISABLE of cache_set->flags, suggested by Junhui. -- Check CACHE_SET_IO_DISABLE in bch_btree_gc() to stop a while-loop, this - is reported and inspired from original patch of Pavel Vazharov. -v1, initial version. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Pavel Vazharov <freakpv@gmail.com> ---- - drivers/md/bcache/alloc.c | 3 ++- - drivers/md/bcache/bcache.h | 18 ++++++++++++++++++ - drivers/md/bcache/btree.c | 10 +++++++--- - drivers/md/bcache/io.c | 2 +- - drivers/md/bcache/journal.c | 4 ++-- - drivers/md/bcache/request.c | 26 +++++++++++++++++++------- - drivers/md/bcache/super.c | 6 +++++- - drivers/md/bcache/sysfs.c | 20 ++++++++++++++++++++ - drivers/md/bcache/util.h | 6 ------ - drivers/md/bcache/writeback.c | 35 +++++++++++++++++++++++++++-------- - 10 files changed, 101 insertions(+), 29 deletions(-) - -diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c -index 458e1d38577d..004cc3cc6123 100644 ---- a/drivers/md/bcache/alloc.c -+++ b/drivers/md/bcache/alloc.c -@@ -287,7 +287,8 @@ do { \ - break; \ - \ - mutex_unlock(&(ca)->set->bucket_lock); \ -- if (kthread_should_stop()) { \ -+ if (kthread_should_stop() || \ -+ test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) { \ - set_current_state(TASK_RUNNING); \ - return 0; \ - } \ -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index b5ddb848cd31..56179fff1e59 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -475,10 +475,15 @@ struct gc_stat { - * - * CACHE_SET_RUNNING means all cache devices have been registered and journal - * replay is complete. -+ * -+ * CACHE_SET_IO_DISABLE is set when bcache is stopping the whold cache set, all -+ * external and internal I/O should be denied when this flag is set. 
-+ * - */ - #define CACHE_SET_UNREGISTERING 0 - #define CACHE_SET_STOPPING 1 - #define CACHE_SET_RUNNING 2 -+#define CACHE_SET_IO_DISABLE 3 - - struct cache_set { - struct closure cl; -@@ -868,6 +873,19 @@ static inline void wake_up_allocators(struct cache_set *c) - wake_up_process(ca->alloc_thread); - } - -+static inline void closure_bio_submit(struct cache_set *c, -+ struct bio *bio, -+ struct closure *cl) -+{ -+ closure_get(cl); -+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return; -+ } -+ generic_make_request(bio); -+} -+ - /* Forward declarations */ - - void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index fad9fe8817eb..8ca50f387a1d 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ -1744,6 +1744,7 @@ static void bch_btree_gc(struct cache_set *c) - - btree_gc_start(c); - -+ /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ - do { - ret = btree_root(gc_root, c, &op, &writes, &stats); - closure_sync(&writes); -@@ -1751,7 +1752,7 @@ static void bch_btree_gc(struct cache_set *c) - - if (ret && ret != -EAGAIN) - pr_warn("gc failed!"); -- } while (ret); -+ } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); - - bch_btree_gc_finish(c); - wake_up_allocators(c); -@@ -1789,9 +1790,12 @@ static int bch_gc_thread(void *arg) - - while (1) { - wait_event_interruptible(c->gc_wait, -- kthread_should_stop() || gc_should_run(c)); -+ kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags) || -+ gc_should_run(c)); - -- if (kthread_should_stop()) -+ if (kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) - break; - - set_gc_sectors(c); -diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c -index a783c5a41ff1..8013ecbcdbda 100644 ---- a/drivers/md/bcache/io.c -+++ b/drivers/md/bcache/io.c -@@ -38,7 +38,7 @@ void 
__bch_submit_bbio(struct bio *bio, struct cache_set *c) - bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); - - b->submit_time_us = local_clock_us(); -- closure_bio_submit(bio, bio->bi_private); -+ closure_bio_submit(c, bio, bio->bi_private); - } - - void bch_submit_bbio(struct bio *bio, struct cache_set *c, -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 1b736b860739..c94085f400a4 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -62,7 +62,7 @@ reread: left = ca->sb.bucket_size - offset; - bio_set_op_attrs(bio, REQ_OP_READ, 0); - bch_bio_map(bio, data); - -- closure_bio_submit(bio, &cl); -+ closure_bio_submit(ca->set, bio, &cl); - closure_sync(&cl); - - /* This function could be simpler now since we no longer write -@@ -674,7 +674,7 @@ static void journal_write_unlocked(struct closure *cl) - spin_unlock(&c->journal.lock); - - while ((bio = bio_list_pop(&list))) -- closure_bio_submit(bio, cl); -+ closure_bio_submit(c, bio, cl); - - continue_at(cl, journal_write_done, NULL); - } -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 1a46b41dac70..02296bda6384 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -747,7 +747,7 @@ static void cached_dev_read_error(struct closure *cl) - - /* XXX: invalidate cache */ - -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } - - continue_at(cl, cached_dev_cache_miss_done, NULL); -@@ -872,7 +872,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - s->cache_miss = miss; - s->iop.bio = cache_bio; - bio_get(cache_bio); -- closure_bio_submit(cache_bio, &s->cl); -+ closure_bio_submit(s->iop.c, cache_bio, &s->cl); - - return ret; - out_put: -@@ -880,7 +880,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - out_submit: - miss->bi_end_io = request_endio; - miss->bi_private = &s->cl; -- closure_bio_submit(miss, &s->cl); -+ closure_bio_submit(s->iop.c, 
miss, &s->cl); - return ret; - } - -@@ -945,7 +945,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - - if ((bio_op(bio) != REQ_OP_DISCARD) || - blk_queue_discard(bdev_get_queue(dc->bdev))) -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } else if (s->iop.writeback) { - bch_writeback_add(dc); - s->iop.bio = bio; -@@ -960,12 +960,12 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - flush->bi_private = cl; - flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - -- closure_bio_submit(flush, cl); -+ closure_bio_submit(s->iop.c, flush, cl); - } - } else { - s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); - -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - } - - closure_call(&s->iop.cl, bch_data_insert, NULL, cl); -@@ -981,7 +981,7 @@ static void cached_dev_nodata(struct closure *cl) - bch_journal_meta(s->iop.c, cl); - - /* If it's a flush, we send the flush to the backing device too */ -- closure_bio_submit(bio, cl); -+ closure_bio_submit(s->iop.c, bio, cl); - - continue_at(cl, cached_dev_bio_complete, NULL); - } -@@ -996,6 +996,12 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - int rw = bio_data_dir(bio); - -+ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return BLK_QC_T_NONE; -+ } -+ - atomic_set(&dc->backing_idle, 0); - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - -@@ -1112,6 +1118,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, - struct bcache_device *d = bio->bi_disk->private_data; - int rw = bio_data_dir(bio); - -+ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ bio->bi_status = BLK_STS_IOERR; -+ bio_endio(bio); -+ return BLK_QC_T_NONE; -+ } -+ - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - - s 
= search_alloc(bio, d); -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 531cd967c05f..a1abeebc7643 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -521,7 +521,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, - bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags); - bch_bio_map(bio, ca->disk_buckets); - -- closure_bio_submit(bio, &ca->prio); -+ closure_bio_submit(ca->set, bio, &ca->prio); - closure_sync(cl); - } - -@@ -1351,6 +1351,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) - test_bit(CACHE_SET_STOPPING, &c->flags)) - return false; - -+ if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) -+ pr_warn("CACHE_SET_IO_DISABLE already set"); -+ - /* XXX: we can be called from atomic context - acquire_console_sem(); - */ -@@ -1586,6 +1589,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) - c->congested_read_threshold_us = 2000; - c->congested_write_threshold_us = 20000; - c->error_limit = DEFAULT_IO_ERROR_LIMIT; -+ WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); - - return c; - err: -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index 55673508628f..e75279b7d180 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -95,6 +95,7 @@ read_attribute(partial_stripes_expensive); - - rw_attribute(synchronous); - rw_attribute(journal_delay_ms); -+rw_attribute(io_disable); - rw_attribute(discard); - rw_attribute(running); - rw_attribute(label); -@@ -591,6 +592,8 @@ SHOW(__bch_cache_set) - sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); - sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); - sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); -+ sysfs_printf(io_disable, "%i", -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)); - - if (attr == &sysfs_bset_tree_stats) - return bch_bset_print_stats(c, buf); -@@ -680,6 +683,22 @@ STORE(__bch_cache_set) - if (attr == 
&sysfs_io_error_halflife) - c->error_decay = strtoul_or_return(buf) / 88; - -+ if (attr == &sysfs_io_disable) { -+ int v = strtoul_or_return(buf); -+ -+ if (v) { -+ if (test_and_set_bit(CACHE_SET_IO_DISABLE, -+ &c->flags)) -+ pr_warn("CACHE_SET_IO_DISABLE" -+ " already set"); -+ } else { -+ if (!test_and_clear_bit(CACHE_SET_IO_DISABLE, -+ &c->flags)) -+ pr_warn("CACHE_SET_IO_DISABLE" -+ " already cleared"); -+ } -+ } -+ - sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); - sysfs_strtoul(verify, c->verify); - sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); -@@ -765,6 +784,7 @@ static struct attribute *bch_cache_set_internal_files[] = { - &sysfs_gc_always_rewrite, - &sysfs_btree_shrinker_disabled, - &sysfs_copy_gc_enabled, -+ &sysfs_io_disable, - NULL - }; - KTYPE(bch_cache_set_internal); -diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h -index a6763db7f061..268024529edd 100644 ---- a/drivers/md/bcache/util.h -+++ b/drivers/md/bcache/util.h -@@ -567,12 +567,6 @@ static inline sector_t bdev_sectors(struct block_device *bdev) - return bdev->bd_inode->i_size >> 9; - } - --#define closure_bio_submit(bio, cl) \ --do { \ -- closure_get(cl); \ -- generic_make_request(bio); \ --} while (0) -- - uint64_t bch_crc64_update(uint64_t, const void *, size_t); - uint64_t bch_crc64(const void *, size_t); - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 8f98ef1038d3..3d7d8452e0de 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -114,6 +114,7 @@ static void update_writeback_rate(struct work_struct *work) - struct cached_dev *dc = container_of(to_delayed_work(work), - struct cached_dev, - writeback_rate_update); -+ struct cache_set *c = dc->disk.c; - - /* - * should check BCACHE_DEV_RATE_DW_RUNNING before calling -@@ -123,7 +124,12 @@ static void update_writeback_rate(struct work_struct *work) - /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); - -- if 
(!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ /* -+ * CACHE_SET_IO_DISABLE might be set via sysfs interface, -+ * check it here too. -+ */ -+ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); - /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); -@@ -138,7 +144,12 @@ static void update_writeback_rate(struct work_struct *work) - - up_read(&dc->writeback_lock); - -- if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { -+ /* -+ * CACHE_SET_IO_DISABLE might be set via sysfs interface, -+ * check it here too. -+ */ -+ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - } -@@ -278,7 +289,7 @@ static void write_dirty(struct closure *cl) - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; - -- closure_bio_submit(&io->bio, cl); -+ closure_bio_submit(io->dc->disk.c, &io->bio, cl); - } - - atomic_set(&dc->writeback_sequence_next, next_sequence); -@@ -304,7 +315,7 @@ static void read_dirty_submit(struct closure *cl) - { - struct dirty_io *io = container_of(cl, struct dirty_io, cl); - -- closure_bio_submit(&io->bio, cl); -+ closure_bio_submit(io->dc->disk.c, &io->bio, cl); - - continue_at(cl, write_dirty, io->dc->writeback_write_wq); - } -@@ -330,7 +341,9 @@ static void read_dirty(struct cached_dev *dc) - - next = bch_keybuf_next(&dc->writeback_keys); - -- while (!kthread_should_stop() && next) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && -+ next) { - size = 0; - nk = 0; - -@@ -427,7 +440,9 @@ static void read_dirty(struct cached_dev *dc) - } - } - -- while (!kthread_should_stop() && delay) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && -+ delay) { - 
schedule_timeout_interruptible(delay); - delay = writeback_delay(dc, 0); - } -@@ -583,11 +598,13 @@ static bool refill_dirty(struct cached_dev *dc) - static int bch_writeback_thread(void *arg) - { - struct cached_dev *dc = arg; -+ struct cache_set *c = dc->disk.c; - bool searched_full_index; - - bch_ratelimit_reset(&dc->writeback_rate); - -- while (!kthread_should_stop()) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - down_write(&dc->writeback_lock); - set_current_state(TASK_INTERRUPTIBLE); - /* -@@ -601,7 +618,8 @@ static int bch_writeback_thread(void *arg) - (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { - up_write(&dc->writeback_lock); - -- if (kthread_should_stop()) { -+ if (kthread_should_stop() || -+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { - set_current_state(TASK_RUNNING); - break; - } -@@ -637,6 +655,7 @@ static int bch_writeback_thread(void *arg) - - while (delay && - !kthread_should_stop() && -+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags) && - !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) - delay = schedule_timeout_interruptible(delay); - --- -2.16.1 - diff --git a/for-next/v6/v6-0005-bcache-add-stop_when_cache_set_failed-option-to-b.patch b/for-next/v6/v6-0005-bcache-add-stop_when_cache_set_failed-option-to-b.patch deleted file mode 100644 index 085894e..0000000 --- a/for-next/v6/v6-0005-bcache-add-stop_when_cache_set_failed-option-to-b.patch +++ /dev/null @@ -1,258 +0,0 @@ -From e508bc1b82e5720315bfb28b3c42e9333a6ec8ce Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 5 Feb 2018 23:44:28 +0800 -Subject: [PATCH v6 5/9] bcache: add stop_when_cache_set_failed option to - backing device - -When there are too many I/O errors on cache device, current bcache code -will retire the whole cache set, and detach all bcache devices. But the -detached bcache devices are not stopped, which is problematic when bcache -is in writeback mode. 
- -If the retired cache set has dirty data of backing devices, continuing -to write to the bcache device will write to backing device directly. If the -LBA of write request has a dirty version cached on cache device, next time -when the cache device is re-registered and backing device re-attached to -it again, the stale dirty data on cache device will be written to backing -device, and overwrite latest directly written data. This situation causes -a quiet data corruption. - -But we cannot simply stop all attached bcache devices when the cache set is -broken or disconnected. For example, use bcache to accelerate performance -of an email service. In such workload, if cache device is broken but no -dirty data lost, keeping the bcache device alive and permitting email service -to continue to access user data might be a better solution for the cache -device failure. - -Nix <nix@esperi.org.uk> points out the issue and provides the above example -to explain why it might be necessary to not stop bcache device for broken -cache device. Pavel Goran <via-bcache@pvgoran.name> provides a brilliant -suggestion to provide "always" and "auto" options to per-cached device -sysfs file stop_when_cache_set_failed. If cache set is retiring and the -backing device has no dirty data on cache, it should be safe to keep the -bcache device alive. In this case, if stop_when_cache_set_failed is set to -"auto", the device failure handling code will not stop this bcache device -and permit application to access the backing device with an unattached -bcache device. - -Changelog: -v3: fix typos pointed out by Nix. -v2: change option values of stop_when_cache_set_failed from 1/0 to - "auto"/"always". -v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1 - (always stop). 
- -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Michael Lyle <mlyle@lyle.org> -Cc: Nix <nix@esperi.org.uk> -Cc: Pavel Goran <via-bcache@pvgoran.name> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Hannes Reinecke <hare@suse.com> ---- - drivers/md/bcache/bcache.h | 9 +++++ - drivers/md/bcache/super.c | 82 ++++++++++++++++++++++++++++++++++++++++------ - drivers/md/bcache/sysfs.c | 17 ++++++++++ - 3 files changed, 98 insertions(+), 10 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 56179fff1e59..7c2b836732e9 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -287,6 +287,12 @@ struct io { - sector_t last; - }; - -+enum stop_on_failure { -+ BCH_CACHED_DEV_STOP_AUTO = 0, -+ BCH_CACHED_DEV_STOP_ALWAYS, -+ BCH_CACHED_DEV_STOP_MODE_MAX, -+}; -+ - struct cached_dev { - struct list_head list; - struct bcache_device disk; -@@ -379,6 +385,8 @@ struct cached_dev { - unsigned writeback_rate_i_term_inverse; - unsigned writeback_rate_p_term_inverse; - unsigned writeback_rate_minimum; -+ -+ enum stop_on_failure stop_when_cache_set_failed; - }; - - enum alloc_reserve { -@@ -924,6 +932,7 @@ void bch_write_bdev_super(struct cached_dev *, struct closure *); - - extern struct workqueue_struct *bcache_wq; - extern const char * const bch_cache_modes[]; -+extern const char * const bch_stop_on_failure_modes[]; - extern struct mutex bch_register_lock; - extern struct list_head bch_cache_sets; - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index a1abeebc7643..52d5012948c9 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -47,6 +47,14 @@ const char * const bch_cache_modes[] = { - NULL - }; - -+/* Default is -1; we skip past it for stop_when_cache_set_failed */ -+const char * const bch_stop_on_failure_modes[] = { -+ "default", -+ "auto", -+ "always", -+ NULL -+}; -+ - static struct kobject *bcache_kobj; - struct mutex bch_register_lock; - LIST_HEAD(bch_cache_sets); 
-@@ -1189,6 +1197,9 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) - max(dc->disk.disk->queue->backing_dev_info->ra_pages, - q->backing_dev_info->ra_pages); - -+ /* default to auto */ -+ dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO; -+ - bch_cached_dev_request_init(dc); - bch_cached_dev_writeback_init(dc); - return 0; -@@ -1465,25 +1476,76 @@ static void cache_set_flush(struct closure *cl) - closure_return(cl); - } - -+/* -+ * This function is only called when CACHE_SET_IO_DISABLE is set, which means -+ * cache set is unregistering due to too many I/O errors. In this condition, -+ * the bcache device might be stopped, it depends on stop_when_cache_set_failed -+ * value and whether the broken cache has dirty data: -+ * -+ * dc->stop_when_cache_set_failed dc->has_dirty stop bcache device -+ * BCH_CACHED_STOP_AUTO 0 NO -+ * BCH_CACHED_STOP_AUTO 1 YES -+ * BCH_CACHED_DEV_STOP_ALWAYS 0 YES -+ * BCH_CACHED_DEV_STOP_ALWAYS 1 YES -+ * -+ * The expected behavior is, if stop_when_cache_set_failed is configured to -+ * "auto" via sysfs interface, the bcache device will not be stopped if the -+ * backing device is clean on the broken cache device. 
-+ */ -+static void conditional_stop_bcache_device(struct cache_set *c, -+ struct bcache_device *d, -+ struct cached_dev *dc) -+{ -+ if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) { -+ pr_warn("stop_when_cache_set_failed of %s is \"always\", stop" -+ " it for failed cache set %pU.", -+ d->disk->disk_name, c->sb.set_uuid); -+ bcache_device_stop(d); -+ } else if (atomic_read(&dc->has_dirty)) { -+ /* -+ * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO -+ * and dc->has_dirty == 1 -+ */ -+ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and " -+ "cache is dirty, stop it to avoid potential data " -+ "corruption.", -+ d->disk->disk_name); -+ bcache_device_stop(d); -+ } else { -+ /* -+ * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO -+ * and dc->has_dirty == 0 -+ */ -+ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and " -+ "cache is clean, keep it alive.", -+ d->disk->disk_name); -+ } -+} -+ - static void __cache_set_unregister(struct closure *cl) - { - struct cache_set *c = container_of(cl, struct cache_set, caching); - struct cached_dev *dc; -+ struct bcache_device *d; - size_t i; - - mutex_lock(&bch_register_lock); - -- for (i = 0; i < c->devices_max_used; i++) -- if (c->devices[i]) { -- if (!UUID_FLASH_ONLY(&c->uuids[i]) && -- test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { -- dc = container_of(c->devices[i], -- struct cached_dev, disk); -- bch_cached_dev_detach(dc); -- } else { -- bcache_device_stop(c->devices[i]); -- } -+ for (i = 0; i < c->devices_max_used; i++) { -+ d = c->devices[i]; -+ if (!d) -+ continue; -+ -+ if (!UUID_FLASH_ONLY(&c->uuids[i]) && -+ test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { -+ dc = container_of(d, struct cached_dev, disk); -+ bch_cached_dev_detach(dc); -+ if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) -+ conditional_stop_bcache_device(c, d, dc); -+ } else { -+ bcache_device_stop(d); - } -+ } - - mutex_unlock(&bch_register_lock); - -diff --git a/drivers/md/bcache/sysfs.c 
b/drivers/md/bcache/sysfs.c -index e75279b7d180..f2b3b2686627 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -78,6 +78,7 @@ rw_attribute(congested_write_threshold_us); - rw_attribute(sequential_cutoff); - rw_attribute(data_csum); - rw_attribute(cache_mode); -+rw_attribute(stop_when_cache_set_failed); - rw_attribute(writeback_metadata); - rw_attribute(writeback_running); - rw_attribute(writeback_percent); -@@ -126,6 +127,12 @@ SHOW(__bch_cached_dev) - bch_cache_modes + 1, - BDEV_CACHE_MODE(&dc->sb)); - -+ if (attr == &sysfs_stop_when_cache_set_failed) -+ return bch_snprint_string_list(buf, PAGE_SIZE, -+ bch_stop_on_failure_modes + 1, -+ dc->stop_when_cache_set_failed); -+ -+ - sysfs_printf(data_csum, "%i", dc->disk.data_csum); - var_printf(verify, "%i"); - var_printf(bypass_torture_test, "%i"); -@@ -247,6 +254,15 @@ STORE(__cached_dev) - } - } - -+ if (attr == &sysfs_stop_when_cache_set_failed) { -+ v = bch_read_string_list(buf, bch_stop_on_failure_modes + 1); -+ -+ if (v < 0) -+ return v; -+ -+ dc->stop_when_cache_set_failed = v; -+ } -+ - if (attr == &sysfs_label) { - if (size > SB_LABEL_SIZE) - return -EINVAL; -@@ -326,6 +342,7 @@ static struct attribute *bch_cached_dev_files[] = { - &sysfs_data_csum, - #endif - &sysfs_cache_mode, -+ &sysfs_stop_when_cache_set_failed, - &sysfs_writeback_metadata, - &sysfs_writeback_running, - &sysfs_writeback_delay, --- -2.16.1 - diff --git a/for-next/v6/v6-0006-bcache-fix-inaccurate-io-state-for-detached-bcach.patch b/for-next/v6/v6-0006-bcache-fix-inaccurate-io-state-for-detached-bcach.patch deleted file mode 100644 index 8557cd6..0000000 --- a/for-next/v6/v6-0006-bcache-fix-inaccurate-io-state-for-detached-bcach.patch +++ /dev/null @@ -1,124 +0,0 @@ -From 17928813439cb184ce155145678a916e523f53ae Mon Sep 17 00:00:00 2001 -From: Tang Junhui <tang.junhui@zte.com.cn> -Date: Tue, 9 Jan 2018 10:27:11 +0800 -Subject: [PATCH v6 6/9] bcache: fix inaccurate io state for detached bcache - devices - -When we 
run IO on a detached device, and run iostat to show IO status, -normally it will show output like below (some fields omitted): -Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util -sdd ... 15.89 0.53 1.82 0.20 2.23 1.81 52.30 -bcache0 ... 15.89 115.42 0.00 0.00 0.00 2.40 69.60 -but after IO has stopped, there are still very big avgqu-sz and %util -values as below: -Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util -bcache0 ... 0 5326.32 0.00 0.00 0.00 0.00 100.10 - -The reason for this issue is that only generic_start_io_acct() is called -and generic_end_io_acct() is never called for a detached device in -cached_dev_make_request(). See the code: -//start generic_start_io_acct() -generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); -if (cached_dev_get(dc)) { - //will callback generic_end_io_acct() -} -else { - //will not call generic_end_io_acct() -} - -This patch calls generic_end_io_acct() at the end of IO for detached -devices, so we can show IO state correctly. - -(Modified to use GFP_NOIO in kzalloc() by Coly Li) - -Changelog: -v2: fix typo. -v1: the initial version. 
- -Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> -Reviewed-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Reviewed-by: Michael Lyle <mlyle@lyle.org> ---- - drivers/md/bcache/request.c | 58 +++++++++++++++++++++++++++++++++++++++------ - 1 file changed, 51 insertions(+), 7 deletions(-) - -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 02296bda6384..e09c5ae745be 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -986,6 +986,55 @@ static void cached_dev_nodata(struct closure *cl) - continue_at(cl, cached_dev_bio_complete, NULL); - } - -+struct detached_dev_io_private { -+ struct bcache_device *d; -+ unsigned long start_time; -+ bio_end_io_t *bi_end_io; -+ void *bi_private; -+}; -+ -+static void detached_dev_end_io(struct bio *bio) -+{ -+ struct detached_dev_io_private *ddip; -+ -+ ddip = bio->bi_private; -+ bio->bi_end_io = ddip->bi_end_io; -+ bio->bi_private = ddip->bi_private; -+ -+ generic_end_io_acct(ddip->d->disk->queue, -+ bio_data_dir(bio), -+ &ddip->d->disk->part0, ddip->start_time); -+ -+ kfree(ddip); -+ -+ bio->bi_end_io(bio); -+} -+ -+static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) -+{ -+ struct detached_dev_io_private *ddip; -+ struct cached_dev *dc = container_of(d, struct cached_dev, disk); -+ -+ /* -+ * no need to call closure_get(&dc->disk.cl), -+ * because upper layer had already opened bcache device, -+ * which would call closure_get(&dc->disk.cl) -+ */ -+ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); -+ ddip->d = d; -+ ddip->start_time = jiffies; -+ ddip->bi_end_io = bio->bi_end_io; -+ ddip->bi_private = bio->bi_private; -+ bio->bi_end_io = detached_dev_end_io; -+ bio->bi_private = ddip; -+ -+ if ((bio_op(bio) == REQ_OP_DISCARD) && -+ !blk_queue_discard(bdev_get_queue(dc->bdev))) -+ bio->bi_end_io(bio); -+ else -+ generic_make_request(bio); -+} -+ - /* Cached devices - read & write stuff */ - - static 
blk_qc_t cached_dev_make_request(struct request_queue *q, -@@ -1028,13 +1077,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - else - cached_dev_read(dc, s); - } -- } else { -- if ((bio_op(bio) == REQ_OP_DISCARD) && -- !blk_queue_discard(bdev_get_queue(dc->bdev))) -- bio_endio(bio); -- else -- generic_make_request(bio); -- } -+ } else -+ detached_dev_do_request(d, bio); - - return BLK_QC_T_NONE; - } --- -2.16.1 - diff --git a/for-next/v6/v6-0007-bcache-add-backing_request_endio-for-bi_end_io-of.patch b/for-next/v6/v6-0007-bcache-add-backing_request_endio-for-bi_end_io-of.patch deleted file mode 100644 index d634778..0000000 --- a/for-next/v6/v6-0007-bcache-add-backing_request_endio-for-bi_end_io-of.patch +++ /dev/null @@ -1,255 +0,0 @@ -From 4990744596fef00323d0d4e4376f147d8aff6db9 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 10 Jan 2018 21:01:48 +0800 -Subject: [PATCH v6 7/9] bcache: add backing_request_endio() for bi_end_io of - attached backing device I/O - -In order to catch I/O error of backing device, a separate bi_end_io -call back is required. Then a per backing device counter can record I/O -errors number and retire the backing device if the counter reaches a -per backing device I/O error limit. - -This patch adds backing_request_endio() to bcache backing device I/O code -path, this is a preparation for further complicated backing device failure -handling. So far there is no real code logic change, I make this change a -separate patch to make sure it is stable and reliable for further work. - -Changelog: -v2: Fix code comments typo, remove a redundant bch_writeback_add() line - added in v4 patch set. -v1: indeed this is new added in this patch set. 
- -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Junhui Tang <tang.junhui@zte.com.cn> -Cc: Michael Lyle <mlyle@lyle.org> ---- - drivers/md/bcache/request.c | 93 +++++++++++++++++++++++++++++++++++-------- - drivers/md/bcache/super.c | 1 + - drivers/md/bcache/writeback.c | 1 + - 3 files changed, 79 insertions(+), 16 deletions(-) - -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index e09c5ae745be..9c6dda3b0068 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -139,6 +139,7 @@ static void bch_data_invalidate(struct closure *cl) - } - - op->insert_data_done = true; -+ /* get in bch_data_insert() */ - bio_put(bio); - out: - continue_at(cl, bch_data_insert_keys, op->wq); -@@ -630,6 +631,38 @@ static void request_endio(struct bio *bio) - closure_put(cl); - } - -+static void backing_request_endio(struct bio *bio) -+{ -+ struct closure *cl = bio->bi_private; -+ -+ if (bio->bi_status) { -+ struct search *s = container_of(cl, struct search, cl); -+ /* -+ * If a bio has REQ_PREFLUSH for writeback mode, it is -+ * speically assembled in cached_dev_write() for a non-zero -+ * write request which has REQ_PREFLUSH. we don't set -+ * s->iop.status by this failure, the status will be decided -+ * by result of bch_data_insert() operation. 
-+ */ -+ if (unlikely(s->iop.writeback && -+ bio->bi_opf & REQ_PREFLUSH)) { -+ char buf[BDEVNAME_SIZE]; -+ -+ bio_devname(bio, buf); -+ pr_err("Can't flush %s: returned bi_status %i", -+ buf, bio->bi_status); -+ } else { -+ /* set to orig_bio->bi_status in bio_complete() */ -+ s->iop.status = bio->bi_status; -+ } -+ s->recoverable = false; -+ /* should count I/O error for backing device here */ -+ } -+ -+ bio_put(bio); -+ closure_put(cl); -+} -+ - static void bio_complete(struct search *s) - { - if (s->orig_bio) { -@@ -644,13 +677,21 @@ static void bio_complete(struct search *s) - } - } - --static void do_bio_hook(struct search *s, struct bio *orig_bio) -+static void do_bio_hook(struct search *s, -+ struct bio *orig_bio, -+ bio_end_io_t *end_io_fn) - { - struct bio *bio = &s->bio.bio; - - bio_init(bio, NULL, 0); - __bio_clone_fast(bio, orig_bio); -- bio->bi_end_io = request_endio; -+ /* -+ * bi_end_io can be set separately somewhere else, e.g. the -+ * variants in, -+ * - cache_bio->bi_end_io from cached_dev_cache_miss() -+ * - n->bi_end_io from cache_lookup_fn() -+ */ -+ bio->bi_end_io = end_io_fn; - bio->bi_private = &s->cl; - - bio_cnt_set(bio, 3); -@@ -676,7 +717,7 @@ static inline struct search *search_alloc(struct bio *bio, - s = mempool_alloc(d->c->search, GFP_NOIO); - - closure_init(&s->cl, NULL); -- do_bio_hook(s, bio); -+ do_bio_hook(s, bio, request_endio); - - s->orig_bio = bio; - s->cache_miss = NULL; -@@ -743,10 +784,11 @@ static void cached_dev_read_error(struct closure *cl) - trace_bcache_read_retry(s->orig_bio); - - s->iop.status = 0; -- do_bio_hook(s, s->orig_bio); -+ do_bio_hook(s, s->orig_bio, backing_request_endio); - - /* XXX: invalidate cache */ - -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, bio, cl); - } - -@@ -859,7 +901,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - bio_copy_dev(cache_bio, miss); - cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; - -- cache_bio->bi_end_io = 
request_endio; -+ cache_bio->bi_end_io = backing_request_endio; - cache_bio->bi_private = &s->cl; - - bch_bio_map(cache_bio, NULL); -@@ -872,14 +914,16 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, - s->cache_miss = miss; - s->iop.bio = cache_bio; - bio_get(cache_bio); -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, cache_bio, &s->cl); - - return ret; - out_put: - bio_put(cache_bio); - out_submit: -- miss->bi_end_io = request_endio; -+ miss->bi_end_io = backing_request_endio; - miss->bi_private = &s->cl; -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, miss, &s->cl); - return ret; - } -@@ -943,31 +987,46 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) - s->iop.bio = s->orig_bio; - bio_get(s->iop.bio); - -- if ((bio_op(bio) != REQ_OP_DISCARD) || -- blk_queue_discard(bdev_get_queue(dc->bdev))) -- closure_bio_submit(s->iop.c, bio, cl); -+ if (bio_op(bio) == REQ_OP_DISCARD && -+ !blk_queue_discard(bdev_get_queue(dc->bdev))) -+ goto insert_data; -+ -+ /* I/O request sent to backing device */ -+ bio->bi_end_io = backing_request_endio; -+ closure_bio_submit(s->iop.c, bio, cl); -+ - } else if (s->iop.writeback) { - bch_writeback_add(dc); - s->iop.bio = bio; - - if (bio->bi_opf & REQ_PREFLUSH) { -- /* Also need to send a flush to the backing device */ -- struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, -- dc->disk.bio_split); -- -+ /* -+ * Also need to send a flush to the backing -+ * device. 
-+ */ -+ struct bio *flush; -+ -+ flush = bio_alloc_bioset(GFP_NOIO, 0, -+ dc->disk.bio_split); -+ if (!flush) { -+ s->iop.status = BLK_STS_RESOURCE; -+ goto insert_data; -+ } - bio_copy_dev(flush, bio); -- flush->bi_end_io = request_endio; -+ flush->bi_end_io = backing_request_endio; - flush->bi_private = cl; - flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; -- -+ /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, flush, cl); - } - } else { - s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); -- -+ /* I/O request sent to backing device */ -+ bio->bi_end_io = backing_request_endio; - closure_bio_submit(s->iop.c, bio, cl); - } - -+insert_data: - closure_call(&s->iop.cl, bch_data_insert, NULL, cl); - continue_at(cl, cached_dev_write_complete, NULL); - } -@@ -981,6 +1040,7 @@ static void cached_dev_nodata(struct closure *cl) - bch_journal_meta(s->iop.c, cl); - - /* If it's a flush, we send the flush to the backing device too */ -+ bio->bi_end_io = backing_request_endio; - closure_bio_submit(s->iop.c, bio, cl); - - continue_at(cl, cached_dev_bio_complete, NULL); -@@ -1078,6 +1138,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - cached_dev_read(dc, s); - } - } else -+ /* I/O request sent to backing device */ - detached_dev_do_request(d, bio); - - return BLK_QC_T_NONE; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 52d5012948c9..93b39e7ae63f 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -273,6 +273,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) - bio->bi_private = dc; - - closure_get(cl); -+ /* I/O request sent to backing device */ - __write_super(&dc->sb, bio); - - closure_return_with_destructor(cl, bch_write_bdev_super_unlock); -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 3d7d8452e0de..4ebe0119ea7e 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -289,6 
+289,7 @@ static void write_dirty(struct closure *cl) - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; - -+ /* I/O request sent to backing device */ - closure_bio_submit(io->dc->disk.c, &io->bio, cl); - } - --- -2.16.1 - diff --git a/for-next/v6/v6-0008-bcache-add-io_disable-to-struct-cached_dev.patch b/for-next/v6/v6-0008-bcache-add-io_disable-to-struct-cached_dev.patch deleted file mode 100644 index 01d336d..0000000 --- a/for-next/v6/v6-0008-bcache-add-io_disable-to-struct-cached_dev.patch +++ /dev/null @@ -1,235 +0,0 @@ -From 230e827cd2ed1d6fedda9e1f19367ffc9562c4d1 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Thu, 8 Feb 2018 23:10:27 +0800 -Subject: [PATCH v6 8/9] bcache: add io_disable to struct cached_dev - -If a bcache device is configured to writeback mode, current code does not -handle write I/O errors on backing devices properly. - -In writeback mode, a write request is written to the cache device, and -later flushed to the backing device. If I/O fails when writing from the -cache device to the backing device, bcache code just ignores the error and -upper layer code is NOT notified that the backing device is broken. - -This patch tries to handle backing device failure like how the cache device -failure is handled, -- Add an error counter 'io_errors' and error limit 'error_limit' in struct - cached_dev. Add another io_disable to struct cached_dev to disable I/Os - on the problematic backing device. -- When I/O error happens on backing device, increase io_errors counter. And - if io_errors reaches error_limit, set cached_dev->io_disable to true, and - stop the bcache device. - -The result is, if backing device is broken or disconnected, and I/O errors -reach its error limit, backing device will be disabled and the associated -bcache device will be removed from system. - -Changelog: -v2: remove "bcache: " prefix in pr_error(), and use correct name string to - print out bcache device gendisk name. 
-v1: indeed this is new added in v2 patch set. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 6 ++++++ - drivers/md/bcache/io.c | 14 ++++++++++++++ - drivers/md/bcache/request.c | 14 ++++++++++++-- - drivers/md/bcache/super.c | 21 +++++++++++++++++++++ - drivers/md/bcache/sysfs.c | 15 ++++++++++++++- - 5 files changed, 67 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 7c2b836732e9..dbc4fb48c754 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -366,6 +366,7 @@ struct cached_dev { - unsigned sequential_cutoff; - unsigned readahead; - -+ unsigned io_disable:1; - unsigned verify:1; - unsigned bypass_torture_test:1; - -@@ -387,6 +388,9 @@ struct cached_dev { - unsigned writeback_rate_minimum; - - enum stop_on_failure stop_when_cache_set_failed; -+#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 -+ atomic_t io_errors; -+ unsigned error_limit; - }; - - enum alloc_reserve { -@@ -896,6 +900,7 @@ static inline void closure_bio_submit(struct cache_set *c, - - /* Forward declarations */ - -+void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); - void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); - void bch_bbio_count_io_errors(struct cache_set *, struct bio *, - blk_status_t, const char *); -@@ -923,6 +928,7 @@ int bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, bool); - bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, - unsigned, unsigned, bool); -+bool bch_cached_dev_error(struct cached_dev *dc); - - __printf(2, 3) - bool bch_cache_set_error(struct cache_set *, const char *, ...); -diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c -index 8013ecbcdbda..7fac97ae036e 100644 ---- a/drivers/md/bcache/io.c -+++ b/drivers/md/bcache/io.c -@@ -50,6 
+50,20 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, - } - - /* IO errors */ -+void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) -+{ -+ char buf[BDEVNAME_SIZE]; -+ unsigned errors; -+ -+ WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); -+ -+ errors = atomic_add_return(1, &dc->io_errors); -+ if (errors < dc->error_limit) -+ pr_err("%s: IO error on backing device, unrecoverable", -+ bio_devname(bio, buf)); -+ else -+ bch_cached_dev_error(dc); -+} - - void bch_count_io_errors(struct cache *ca, - blk_status_t error, -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 9c6dda3b0068..03245e6980a6 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -637,6 +637,8 @@ static void backing_request_endio(struct bio *bio) - - if (bio->bi_status) { - struct search *s = container_of(cl, struct search, cl); -+ struct cached_dev *dc = container_of(s->d, -+ struct cached_dev, disk); - /* - * If a bio has REQ_PREFLUSH for writeback mode, it is - * speically assembled in cached_dev_write() for a non-zero -@@ -657,6 +659,7 @@ static void backing_request_endio(struct bio *bio) - } - s->recoverable = false; - /* should count I/O error for backing device here */ -+ bch_count_backing_io_errors(dc, bio); - } - - bio_put(bio); -@@ -1065,8 +1068,14 @@ static void detatched_dev_end_io(struct bio *bio) - bio_data_dir(bio), - &ddip->d->disk->part0, ddip->start_time); - -- kfree(ddip); -+ if (bio->bi_status) { -+ struct cached_dev *dc = container_of(ddip->d, -+ struct cached_dev, disk); -+ /* should count I/O error for backing device here */ -+ bch_count_backing_io_errors(dc, bio); -+ } - -+ kfree(ddip); - bio->bi_end_io(bio); - } - -@@ -1105,7 +1114,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - int rw = bio_data_dir(bio); - -- if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { -+ if 
(unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || -+ dc->io_disable)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - return BLK_QC_T_NONE; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 93b39e7ae63f..1c5b7074bd6c 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1198,6 +1198,9 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) - max(dc->disk.disk->queue->backing_dev_info->ra_pages, - q->backing_dev_info->ra_pages); - -+ atomic_set(&dc->io_errors, 0); -+ dc->io_disable = false; -+ dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; - /* default to auto */ - dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO; - -@@ -1352,6 +1355,24 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) - return flash_dev_run(c, u); - } - -+bool bch_cached_dev_error(struct cached_dev *dc) -+{ -+ char name[BDEVNAME_SIZE]; -+ -+ if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) -+ return false; -+ -+ dc->io_disable = true; -+ /* make others know io_disable is true earlier */ -+ smp_mb(); -+ -+ pr_err("stop %s: too many IO errors on backing device %s\n", -+ dc->disk.disk->disk_name, bdevname(dc->bdev, name)); -+ -+ bcache_device_stop(&dc->disk); -+ return true; -+} -+ - /* Cache set */ - - __printf(2, 3) -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index f2b3b2686627..bd40d9d0a969 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -141,7 +141,9 @@ SHOW(__bch_cached_dev) - var_print(writeback_delay); - var_print(writeback_percent); - sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); -- -+ sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); -+ sysfs_printf(io_error_limit, "%i", dc->error_limit); -+ sysfs_printf(io_disable, "%i", dc->io_disable); - var_print(writeback_rate_update_seconds); - var_print(writeback_rate_i_term_inverse); - var_print(writeback_rate_p_term_inverse); -@@ -232,6 +234,14 @@ 
STORE(__cached_dev) - d_strtoul(writeback_rate_i_term_inverse); - d_strtoul_nonzero(writeback_rate_p_term_inverse); - -+ sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); -+ -+ if (attr == &sysfs_io_disable) { -+ int v = strtoul_or_return(buf); -+ -+ dc->io_disable = v ? 1 : 0; -+ } -+ - d_strtoi_h(sequential_cutoff); - d_strtoi_h(readahead); - -@@ -352,6 +362,9 @@ static struct attribute *bch_cached_dev_files[] = { - &sysfs_writeback_rate_i_term_inverse, - &sysfs_writeback_rate_p_term_inverse, - &sysfs_writeback_rate_debug, -+ &sysfs_errors, -+ &sysfs_io_error_limit, -+ &sysfs_io_disable, - &sysfs_dirty_data, - &sysfs_stripe_size, - &sysfs_partial_stripes_expensive, --- -2.16.1 - diff --git a/for-next/v6/v6-0009-bcache-stop-bcache-device-when-backing-device-is-.patch b/for-next/v6/v6-0009-bcache-stop-bcache-device-when-backing-device-is-.patch deleted file mode 100644 index bf2af00..0000000 --- a/for-next/v6/v6-0009-bcache-stop-bcache-device-when-backing-device-is-.patch +++ /dev/null @@ -1,152 +0,0 @@ -From 15a23aafbcfd8e92fc7e1740c8e53d9c64c9fde1 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 5 Feb 2018 23:52:40 +0800 -Subject: [PATCH v6 9/9] bcache: stop bcache device when backing device is - offline - -Currently bcache does not handle backing device failure, if backing -device is offline and disconnected from system, its bcache device can still -be accessible. If the bcache device is in writeback mode, I/O requests even -can success if the requests hit on cache device. That is to say, when and -how bcache handles offline backing device is undefined. - -This patch tries to handle backing device offline in a rather simple way, -- Add cached_dev->status_update_thread kernel thread to update backing - device status in every 1 second. -- Add cached_dev->offline_seconds to record how many seconds the backing - device is observed to be offline. 
If the backing device is offline for - BACKING_DEV_OFFLINE_TIMEOUT (30) seconds, set dc->io_disable to 1 and - call bcache_device_stop() to stop the bcache device which is linked to the - offline backing device. - -Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds, -its bcache device will be removed, then user space applications writing to -it will get an error immediately, and can handle the device failure in time. - -This patch is quite simple, and does not handle more complicated situations. -Once the bcache device is stopped, users need to recover the backing -device, register and attach it manually. - -Changelog: -v2: remove "bcache: " prefix when calling pr_warn(). -v1: initial version. - -Signed-off-by: Coly Li <colyli@suse.de> -Reviewed-by: Hannes Reinecke <hare@suse.com> -Cc: Michael Lyle <mlyle@lyle.org> -Cc: Junhui Tang <tang.junhui@zte.com.cn> ---- - drivers/md/bcache/bcache.h | 2 ++ - drivers/md/bcache/super.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++ - 2 files changed, 57 insertions(+) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index dbc4fb48c754..e465a661f32e 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -344,6 +344,7 @@ struct cached_dev { - - struct keybuf writeback_keys; - -+ struct task_struct *status_update_thread; - /* - * Order the write-half of writeback operations strongly in dispatch - * order. 
(Maintain LBA order; don't allow reads completing out of -@@ -391,6 +392,7 @@ struct cached_dev { - #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 - atomic_t io_errors; - unsigned error_limit; -+ unsigned offline_seconds; - }; - - enum alloc_reserve { -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 1c5b7074bd6c..ea25cef924ff 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -654,6 +654,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode, - unsigned int cmd, unsigned long arg) - { - struct bcache_device *d = b->bd_disk->private_data; -+ struct cached_dev *dc = container_of(d, struct cached_dev, disk); -+ -+ if (dc->io_disable) -+ return -EIO; -+ - return d->ioctl(d, mode, cmd, arg); - } - -@@ -864,6 +869,45 @@ static void calc_cached_dev_sectors(struct cache_set *c) - c->cached_dev_sectors = sectors; - } - -+#define BACKING_DEV_OFFLINE_TIMEOUT 5 -+static int cached_dev_status_update(void *arg) -+{ -+ struct cached_dev *dc = arg; -+ struct request_queue *q; -+ char buf[BDEVNAME_SIZE]; -+ -+ /* -+ * If this delayed worker is stopping outside, directly quit here. -+ * dc->io_disable might be set via sysfs interface, so check it -+ * here too. 
-+ */ -+ while (!kthread_should_stop() && !dc->io_disable) { -+ q = bdev_get_queue(dc->bdev); -+ if (blk_queue_dying(q)) -+ dc->offline_seconds++; -+ else -+ dc->offline_seconds = 0; -+ -+ if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) { -+ pr_err("%s: device offline for %d seconds", -+ bdevname(dc->bdev, buf), -+ BACKING_DEV_OFFLINE_TIMEOUT); -+ pr_err("%s: disable I/O request due to backing " -+ "device offline", dc->disk.name); -+ dc->io_disable = true; -+ /* let others know earlier that io_disable is true */ -+ smp_mb(); -+ bcache_device_stop(&dc->disk); -+ break; -+ } -+ -+ schedule_timeout_interruptible(HZ); -+ } -+ -+ dc->status_update_thread = NULL; -+ return 0; -+} -+ - void bch_cached_dev_run(struct cached_dev *dc) - { - struct bcache_device *d = &dc->disk; -@@ -906,6 +950,15 @@ void bch_cached_dev_run(struct cached_dev *dc) - if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || - sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) - pr_debug("error creating sysfs link"); -+ -+ dc->status_update_thread = kthread_run(cached_dev_status_update, -+ dc, -+ "bcache_status_update"); -+ if (IS_ERR(dc->status_update_thread)) { -+ pr_warn("failed to create bcache_status_update kthread, " -+ "continue to run without monitoring backing " -+ "device status"); -+ } - } - - /* -@@ -1128,6 +1181,8 @@ static void cached_dev_free(struct closure *cl) - kthread_stop(dc->writeback_thread); - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); -+ if (!IS_ERR_OR_NULL(dc->status_update_thread)) -+ kthread_stop(dc->status_update_thread); - - if (atomic_read(&dc->running)) - bd_unlink_disk_holder(dc->bdev, dc->disk.disk); --- -2.16.1 - |