diff options
author | Coly Li <colyli@suse.de> | 2019-08-15 00:57:56 +0800 |
---|---|---|
committer | Coly Li <colyli@suse.de> | 2019-08-15 00:57:56 +0800 |
commit | d38cc59c844c41b3e53b35735298dcca7b82799e (patch) | |
tree | d174217b6c1339a7fbe95669ad96fa7df4d240f9 | |
parent | f7c4bd7d9a005cbd47c475b09227695ddfe87eb0 (diff) | |
download | bcache-patches-d38cc59c844c41b3e53b35735298dcca7b82799e.tar.gz |
update for-next and for-test
45 files changed, 610 insertions, 2989 deletions
diff --git a/for-current/0000-cover-letter.patch b/for-current/0000-cover-letter.patch deleted file mode 100644 index 506d7d6..0000000 --- a/for-current/0000-cover-letter.patch +++ /dev/null @@ -1,92 +0,0 @@ -From ea4bf18c9eb2ef705dce00b1bc5fde2f49ef2740 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Thu, 27 Jun 2019 23:29:25 +0800 -Subject: [PATCH 00/37] bcache patches for Linux v5.3 - -Hi Jens, - -Here are the bcache patches for Linux v5.3. All these patches are -tested for a while and survived from my smoking and pressure testings. - -This run Alexandru Ardelean contributes a clean up patch. The -rest of the patches are from me; there is an important race fix which -involves the following patches, -- bcache: Revert "bcache: free heap cache_set->flush_btree in - bch_journal_free" -- bcache: Revert "bcache: fix high CPU occupancy during journal" -- bcache: remove retry_flush_write from struct cache_set -- bcache: fix race in btree_flush_write() -- bcache: performance improvement for btree_flush_write() -- bcache: add reclaimed_journal_buckets to struct cache_set -On a Lenovo SR650 server (48 cores, 200G dram, 1T NVMe SSD as cache -device and 12T NVMe SSD as backing device), without this fix, bcache -can only run around 40 minutes before deadlock or panic happens. Now -I don't observe any deadlock or panic for 5+ hours smoking test. - -Please pick them for Linux v5.3, and thank you in advance. 
- -Coly Li ---- - -Alexandru Ardelean (1): - bcache: use sysfs_match_string() instead of __sysfs_match_string() - -Coly Li (36): - bcache: don't set max writeback rate if gc is running - bcache: check c->gc_thread by IS_ERR_OR_NULL in cache_set_flush() - bcache: fix return value error in bch_journal_read() - Revert "bcache: set CACHE_SET_IO_DISABLE in bch_cached_dev_error()" - bcache: avoid flushing btree node in cache_set_flush() if io disabled - bcache: ignore read-ahead request failure on backing device - bcache: add io error counting in write_bdev_super_endio() - bcache: remove unnecessary prefetch() in bset_search_tree() - bcache: add return value check to bch_cached_dev_run() - bcache: remove unncessary code in bch_btree_keys_init() - bcache: check CACHE_SET_IO_DISABLE in allocator code - bcache: check CACHE_SET_IO_DISABLE bit in bch_journal() - bcache: more detailed error message to bcache_device_link() - bcache: add more error message in bch_cached_dev_attach() - bcache: improve error message in bch_cached_dev_run() - bcache: remove "XXX:" comment line from run_cache_set() - bcache: make bset_search_tree() be more understandable - bcache: add pendings_cleanup to stop pending bcache device - bcache: fix mistaken sysfs entry for io_error counter - bcache: destroy dc->writeback_write_wq if failed to create - dc->writeback_thread - bcache: stop writeback kthread and kworker when bch_cached_dev_run() - failed - bcache: avoid a deadlock in bcache_reboot() - bcache: acquire bch_register_lock later in cached_dev_detach_finish() - bcache: acquire bch_register_lock later in cached_dev_free() - bcache: fix potential deadlock in cached_def_free() - bcache: add code comments for journal_read_bucket() - bcache: set largest seq to ja->seq[bucket_index] in - journal_read_bucket() - bcache: shrink btree node cache after bch_btree_check() - bcache: Revert "bcache: free heap cache_set->flush_btree in - bch_journal_free" - bcache: Revert "bcache: fix high CPU occupancy during 
journal" - bcache: only clear BTREE_NODE_dirty bit when it is set - bcache: add comments for mutex_lock(&b->write_lock) - bcache: remove retry_flush_write from struct cache_set - bcache: fix race in btree_flush_write() - bcache: performance improvement for btree_flush_write() - bcache: add reclaimed_journal_buckets to struct cache_set - - drivers/md/bcache/alloc.c | 9 ++ - drivers/md/bcache/bcache.h | 6 +- - drivers/md/bcache/bset.c | 61 ++++-------- - drivers/md/bcache/btree.c | 53 ++++++++-- - drivers/md/bcache/btree.h | 2 + - drivers/md/bcache/io.c | 12 +++ - drivers/md/bcache/journal.c | 141 ++++++++++++++++++-------- - drivers/md/bcache/journal.h | 4 + - drivers/md/bcache/super.c | 227 ++++++++++++++++++++++++++++++++++-------- - drivers/md/bcache/sysfs.c | 67 +++++++++---- - drivers/md/bcache/util.h | 2 - - drivers/md/bcache/writeback.c | 8 ++ - 12 files changed, 432 insertions(+), 160 deletions(-) - --- -2.16.4 - diff --git a/for-current/0001-bcache-don-t-set-max-writeback-rate-if-gc-is-running.patch b/for-current/0001-bcache-don-t-set-max-writeback-rate-if-gc-is-running.patch deleted file mode 100644 index 1cdab6a..0000000 --- a/for-current/0001-bcache-don-t-set-max-writeback-rate-if-gc-is-running.patch +++ /dev/null @@ -1,41 +0,0 @@ -From e58f5f253e35ac1ccbe0dd4db2b71783a913c79b Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Fri, 21 Jun 2019 01:46:20 +0800 -Subject: [PATCH 01/37] bcache: don't set max writeback rate if gc is running - -When gc is running, user space I/O processes may wait inside -bcache code, so no new I/O coming. Indeed this is not a real idle -time, maximum writeback rate should not be set in such situation. -Otherwise a faster writeback thread may compete locks with gc thread -and makes garbage collection slower, which results a longer I/O -freeze period. - -This patch checks c->gc_mark_valid in set_at_max_writeback_rate(). 
If -c->gc_mark_valid is 0 (gc running), set_at_max_writeback_rate() returns -false, then update_writeback_rate() will not set writeback rate to -maximum value even c->idle_counter reaches an idle threshold. - -Now writeback thread won't interfere gc thread performance. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/writeback.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 73f0efac2b9f..262f7ef20992 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -122,6 +122,9 @@ static void __update_writeback_rate(struct cached_dev *dc) - static bool set_at_max_writeback_rate(struct cache_set *c, - struct cached_dev *dc) - { -+ /* Don't set max writeback rate if gc is running */ -+ if (!c->gc_mark_valid) -+ return false; - /* - * Idle_counter is increased everytime when update_writeback_rate() is - * called. If all backing devices attached to the same cache set have --- -2.16.4 - diff --git a/for-current/0002-bcache-check-c-gc_thread-by-IS_ERR_OR_NULL-in-cache_.patch b/for-current/0002-bcache-check-c-gc_thread-by-IS_ERR_OR_NULL-in-cache_.patch deleted file mode 100644 index 1166f2c..0000000 --- a/for-current/0002-bcache-check-c-gc_thread-by-IS_ERR_OR_NULL-in-cache_.patch +++ /dev/null @@ -1,125 +0,0 @@ -From 4f14821794c4cacb35e305ea347095e8ae70d871 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 22 Jun 2019 23:04:36 +0800 -Subject: [PATCH 02/37] bcache: check c->gc_thread by IS_ERR_OR_NULL in - cache_set_flush() - -When system memory is in heavy pressure, bch_gc_thread_start() from -run_cache_set() may fail due to out of memory. In such condition, -c->gc_thread is assigned to -ENOMEM, not NULL pointer. 
Then in following -failure code path bch_cache_set_error(), when cache_set_flush() gets -called, the code piece to stop c->gc_thread is broken, - if (c->gc_thread) - kthread_stop(c->gc_thread); - -And KASAN catches such NULL pointer dereference problem, with the warning -information: - -[ 561.207881] ================================================================== -[ 561.207900] BUG: KASAN: null-ptr-deref in kthread_stop+0x3b/0x440 -[ 561.207904] Write of size 4 at addr 000000000000001c by task kworker/15:1/313 - -[ 561.207913] CPU: 15 PID: 313 Comm: kworker/15:1 Tainted: G W 5.0.0-vanilla+ #3 -[ 561.207916] Hardware name: Lenovo ThinkSystem SR650 -[7X05CTO1WW]-/-[7X05CTO1WW]-, BIOS -[IVE136T-2.10]- 03/22/2019 -[ 561.207935] Workqueue: events cache_set_flush [bcache] -[ 561.207940] Call Trace: -[ 561.207948] dump_stack+0x9a/0xeb -[ 561.207955] ? kthread_stop+0x3b/0x440 -[ 561.207960] ? kthread_stop+0x3b/0x440 -[ 561.207965] kasan_report+0x176/0x192 -[ 561.207973] ? kthread_stop+0x3b/0x440 -[ 561.207981] kthread_stop+0x3b/0x440 -[ 561.207995] cache_set_flush+0xd4/0x6d0 [bcache] -[ 561.208008] process_one_work+0x856/0x1620 -[ 561.208015] ? find_held_lock+0x39/0x1d0 -[ 561.208028] ? drain_workqueue+0x380/0x380 -[ 561.208048] worker_thread+0x87/0xb80 -[ 561.208058] ? __kthread_parkme+0xb6/0x180 -[ 561.208067] ? process_one_work+0x1620/0x1620 -[ 561.208072] kthread+0x326/0x3e0 -[ 561.208079] ? 
kthread_create_worker_on_cpu+0xc0/0xc0 -[ 561.208090] ret_from_fork+0x3a/0x50 -[ 561.208110] ================================================================== -[ 561.208113] Disabling lock debugging due to kernel taint -[ 561.208115] irq event stamp: 11800231 -[ 561.208126] hardirqs last enabled at (11800231): [<ffffffff83008538>] do_syscall_64+0x18/0x410 -[ 561.208127] BUG: unable to handle kernel NULL pointer dereference at 000000000000001c -[ 561.208129] #PF error: [WRITE] -[ 561.312253] hardirqs last disabled at (11800230): [<ffffffff830052ff>] trace_hardirqs_off_thunk+0x1a/0x1c -[ 561.312259] softirqs last enabled at (11799832): [<ffffffff850005c7>] __do_softirq+0x5c7/0x8c3 -[ 561.405975] PGD 0 P4D 0 -[ 561.442494] softirqs last disabled at (11799821): [<ffffffff831add2c>] irq_exit+0x1ac/0x1e0 -[ 561.791359] Oops: 0002 [#1] SMP KASAN NOPTI -[ 561.791362] CPU: 15 PID: 313 Comm: kworker/15:1 Tainted: G B W 5.0.0-vanilla+ #3 -[ 561.791363] Hardware name: Lenovo ThinkSystem SR650 -[7X05CTO1WW]-/-[7X05CTO1WW]-, BIOS -[IVE136T-2.10]- 03/22/2019 -[ 561.791371] Workqueue: events cache_set_flush [bcache] -[ 561.791374] RIP: 0010:kthread_stop+0x3b/0x440 -[ 561.791376] Code: 00 00 65 8b 05 26 d5 e0 7c 89 c0 48 0f a3 05 ec aa df 02 0f 82 dc 02 00 00 4c 8d 63 20 be 04 00 00 00 4c 89 e7 e8 65 c5 53 00 <f0> ff 43 20 48 8d 7b 24 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 -[ 561.791377] RSP: 0018:ffff88872fc8fd10 EFLAGS: 00010286 -[ 561.838895] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. -[ 561.838916] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. -[ 561.838934] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. -[ 561.838948] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. -[ 561.838966] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. -[ 561.838979] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. 
-[ 561.838996] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. -[ 563.067028] RAX: 0000000000000000 RBX: fffffffffffffffc RCX: ffffffff832dd314 -[ 563.067030] RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000297 -[ 563.067032] RBP: ffff88872fc8fe88 R08: fffffbfff0b8213d R09: fffffbfff0b8213d -[ 563.067034] R10: 0000000000000001 R11: fffffbfff0b8213c R12: 000000000000001c -[ 563.408618] R13: ffff88dc61cc0f68 R14: ffff888102b94900 R15: ffff88dc61cc0f68 -[ 563.408620] FS: 0000000000000000(0000) GS:ffff888f7dc00000(0000) knlGS:0000000000000000 -[ 563.408622] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 -[ 563.408623] CR2: 000000000000001c CR3: 0000000f48a1a004 CR4: 00000000007606e0 -[ 563.408625] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 -[ 563.408627] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 -[ 563.904795] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. -[ 563.915796] PKRU: 55555554 -[ 563.915797] Call Trace: -[ 563.915807] cache_set_flush+0xd4/0x6d0 [bcache] -[ 563.915812] process_one_work+0x856/0x1620 -[ 564.001226] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. -[ 564.033563] ? find_held_lock+0x39/0x1d0 -[ 564.033567] ? drain_workqueue+0x380/0x380 -[ 564.033574] worker_thread+0x87/0xb80 -[ 564.062823] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. -[ 564.118042] ? __kthread_parkme+0xb6/0x180 -[ 564.118046] ? process_one_work+0x1620/0x1620 -[ 564.118048] kthread+0x326/0x3e0 -[ 564.118050] ? kthread_create_worker_on_cpu+0xc0/0xc0 -[ 564.167066] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. 
-[ 564.252441] ret_from_fork+0x3a/0x50 -[ 564.252447] Modules linked in: msr rpcrdma sunrpc rdma_ucm ib_iser ib_umad rdma_cm ib_ipoib i40iw configfs iw_cm ib_cm libiscsi scsi_transport_iscsi mlx4_ib ib_uverbs mlx4_en ib_core nls_iso8859_1 nls_cp437 vfat fat intel_rapl skx_edac x86_pkg_temp_thermal coretemp iTCO_wdt iTCO_vendor_support crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel ses raid0 aesni_intel cdc_ether enclosure usbnet ipmi_ssif joydev aes_x86_64 i40e scsi_transport_sas mii bcache md_mod crypto_simd mei_me ioatdma crc64 ptp cryptd pcspkr i2c_i801 mlx4_core glue_helper pps_core mei lpc_ich dca wmi ipmi_si ipmi_devintf nd_pmem dax_pmem nd_btt ipmi_msghandler device_dax pcc_cpufreq button hid_generic usbhid mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect xhci_pci sysimgblt fb_sys_fops xhci_hcd ttm megaraid_sas drm usbcore nfit libnvdimm sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua efivarfs -[ 564.299390] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. -[ 564.348360] CR2: 000000000000001c -[ 564.348362] ---[ end trace b7f0e5cc7b2103b0 ]--- - -Therefore, it is not enough to only check whether c->gc_thread is NULL, -we should use IS_ERR_OR_NULL() to check both NULL pointer and error -value. 
- -This patch changes the above buggy code piece in this way, - if (!IS_ERR_OR_NULL(c->gc_thread)) - kthread_stop(c->gc_thread); - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 1b63ac876169..64d9de89a63f 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1564,7 +1564,7 @@ static void cache_set_flush(struct closure *cl) - kobject_put(&c->internal); - kobject_del(&c->kobj); - -- if (c->gc_thread) -+ if (!IS_ERR_OR_NULL(c->gc_thread)) - kthread_stop(c->gc_thread); - - if (!IS_ERR_OR_NULL(c->root)) --- -2.16.4 - diff --git a/for-current/0003-bcache-fix-return-value-error-in-bch_journal_read.patch b/for-current/0003-bcache-fix-return-value-error-in-bch_journal_read.patch deleted file mode 100644 index 75a6fd1..0000000 --- a/for-current/0003-bcache-fix-return-value-error-in-bch_journal_read.patch +++ /dev/null @@ -1,42 +0,0 @@ -From b941c10de9c1619ef8598663123ac0e637e23b72 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 4 Jun 2019 14:43:08 +0800 -Subject: [PATCH 03/37] bcache: fix return value error in bch_journal_read() - -When everything is OK in bch_journal_read(), finally the return value -is returned by, - return ret; -which assumes ret will be 0 here. This assumption is wrong when all -journal buckets are full and filled with valid journal entries. In -such case the last location referencing read_bucket() sets 'ret' to -1, which means new jset added into jset list. The jset list is list -'journal' in caller run_cache_set(). - -Return 1 to run_cache_set() means something wrong and the cache set -won't start, but indeed everything is OK. - -This patch changes the line at end of bch_journal_read() to directly -return 0 since everything is good. Then a bogus error -is fixed. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 12dae9348147..4e5fc05720fc 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -268,7 +268,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) - struct journal_replay, - list)->j.seq; - -- return ret; -+ return 0; - #undef read_bucket - } - --- -2.16.4 - diff --git a/for-current/0004-Revert-bcache-set-CACHE_SET_IO_DISABLE-in-bch_cached.patch b/for-current/0004-Revert-bcache-set-CACHE_SET_IO_DISABLE-in-bch_cached.patch deleted file mode 100644 index 13f5fe1..0000000 --- a/for-current/0004-Revert-bcache-set-CACHE_SET_IO_DISABLE-in-bch_cached.patch +++ /dev/null @@ -1,64 +0,0 @@ -From 89e69ecae72e064bab278cdee6d391f5fcb732b3 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 4 Jun 2019 15:00:46 +0800 -Subject: [PATCH 04/37] Revert "bcache: set CACHE_SET_IO_DISABLE in - bch_cached_dev_error()" - -This reverts commit 6147305c73e4511ca1a975b766b97a779d442567. - -Although this patch helps the failed bcache device to stop faster when -too many I/O errors detected on corresponding cached device, setting -CACHE_SET_IO_DISABLE bit to cache set c->flags was not a good idea. This -operation will disable all I/Os on cache set, which means other attached -bcache devices won't work neither. - -Without this patch, the failed bcache device can also be stopped -eventually if internal I/O accomplished (e.g. writeback). Therefore here -I revert it. 
- -Fixes: 6147305c73e4 ("bcache: set CACHE_SET_IO_DISABLE in bch_cached_dev_error()") -Reported-by: Yong Li <mr.liyong@qq.com> -Signed-off-by: Coly Li <colyli@suse.de> -Cc: stable@vger.kernel.org ---- - drivers/md/bcache/super.c | 17 ----------------- - 1 file changed, 17 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 64d9de89a63f..ba2ad093bc80 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1437,8 +1437,6 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) - - bool bch_cached_dev_error(struct cached_dev *dc) - { -- struct cache_set *c; -- - if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) - return false; - -@@ -1449,21 +1447,6 @@ bool bch_cached_dev_error(struct cached_dev *dc) - pr_err("stop %s: too many IO errors on backing device %s\n", - dc->disk.disk->disk_name, dc->backing_dev_name); - -- /* -- * If the cached device is still attached to a cache set, -- * even dc->io_disable is true and no more I/O requests -- * accepted, cache device internal I/O (writeback scan or -- * garbage collection) may still prevent bcache device from -- * being stopped. So here CACHE_SET_IO_DISABLE should be -- * set to c->flags too, to make the internal I/O to cache -- * device rejected and stopped immediately. -- * If c is NULL, that means the bcache device is not attached -- * to any cache set, then no CACHE_SET_IO_DISABLE bit to set. 
-- */ -- c = dc->disk.c; -- if (c && test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) -- pr_info("CACHE_SET_IO_DISABLE already set"); -- - bcache_device_stop(&dc->disk); - return true; - } --- -2.16.4 - diff --git a/for-current/0005-bcache-avoid-flushing-btree-node-in-cache_set_flush-.patch b/for-current/0005-bcache-avoid-flushing-btree-node-in-cache_set_flush-.patch deleted file mode 100644 index 2df04c2..0000000 --- a/for-current/0005-bcache-avoid-flushing-btree-node-in-cache_set_flush-.patch +++ /dev/null @@ -1,53 +0,0 @@ -From fc406bc07ad1e5718f3be439111f95001a2bcf9c Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Thu, 23 May 2019 23:18:10 +0800 -Subject: [PATCH 05/37] bcache: avoid flushing btree node in cache_set_flush() - if io disabled - -When cache_set_flush() is called for too many I/O errors detected on -cache device and the cache set is retiring, inside the function it -doesn't make sense to flushing cached btree nodes from c->btree_cache -because CACHE_SET_IO_DISABLE is set on c->flags already and all I/Os -onto cache device will be rejected. - -This patch checks in cache_set_flush() that whether CACHE_SET_IO_DISABLE -is set. If yes, then avoids to flush the cached btree nodes to reduce -more time and make cache set retiring more faster. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 18 +++++++++++------- - 1 file changed, 11 insertions(+), 7 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index ba2ad093bc80..dc6702c2c4b6 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1553,13 +1553,17 @@ static void cache_set_flush(struct closure *cl) - if (!IS_ERR_OR_NULL(c->root)) - list_add(&c->root->list, &c->btree_cache); - -- /* Should skip this if we're unregistering because of an error */ -- list_for_each_entry(b, &c->btree_cache, list) { -- mutex_lock(&b->write_lock); -- if (btree_node_dirty(b)) -- __bch_btree_node_write(b, NULL); -- mutex_unlock(&b->write_lock); -- } -+ /* -+ * Avoid flushing cached nodes if cache set is retiring -+ * due to too many I/O errors detected. -+ */ -+ if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags)) -+ list_for_each_entry(b, &c->btree_cache, list) { -+ mutex_lock(&b->write_lock); -+ if (btree_node_dirty(b)) -+ __bch_btree_node_write(b, NULL); -+ mutex_unlock(&b->write_lock); -+ } - - for_each_cache(ca, c, i) - if (ca->alloc_thread) --- -2.16.4 - diff --git a/for-current/0006-bcache-ignore-read-ahead-request-failure-on-backing-.patch b/for-current/0006-bcache-ignore-read-ahead-request-failure-on-backing-.patch deleted file mode 100644 index b43866a..0000000 --- a/for-current/0006-bcache-ignore-read-ahead-request-failure-on-backing-.patch +++ /dev/null @@ -1,56 +0,0 @@ -From b864955e61393f70425c704ff2f16df72f508eb9 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 13 May 2019 22:48:09 +0800 -Subject: [PATCH 06/37] bcache: ignore read-ahead request failure on backing - device - -When md raid device (e.g. raid456) is used as backing device, read-ahead -requests on a degrading and recovering md raid device might be failured -immediately by md raid code, but indeed this md raid array can still be -read or write for normal I/O requests. 
Therefore such failed read-ahead -request are not real hardware failure. Further more, after degrading and -recovering accomplished, read-ahead requests will be handled by md raid -array again. - -For such condition, I/O failures of read-ahead requests don't indicate -real health status (because normal I/O still be served), they should not -be counted into I/O error counter dc->io_errors. - -Since there is no simple way to detect whether the backing divice is a -md raid device, this patch simply ignores I/O failures for read-ahead -bios on backing device, to avoid bogus backing device failure on a -degrading md raid array. - -Suggested-and-tested-by: Thorsten Knabe <linux@thorsten-knabe.de> -Signed-off-by: Coly Li <colyli@suse.de> -Cc: stable@vger.kernel.org ---- - drivers/md/bcache/io.c | 12 ++++++++++++ - 1 file changed, 12 insertions(+) - -diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c -index c25097968319..4d93f07f63e5 100644 ---- a/drivers/md/bcache/io.c -+++ b/drivers/md/bcache/io.c -@@ -58,6 +58,18 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) - - WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); - -+ /* -+ * Read-ahead requests on a degrading and recovering md raid -+ * (e.g. raid6) device might be failured immediately by md -+ * raid code, which is not a real hardware media failure. So -+ * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors. 
-+ */ -+ if (bio->bi_opf & REQ_RAHEAD) { -+ pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore", -+ dc->backing_dev_name); -+ return; -+ } -+ - errors = atomic_add_return(1, &dc->io_errors); - if (errors < dc->error_limit) - pr_err("%s: IO error on backing device, unrecoverable", --- -2.16.4 - diff --git a/for-current/0007-bcache-add-io-error-counting-in-write_bdev_super_end.patch b/for-current/0007-bcache-add-io-error-counting-in-write_bdev_super_end.patch deleted file mode 100644 index c4e916c..0000000 --- a/for-current/0007-bcache-add-io-error-counting-in-write_bdev_super_end.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 9776d8bfe0f0706004cdb083e5954aec718aa931 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 13 May 2019 23:42:39 +0800 -Subject: [PATCH 07/37] bcache: add io error counting in - write_bdev_super_endio() - -When backing device super block is written by bch_write_bdev_super(), -the bio complete callback write_bdev_super_endio() simply ignores I/O -status. Indeed such write request also contribute to backing device -health status if the request failed. - -This patch checkes bio->bi_status in write_bdev_super_endio(), if there -is error, bch_count_backing_io_errors() will be called to count an I/O -error to dc->io_errors. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index dc6702c2c4b6..73466bda12a7 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -197,7 +197,9 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, - static void write_bdev_super_endio(struct bio *bio) - { - struct cached_dev *dc = bio->bi_private; -- /* XXX: error checking */ -+ -+ if (bio->bi_status) -+ bch_count_backing_io_errors(dc, bio); - - closure_put(&dc->sb_write); - } --- -2.16.4 - diff --git a/for-current/0008-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch b/for-current/0008-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch deleted file mode 100644 index ce26ff1..0000000 --- a/for-current/0008-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch +++ /dev/null @@ -1,56 +0,0 @@ -From 041674ba27a752cf1a14cc6564dbc436b3b11b51 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 14 May 2019 22:23:35 +0800 -Subject: [PATCH 08/37] bcache: remove unnecessary prefetch() in - bset_search_tree() - -In function bset_search_tree(), when p >= t->size, t->tree[0] will be -prefetched by the following code piece, - 974 unsigned int p = n << 4; - 975 - 976 p &= ((int) (p - t->size)) >> 31; - 977 - 978 prefetch(&t->tree[p]); - -The purpose of the above code is to avoid a branch instruction, but -when p >= t->size, prefetch(&t->tree[0]) has no positive performance -contribution at all. This patch avoids the unncessary prefetch by only -calling prefetch() when p < t->size. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bset.c | 16 ++-------------- - 1 file changed, 2 insertions(+), 14 deletions(-) - -diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c -index 268f1b685084..e36a108d3648 100644 ---- a/drivers/md/bcache/bset.c -+++ b/drivers/md/bcache/bset.c -@@ -970,22 +970,10 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, - unsigned int inorder, j, n = 1; - - do { -- /* -- * A bit trick here. -- * If p < t->size, (int)(p - t->size) is a minus value and -- * the most significant bit is set, right shifting 31 bits -- * gets 1. If p >= t->size, the most significant bit is -- * not set, right shifting 31 bits gets 0. -- * So the following 2 lines equals to -- * if (p >= t->size) -- * p = 0; -- * but a branch instruction is avoided. -- */ - unsigned int p = n << 4; - -- p &= ((int) (p - t->size)) >> 31; -- -- prefetch(&t->tree[p]); -+ if (p < t->size) -+ prefetch(&t->tree[p]); - - j = n; - f = &t->tree[j]; --- -2.16.4 - diff --git a/for-current/0009-bcache-use-sysfs_match_string-instead-of-__sysfs_mat.patch b/for-current/0009-bcache-use-sysfs_match_string-instead-of-__sysfs_mat.patch deleted file mode 100644 index b21d257..0000000 --- a/for-current/0009-bcache-use-sysfs_match_string-instead-of-__sysfs_mat.patch +++ /dev/null @@ -1,97 +0,0 @@ -From 44567cf0d395784d1f95120ed170354d470b6116 Mon Sep 17 00:00:00 2001 -From: Alexandru Ardelean <alexandru.ardelean@analog.com> -Date: Tue, 7 May 2019 12:43:12 +0300 -Subject: [PATCH 09/37] bcache: use sysfs_match_string() instead of - __sysfs_match_string() - -The arrays (of strings) that are passed to __sysfs_match_string() are -static, so use sysfs_match_string() which does an implicit ARRAY_SIZE() -over these arrays. - -Functionally, this doesn't change anything. -The change is more cosmetic. - -It only shrinks the static arrays by 1 byte each. 
- -Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com> -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/sysfs.c | 20 ++++++++------------ - 1 file changed, 8 insertions(+), 12 deletions(-) - -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index bfb437ffb13c..760cf8951338 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -21,28 +21,24 @@ static const char * const bch_cache_modes[] = { - "writethrough", - "writeback", - "writearound", -- "none", -- NULL -+ "none" - }; - - /* Default is 0 ("auto") */ - static const char * const bch_stop_on_failure_modes[] = { - "auto", -- "always", -- NULL -+ "always" - }; - - static const char * const cache_replacement_policies[] = { - "lru", - "fifo", -- "random", -- NULL -+ "random" - }; - - static const char * const error_actions[] = { - "unregister", -- "panic", -- NULL -+ "panic" - }; - - write_attribute(attach); -@@ -333,7 +329,7 @@ STORE(__cached_dev) - bch_cached_dev_run(dc); - - if (attr == &sysfs_cache_mode) { -- v = __sysfs_match_string(bch_cache_modes, -1, buf); -+ v = sysfs_match_string(bch_cache_modes, buf); - if (v < 0) - return v; - -@@ -344,7 +340,7 @@ STORE(__cached_dev) - } - - if (attr == &sysfs_stop_when_cache_set_failed) { -- v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf); -+ v = sysfs_match_string(bch_stop_on_failure_modes, buf); - if (v < 0) - return v; - -@@ -799,7 +795,7 @@ STORE(__bch_cache_set) - 0, UINT_MAX); - - if (attr == &sysfs_errors) { -- v = __sysfs_match_string(error_actions, -1, buf); -+ v = sysfs_match_string(error_actions, buf); - if (v < 0) - return v; - -@@ -1063,7 +1059,7 @@ STORE(__bch_cache) - } - - if (attr == &sysfs_cache_replacement_policy) { -- v = __sysfs_match_string(cache_replacement_policies, -1, buf); -+ v = sysfs_match_string(cache_replacement_policies, buf); - if (v < 0) - return v; - --- -2.16.4 - diff --git a/for-current/0010-bcache-add-return-value-check-to-bch_cached_dev_run.patch 
b/for-current/0010-bcache-add-return-value-check-to-bch_cached_dev_run.patch deleted file mode 100644 index 81058cc..0000000 --- a/for-current/0010-bcache-add-return-value-check-to-bch_cached_dev_run.patch +++ /dev/null @@ -1,152 +0,0 @@ -From 3c6554692a3361189936d5dbdcc490ee7bf86eb6 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 21 May 2019 22:16:38 +0800 -Subject: [PATCH 10/37] bcache: add return value check to bch_cached_dev_run() - -This patch adds return value check to bch_cached_dev_run(), now if there -is error happens inside bch_cached_dev_run(), it can be catched. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bcache.h | 2 +- - drivers/md/bcache/super.c | 33 ++++++++++++++++++++++++++------- - drivers/md/bcache/sysfs.c | 7 +++++-- - 3 files changed, 32 insertions(+), 10 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index fdf75352e16a..73a97586a2ef 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -1006,7 +1006,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size); - int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, - uint8_t *set_uuid); - void bch_cached_dev_detach(struct cached_dev *dc); --void bch_cached_dev_run(struct cached_dev *dc); -+int bch_cached_dev_run(struct cached_dev *dc); - void bcache_device_stop(struct bcache_device *d); - - void bch_cache_set_unregister(struct cache_set *c); -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 73466bda12a7..0abee44092bf 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -910,7 +910,7 @@ static int cached_dev_status_update(void *arg) - } - - --void bch_cached_dev_run(struct cached_dev *dc) -+int bch_cached_dev_run(struct cached_dev *dc) - { - struct bcache_device *d = &dc->disk; - char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); -@@ -921,11 +921,14 @@ void bch_cached_dev_run(struct cached_dev *dc) - 
NULL, - }; - -+ if (dc->io_disable) -+ return -EIO; -+ - if (atomic_xchg(&dc->running, 1)) { - kfree(env[1]); - kfree(env[2]); - kfree(buf); -- return; -+ return -EBUSY; - } - - if (!d->c && -@@ -951,8 +954,11 @@ void bch_cached_dev_run(struct cached_dev *dc) - kfree(buf); - - if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || -- sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) -+ sysfs_create_link(&disk_to_dev(d->disk)->kobj, -+ &d->kobj, "bcache")) { - pr_debug("error creating sysfs link"); -+ return -ENOMEM; -+ } - - dc->status_update_thread = kthread_run(cached_dev_status_update, - dc, "bcache_status_update"); -@@ -961,6 +967,8 @@ void bch_cached_dev_run(struct cached_dev *dc) - "continue to run without monitoring backing " - "device status"); - } -+ -+ return 0; - } - - /* -@@ -1056,6 +1064,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, - uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds()); - struct uuid_entry *u; - struct cached_dev *exist_dc, *t; -+ int ret = 0; - - if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || - (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) -@@ -1165,7 +1174,12 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, - - bch_sectors_dirty_init(&dc->disk); - -- bch_cached_dev_run(dc); -+ ret = bch_cached_dev_run(dc); -+ if (ret && (ret != -EBUSY)) { -+ up_write(&dc->writeback_lock); -+ return ret; -+ } -+ - bcache_device_link(&dc->disk, c, "bdev"); - atomic_inc(&c->attached_dev_nr); - -@@ -1292,6 +1306,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, - { - const char *err = "cannot allocate memory"; - struct cache_set *c; -+ int ret = -ENOMEM; - - bdevname(bdev, dc->backing_dev_name); - memcpy(&dc->sb, sb, sizeof(struct cache_sb)); -@@ -1321,14 +1336,18 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, - bch_cached_dev_attach(dc, c, NULL); - - if (BDEV_STATE(&dc->sb) == 
BDEV_STATE_NONE || -- BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) -- bch_cached_dev_run(dc); -+ BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) { -+ err = "failed to run cached device"; -+ ret = bch_cached_dev_run(dc); -+ if (ret) -+ goto err; -+ } - - return 0; - err: - pr_notice("error %s: %s", dc->backing_dev_name, err); - bcache_device_stop(&dc->disk); -- return -EIO; -+ return ret; - } - - /* Flash only volumes */ -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index 760cf8951338..eb678e43ac00 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -325,8 +325,11 @@ STORE(__cached_dev) - bch_cache_accounting_clear(&dc->accounting); - - if (attr == &sysfs_running && -- strtoul_or_return(buf)) -- bch_cached_dev_run(dc); -+ strtoul_or_return(buf)) { -+ v = bch_cached_dev_run(dc); -+ if (v) -+ return v; -+ } - - if (attr == &sysfs_cache_mode) { - v = sysfs_match_string(bch_cache_modes, buf); --- -2.16.4 - diff --git a/for-current/0011-bcache-remove-unncessary-code-in-bch_btree_keys_init.patch b/for-current/0011-bcache-remove-unncessary-code-in-bch_btree_keys_init.patch deleted file mode 100644 index f76096e..0000000 --- a/for-current/0011-bcache-remove-unncessary-code-in-bch_btree_keys_init.patch +++ /dev/null @@ -1,72 +0,0 @@ -From 8cf2fe851139c27cafc5e52700b483793077611e Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 21 May 2019 22:36:35 +0800 -Subject: [PATCH 11/37] bcache: remove unncessary code in bch_btree_keys_init() - -Function bch_btree_keys_init() initializes b->set[].size and -b->set[].data to zero. As the code comments indicates, these code indeed -is unncessary, because both struct btree_keys and struct bset_tree are -nested embedded into struct btree, when struct btree is filled with 0 -bits by kzalloc() in mca_bucket_alloc(), b->set[].size and -b->set[].data are initialized to 0 (a.k.a NULL) already. 
- -This patch removes the redundant code, and add comments in -bch_btree_keys_init() and mca_bucket_alloc() to explain why it's safe. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bset.c | 15 ++++++--------- - drivers/md/bcache/btree.c | 4 ++++ - 2 files changed, 10 insertions(+), 9 deletions(-) - -diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c -index e36a108d3648..8af9509e78bd 100644 ---- a/drivers/md/bcache/bset.c -+++ b/drivers/md/bcache/bset.c -@@ -347,22 +347,19 @@ EXPORT_SYMBOL(bch_btree_keys_alloc); - void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, - bool *expensive_debug_checks) - { -- unsigned int i; -- - b->ops = ops; - b->expensive_debug_checks = expensive_debug_checks; - b->nsets = 0; - b->last_set_unwritten = 0; - -- /* XXX: shouldn't be needed */ -- for (i = 0; i < MAX_BSETS; i++) -- b->set[i].size = 0; - /* -- * Second loop starts at 1 because b->keys[0]->data is the memory we -- * allocated -+ * struct btree_keys in embedded in struct btree, and struct -+ * bset_tree is embedded into struct btree_keys. They are all -+ * initialized as 0 by kzalloc() in mca_bucket_alloc(), and -+ * b->set[0].data is allocated in bch_btree_keys_alloc(), so we -+ * don't have to initiate b->set[].size and b->set[].data here -+ * any more. - */ -- for (i = 1; i < MAX_BSETS; i++) -- b->set[i].data = NULL; - } - EXPORT_SYMBOL(bch_btree_keys_init); - -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index 773f5fdad25f..cf38a1b031fa 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ -613,6 +613,10 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) - static struct btree *mca_bucket_alloc(struct cache_set *c, - struct bkey *k, gfp_t gfp) - { -+ /* -+ * kzalloc() is necessary here for initialization, -+ * see code comments in bch_btree_keys_init(). 
-+ */ - struct btree *b = kzalloc(sizeof(struct btree), gfp); - - if (!b) --- -2.16.4 - diff --git a/for-current/0012-bcache-check-CACHE_SET_IO_DISABLE-in-allocator-code.patch b/for-current/0012-bcache-check-CACHE_SET_IO_DISABLE-in-allocator-code.patch deleted file mode 100644 index d50cf9a..0000000 --- a/for-current/0012-bcache-check-CACHE_SET_IO_DISABLE-in-allocator-code.patch +++ /dev/null @@ -1,52 +0,0 @@ -From 83696d19f199f99cd89b65fe5eb2ab1603b3bd2e Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 22 May 2019 21:55:09 +0800 -Subject: [PATCH 12/37] bcache: check CACHE_SET_IO_DISABLE in allocator code - -If CACHE_SET_IO_DISABLE of a cache set flag is set by too many I/O -errors, currently allocator routines can still continue allocate -space which may introduce inconsistent metadata state. - -This patch checks CACHE_SET_IO_DISABLE bit in following allocator -routines, -- bch_bucket_alloc() -- __bch_bucket_alloc_set() -Once CACHE_SET_IO_DISABLE is set on cache set, the allocator routines -may reject allocation request earlier to avoid potential inconsistent -metadata. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/alloc.c | 9 +++++++++ - 1 file changed, 9 insertions(+) - -diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c -index f8986effcb50..6f776823b9ba 100644 ---- a/drivers/md/bcache/alloc.c -+++ b/drivers/md/bcache/alloc.c -@@ -393,6 +393,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) - struct bucket *b; - long r; - -+ -+ /* No allocation if CACHE_SET_IO_DISABLE bit is set */ -+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags))) -+ return -1; -+ - /* fastpath */ - if (fifo_pop(&ca->free[RESERVE_NONE], r) || - fifo_pop(&ca->free[reserve], r)) -@@ -484,6 +489,10 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - { - int i; - -+ /* No allocation if CACHE_SET_IO_DISABLE bit is set */ -+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) -+ return -1; -+ - lockdep_assert_held(&c->bucket_lock); - BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET); - --- -2.16.4 - diff --git a/for-current/0013-bcache-check-CACHE_SET_IO_DISABLE-bit-in-bch_journal.patch b/for-current/0013-bcache-check-CACHE_SET_IO_DISABLE-bit-in-bch_journal.patch deleted file mode 100644 index 2fb8deb..0000000 --- a/for-current/0013-bcache-check-CACHE_SET_IO_DISABLE-bit-in-bch_journal.patch +++ /dev/null @@ -1,39 +0,0 @@ -From 538b8c892e46fce9c4ce5f26be7f471001054e21 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 22 May 2019 22:06:21 +0800 -Subject: [PATCH 13/37] bcache: check CACHE_SET_IO_DISABLE bit in bch_journal() - -When too many I/O errors happen on cache set and CACHE_SET_IO_DISABLE -bit is set, bch_journal() may continue to work because the journaling -bkey might be still in write set yet. The caller of bch_journal() may -believe the journal still work but the truth is in-memory journal write -set won't be written into cache device any more. This behavior may -introduce potential inconsistent metadata status. 
- -This patch checks CACHE_SET_IO_DISABLE bit at the head of bch_journal(), -if the bit is set, bch_journal() returns NULL immediately to notice -caller to know journal does not work. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 4e5fc05720fc..54f8886b6177 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -811,6 +811,10 @@ atomic_t *bch_journal(struct cache_set *c, - struct journal_write *w; - atomic_t *ret; - -+ /* No journaling if CACHE_SET_IO_DISABLE set already */ -+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) -+ return NULL; -+ - if (!CACHE_SYNC(&c->sb)) - return NULL; - --- -2.16.4 - diff --git a/for-current/0014-bcache-more-detailed-error-message-to-bcache_device_.patch b/for-current/0014-bcache-more-detailed-error-message-to-bcache_device_.patch deleted file mode 100644 index f1db3e8..0000000 --- a/for-current/0014-bcache-more-detailed-error-message-to-bcache_device_.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 3d5eca3da96c4dcfeb71a5947d1b916098f98090 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 1 Jun 2019 00:57:38 +0800 -Subject: [PATCH 14/37] bcache: more detailed error message to - bcache_device_link() - -This patch adds more accurate error message for specific -sysfs_create_link() call, to help debugging failure during -bcache device start up. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 11 ++++++++--- - 1 file changed, 8 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 0abee44092bf..d4d8d1300faf 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -693,6 +693,7 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, - { - unsigned int i; - struct cache *ca; -+ int ret; - - for_each_cache(ca, d->c, i) - bd_link_disk_holder(ca->bdev, d->disk); -@@ -700,9 +701,13 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, - snprintf(d->name, BCACHEDEVNAME_SIZE, - "%s%u", name, d->id); - -- WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || -- sysfs_create_link(&c->kobj, &d->kobj, d->name), -- "Couldn't create device <-> cache set symlinks"); -+ ret = sysfs_create_link(&d->kobj, &c->kobj, "cache"); -+ if (ret < 0) -+ pr_err("Couldn't create device -> cache set symlink"); -+ -+ ret = sysfs_create_link(&c->kobj, &d->kobj, d->name); -+ if (ret < 0) -+ pr_err("Couldn't create cache set -> device symlink"); - - clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags); - } --- -2.16.4 - diff --git a/for-current/0015-bcache-add-more-error-message-in-bch_cached_dev_atta.patch b/for-current/0015-bcache-add-more-error-message-in-bch_cached_dev_atta.patch deleted file mode 100644 index 5f566aa..0000000 --- a/for-current/0015-bcache-add-more-error-message-in-bch_cached_dev_atta.patch +++ /dev/null @@ -1,39 +0,0 @@ -From 4ece2cd8f76fd9c43d0f479f9e8c6f1d41a2c323 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 1 Jun 2019 01:03:00 +0800 -Subject: [PATCH 15/37] bcache: add more error message in - bch_cached_dev_attach() - -This patch adds more error message for attaching cached device, this is -helpful to debug code failure during bcache device start up. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index d4d8d1300faf..a836910ef368 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1169,6 +1169,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, - down_write(&dc->writeback_lock); - if (bch_cached_dev_writeback_start(dc)) { - up_write(&dc->writeback_lock); -+ pr_err("Couldn't start writeback facilities for %s", -+ dc->disk.disk->disk_name); - return -ENOMEM; - } - -@@ -1182,6 +1184,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, - ret = bch_cached_dev_run(dc); - if (ret && (ret != -EBUSY)) { - up_write(&dc->writeback_lock); -+ pr_err("Couldn't run cached device %s", -+ dc->backing_dev_name); - return ret; - } - --- -2.16.4 - diff --git a/for-current/0016-bcache-improve-error-message-in-bch_cached_dev_run.patch b/for-current/0016-bcache-improve-error-message-in-bch_cached_dev_run.patch deleted file mode 100644 index 777cc6f..0000000 --- a/for-current/0016-bcache-improve-error-message-in-bch_cached_dev_run.patch +++ /dev/null @@ -1,53 +0,0 @@ -From 1c01818c558a50c66e86dce4196c8ef525dcbf58 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 4 Jun 2019 23:12:10 +0800 -Subject: [PATCH 16/37] bcache: improve error message in bch_cached_dev_run() - -This patch adds more error message in bch_cached_dev_run() to indicate -the exact reason why an error value is returned. Please notice when -printing out the "is running already" message, pr_info() is used here, -because in this case also -EBUSY is returned, the bcache device can -continue to attach to the cache device and run, so it won't be an -error level message in kernel message. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 9 +++++++-- - 1 file changed, 7 insertions(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index a836910ef368..e9e6d653bf70 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -926,13 +926,18 @@ int bch_cached_dev_run(struct cached_dev *dc) - NULL, - }; - -- if (dc->io_disable) -+ if (dc->io_disable) { -+ pr_err("I/O disabled on cached dev %s", -+ dc->backing_dev_name); - return -EIO; -+ } - - if (atomic_xchg(&dc->running, 1)) { - kfree(env[1]); - kfree(env[2]); - kfree(buf); -+ pr_info("cached dev %s is running already", -+ dc->backing_dev_name); - return -EBUSY; - } - -@@ -961,7 +966,7 @@ int bch_cached_dev_run(struct cached_dev *dc) - if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || - sysfs_create_link(&disk_to_dev(d->disk)->kobj, - &d->kobj, "bcache")) { -- pr_debug("error creating sysfs link"); -+ pr_err("Couldn't create bcache dev <-> disk sysfs symlinks"); - return -ENOMEM; - } - --- -2.16.4 - diff --git a/for-current/0017-bcache-remove-XXX-comment-line-from-run_cache_set.patch b/for-current/0017-bcache-remove-XXX-comment-line-from-run_cache_set.patch deleted file mode 100644 index 7c374f6..0000000 --- a/for-current/0017-bcache-remove-XXX-comment-line-from-run_cache_set.patch +++ /dev/null @@ -1,31 +0,0 @@ -From d3227df55cfc8d09d733d48619663401bc3862d1 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 13 May 2019 23:47:38 +0800 -Subject: [PATCH 17/37] bcache: remove "XXX:" comment line from run_cache_set() - -In previous bcache patches for Linux v5.2, the failure code path of -run_cache_set() is tested and fixed. 
So now the following comment -line can be removed from run_cache_set(), - /* XXX: test this, it's broken */ - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index e9e6d653bf70..c53fe0f1629f 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1979,7 +1979,7 @@ static int run_cache_set(struct cache_set *c) - } - - closure_sync(&cl); -- /* XXX: test this, it's broken */ -+ - bch_cache_set_error(c, "%s", err); - - return -EIO; --- -2.16.4 - diff --git a/for-current/0018-bcache-make-bset_search_tree-be-more-understandable.patch b/for-current/0018-bcache-make-bset_search_tree-be-more-understandable.patch deleted file mode 100644 index 8b48fca..0000000 --- a/for-current/0018-bcache-make-bset_search_tree-be-more-understandable.patch +++ /dev/null @@ -1,70 +0,0 @@ -From bb73929cd5ef3fe192253d7f74afb448c13d01f7 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 14 May 2019 22:51:40 +0800 -Subject: [PATCH 18/37] bcache: make bset_search_tree() be more understandable - -The purpose of following code in bset_search_tree() is to avoid a branch -instruction, - 994 if (likely(f->exponent != 127)) - 995 n = j * 2 + (((unsigned int) - 996 (f->mantissa - - 997 bfloat_mantissa(search, f))) >> 31); - 998 else - 999 n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) -1000 ? j * 2 -1001 : j * 2 + 1; - -This piece of code is not very clear to understand, even when I tried to -add code comment for it, I made mistake. This patch removes the implict -bit operation and uses explicit branch to calculate next location in -binary tree search. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bset.c | 30 +++++++++++------------------- - 1 file changed, 11 insertions(+), 19 deletions(-) - -diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c -index 8af9509e78bd..08768796b543 100644 ---- a/drivers/md/bcache/bset.c -+++ b/drivers/md/bcache/bset.c -@@ -975,25 +975,17 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, - j = n; - f = &t->tree[j]; - -- /* -- * Similar bit trick, use subtract operation to avoid a branch -- * instruction. -- * -- * n = (f->mantissa > bfloat_mantissa()) -- * ? j * 2 -- * : j * 2 + 1; -- * -- * We need to subtract 1 from f->mantissa for the sign bit trick -- * to work - that's done in make_bfloat() -- */ -- if (likely(f->exponent != 127)) -- n = j * 2 + (((unsigned int) -- (f->mantissa - -- bfloat_mantissa(search, f))) >> 31); -- else -- n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) -- ? j * 2 -- : j * 2 + 1; -+ if (likely(f->exponent != 127)) { -+ if (f->mantissa >= bfloat_mantissa(search, f)) -+ n = j * 2; -+ else -+ n = j * 2 + 1; -+ } else { -+ if (bkey_cmp(tree_to_bkey(t, j), search) > 0) -+ n = j * 2; -+ else -+ n = j * 2 + 1; -+ } - } while (n < t->size); - - inorder = to_inorder(j, t); --- -2.16.4 - diff --git a/for-current/0019-bcache-add-pendings_cleanup-to-stop-pending-bcache-d.patch b/for-current/0019-bcache-add-pendings_cleanup-to-stop-pending-bcache-d.patch deleted file mode 100644 index 1e0d93d..0000000 --- a/for-current/0019-bcache-add-pendings_cleanup-to-stop-pending-bcache-d.patch +++ /dev/null @@ -1,107 +0,0 @@ -From 0d36cf832c884a5e8aac9dcf1739376a027026e0 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 20 Mar 2019 23:11:59 +0800 -Subject: [PATCH 19/37] bcache: add pendings_cleanup to stop pending bcache - device - -If a bcache device is in dirty state and its cache set is not -registered, this bcache device will not appear in /dev/bcache<N>, -and there is no way to stop it or remove the 
bcache kernel module. - -This is an as-designed behavior, but sometimes people has to reboot -whole system to release or stop the pending backing device. - -This sysfs interface may remove such pending bcache devices when -write anything into the sysfs file manually. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 55 insertions(+) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index c53fe0f1629f..c4c4b2d99dc2 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -2273,9 +2273,13 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, - - static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, - const char *buffer, size_t size); -+static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, -+ struct kobj_attribute *attr, -+ const char *buffer, size_t size); - - kobj_attribute_write(register, register_bcache); - kobj_attribute_write(register_quiet, register_bcache); -+kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); - - static bool bch_is_open_backing(struct block_device *bdev) - { -@@ -2400,6 +2404,56 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, - goto out; - } - -+ -+struct pdev { -+ struct list_head list; -+ struct cached_dev *dc; -+}; -+ -+static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, -+ struct kobj_attribute *attr, -+ const char *buffer, -+ size_t size) -+{ -+ LIST_HEAD(pending_devs); -+ ssize_t ret = size; -+ struct cached_dev *dc, *tdc; -+ struct pdev *pdev, *tpdev; -+ struct cache_set *c, *tc; -+ -+ mutex_lock(&bch_register_lock); -+ list_for_each_entry_safe(dc, tdc, &uncached_devices, list) { -+ pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL); -+ if (!pdev) -+ break; -+ pdev->dc = dc; -+ list_add(&pdev->list, &pending_devs); -+ } -+ -+ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { -+ 
list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { -+ char *pdev_set_uuid = pdev->dc->sb.set_uuid; -+ char *set_uuid = c->sb.uuid; -+ -+ if (!memcmp(pdev_set_uuid, set_uuid, 16)) { -+ list_del(&pdev->list); -+ kfree(pdev); -+ break; -+ } -+ } -+ } -+ mutex_unlock(&bch_register_lock); -+ -+ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { -+ pr_info("delete pdev %p", pdev); -+ list_del(&pdev->list); -+ bcache_device_stop(&pdev->dc->disk); -+ kfree(pdev); -+ } -+ -+ return ret; -+} -+ - static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) - { - if (code == SYS_DOWN || -@@ -2518,6 +2572,7 @@ static int __init bcache_init(void) - static const struct attribute *files[] = { - &ksysfs_register.attr, - &ksysfs_register_quiet.attr, -+ &ksysfs_pendings_cleanup.attr, - NULL - }; - --- -2.16.4 - diff --git a/for-current/0020-bcache-fix-mistaken-sysfs-entry-for-io_error-counter.patch b/for-current/0020-bcache-fix-mistaken-sysfs-entry-for-io_error-counter.patch deleted file mode 100644 index a6c25ca..0000000 --- a/for-current/0020-bcache-fix-mistaken-sysfs-entry-for-io_error-counter.patch +++ /dev/null @@ -1,43 +0,0 @@ -From e5d866519cd41e45afd27256dfa1bd9adc056331 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 16 Jun 2019 23:59:12 +0800 -Subject: [PATCH 20/37] bcache: fix mistaken sysfs entry for io_error counter - -In bch_cached_dev_files[] from driver/md/bcache/sysfs.c, sysfs_errors is -incorrectly inserted in. The correct entry should be sysfs_io_errors. - -This patch fixes the problem and now I/O errors of cached device can be -read from /sys/block/bcache<N>/bcache/io_errors. 
- -Fixes: c7b7bd07404c5 ("bcache: add io_disable to struct cached_dev") -Signed-off-by: Coly Li <colyli@suse.de> -Cc: stable@vger.kernel.org ---- - drivers/md/bcache/sysfs.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index eb678e43ac00..dddb8d4048ce 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -176,7 +176,7 @@ SHOW(__bch_cached_dev) - var_print(writeback_percent); - sysfs_hprint(writeback_rate, - wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0); -- sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); -+ sysfs_printf(io_errors, "%i", atomic_read(&dc->io_errors)); - sysfs_printf(io_error_limit, "%i", dc->error_limit); - sysfs_printf(io_disable, "%i", dc->io_disable); - var_print(writeback_rate_update_seconds); -@@ -463,7 +463,7 @@ static struct attribute *bch_cached_dev_files[] = { - &sysfs_writeback_rate_p_term_inverse, - &sysfs_writeback_rate_minimum, - &sysfs_writeback_rate_debug, -- &sysfs_errors, -+ &sysfs_io_errors, - &sysfs_io_error_limit, - &sysfs_io_disable, - &sysfs_dirty_data, --- -2.16.4 - diff --git a/for-current/0021-bcache-destroy-dc-writeback_write_wq-if-failed-to-cr.patch b/for-current/0021-bcache-destroy-dc-writeback_write_wq-if-failed-to-cr.patch deleted file mode 100644 index 6e40e1f..0000000 --- a/for-current/0021-bcache-destroy-dc-writeback_write_wq-if-failed-to-cr.patch +++ /dev/null @@ -1,35 +0,0 @@ -From dbc25640a571d5ee3d90380fcbaff5ce4a2b77ef Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 17 Jun 2019 00:06:58 +0800 -Subject: [PATCH 21/37] bcache: destroy dc->writeback_write_wq if failed to - create dc->writeback_thread - -Commit 9baf30972b55 ("bcache: fix for gc and write-back race") added a -new work queue dc->writeback_write_wq, but forgot to destroy it in the -error condition when creating dc->writeback_thread failed. 
- -This patch destroys dc->writeback_write_wq if kthread_create() returns -error pointer to dc->writeback_thread, then a memory leak is avoided. - -Fixes: 9baf30972b55 ("bcache: fix for gc and write-back race") -Signed-off-by: Coly Li <colyli@suse.de> -Cc: stable@vger.kernel.org ---- - drivers/md/bcache/writeback.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 262f7ef20992..21081febcb59 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -833,6 +833,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) - "bcache_writeback"); - if (IS_ERR(dc->writeback_thread)) { - cached_dev_put(dc); -+ destroy_workqueue(dc->writeback_write_wq); - return PTR_ERR(dc->writeback_thread); - } - dc->writeback_running = true; --- -2.16.4 - diff --git a/for-current/0022-bcache-stop-writeback-kthread-and-kworker-when-bch_c.patch b/for-current/0022-bcache-stop-writeback-kthread-and-kworker-when-bch_c.patch deleted file mode 100644 index 93d0881..0000000 --- a/for-current/0022-bcache-stop-writeback-kthread-and-kworker-when-bch_c.patch +++ /dev/null @@ -1,42 +0,0 @@ -From fd2668ce597ccaf68a102d6cda906e359e2de4b6 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 17 Jun 2019 23:03:02 +0800 -Subject: [PATCH 22/37] bcache: stop writeback kthread and kworker when - bch_cached_dev_run() failed - -In bch_cached_dev_attach() after bch_cached_dev_writeback_start() -called, the writeback kthread and writeback rate update kworker of the -cached device are created, if the following bch_cached_dev_run() -failed, bch_cached_dev_attach() will return with -ENOMEM without -stopping the writeback related kthread and kworker. - -This patch stops writeback kthread and writeback rate update kworker -before returning -ENOMEM if bch_cached_dev_run() returns error. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 8 ++++++++ - 1 file changed, 8 insertions(+) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index c4c4b2d99dc2..791cb930b353 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1189,6 +1189,14 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, - ret = bch_cached_dev_run(dc); - if (ret && (ret != -EBUSY)) { - up_write(&dc->writeback_lock); -+ /* -+ * bch_register_lock is held, bcache_device_stop() is not -+ * able to be directly called. The kthread and kworker -+ * created previously in bch_cached_dev_writeback_start() -+ * have to be stopped manually here. -+ */ -+ kthread_stop(dc->writeback_thread); -+ cancel_writeback_rate_update_dwork(dc); - pr_err("Couldn't run cached device %s", - dc->backing_dev_name); - return ret; --- -2.16.4 - diff --git a/for-current/0023-bcache-avoid-a-deadlock-in-bcache_reboot.patch b/for-current/0023-bcache-avoid-a-deadlock-in-bcache_reboot.patch deleted file mode 100644 index 128ea9f..0000000 --- a/for-current/0023-bcache-avoid-a-deadlock-in-bcache_reboot.patch +++ /dev/null @@ -1,211 +0,0 @@ -From dec31a9984ccded737345da74b8657dbf3c78ea4 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 21 May 2019 23:19:55 +0800 -Subject: [PATCH 23/37] bcache: avoid a deadlock in bcache_reboot() - -It is quite frequently to observe deadlock in bcache_reboot() happens -and hang the system reboot process. The reason is, in bcache_reboot() -when calling bch_cache_set_stop() and bcache_device_stop() the mutex -bch_register_lock is held. But in the process to stop cache set and -bcache device, bch_register_lock will be acquired again. If this mutex -is held here, deadlock will happen inside the stopping process. The -aftermath of the deadlock is, whole system reboot gets hung. 
- -The fix is to avoid holding bch_register_lock for the following loops -in bcache_reboot(), - list_for_each_entry_safe(c, tc, &bch_cache_sets, list) - bch_cache_set_stop(c); - - list_for_each_entry_safe(dc, tdc, &uncached_devices, list) - bcache_device_stop(&dc->disk); - -A module range variable 'bcache_is_reboot' is added, it sets to true -in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked -to be true, reject the registration by returning -EBUSY immediately. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 40 +++++++++++++++++++++++++++++++++++++++- - drivers/md/bcache/sysfs.c | 26 ++++++++++++++++++++++++++ - 2 files changed, 65 insertions(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 791cb930b353..a88238ad5da1 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -40,6 +40,7 @@ static const char invalid_uuid[] = { - - static struct kobject *bcache_kobj; - struct mutex bch_register_lock; -+bool bcache_is_reboot; - LIST_HEAD(bch_cache_sets); - static LIST_HEAD(uncached_devices); - -@@ -49,6 +50,7 @@ static wait_queue_head_t unregister_wait; - struct workqueue_struct *bcache_wq; - struct workqueue_struct *bch_journal_wq; - -+ - #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) - /* limitation of partitions number on single bcache device */ - #define BCACHE_MINORS 128 -@@ -2335,6 +2337,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, - if (!try_module_get(THIS_MODULE)) - return -EBUSY; - -+ /* For latest state of bcache_is_reboot */ -+ smp_mb(); -+ if (bcache_is_reboot) -+ return -EBUSY; -+ - path = kstrndup(buffer, size, GFP_KERNEL); - if (!path) - goto err; -@@ -2464,6 +2471,9 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, - - static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) - { -+ if (bcache_is_reboot) -+ return NOTIFY_DONE; -+ - if (code == SYS_DOWN || - code 
== SYS_HALT || - code == SYS_POWER_OFF) { -@@ -2476,19 +2486,45 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) - - mutex_lock(&bch_register_lock); - -+ if (bcache_is_reboot) -+ goto out; -+ -+ /* New registration is rejected since now */ -+ bcache_is_reboot = true; -+ /* -+ * Make registering caller (if there is) on other CPU -+ * core know bcache_is_reboot set to true earlier -+ */ -+ smp_mb(); -+ - if (list_empty(&bch_cache_sets) && - list_empty(&uncached_devices)) - goto out; - -+ mutex_unlock(&bch_register_lock); -+ - pr_info("Stopping all devices:"); - -+ /* -+ * The reason bch_register_lock is not held to call -+ * bch_cache_set_stop() and bcache_device_stop() is to -+ * avoid potential deadlock during reboot, because cache -+ * set or bcache device stopping process will acqurie -+ * bch_register_lock too. -+ * -+ * We are safe here because bcache_is_reboot sets to -+ * true already, register_bcache() will reject new -+ * registration now. bcache_is_reboot also makes sure -+ * bcache_reboot() won't be re-entered on by other thread, -+ * so there is no race in following list iteration by -+ * list_for_each_entry_safe(). 
-+ */ - list_for_each_entry_safe(c, tc, &bch_cache_sets, list) - bch_cache_set_stop(c); - - list_for_each_entry_safe(dc, tdc, &uncached_devices, list) - bcache_device_stop(&dc->disk); - -- mutex_unlock(&bch_register_lock); - - /* - * Give an early chance for other kthreads and -@@ -2616,6 +2652,8 @@ static int __init bcache_init(void) - bch_debug_init(); - closure_debug_init(); - -+ bcache_is_reboot = false; -+ - return 0; - err: - bcache_exit(); -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index dddb8d4048ce..d62e28643109 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -16,6 +16,8 @@ - #include <linux/sort.h> - #include <linux/sched/clock.h> - -+extern bool bcache_is_reboot; -+ - /* Default is 0 ("writethrough") */ - static const char * const bch_cache_modes[] = { - "writethrough", -@@ -267,6 +269,10 @@ STORE(__cached_dev) - struct cache_set *c; - struct kobj_uevent_env *env; - -+ /* no user space access if system is rebooting */ -+ if (bcache_is_reboot) -+ return -EBUSY; -+ - #define d_strtoul(var) sysfs_strtoul(var, dc->var) - #define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) - #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) -@@ -407,6 +413,10 @@ STORE(bch_cached_dev) - struct cached_dev *dc = container_of(kobj, struct cached_dev, - disk.kobj); - -+ /* no user space access if system is rebooting */ -+ if (bcache_is_reboot) -+ return -EBUSY; -+ - mutex_lock(&bch_register_lock); - size = __cached_dev_store(kobj, attr, buf, size); - -@@ -510,6 +520,10 @@ STORE(__bch_flash_dev) - kobj); - struct uuid_entry *u = &d->c->uuids[d->id]; - -+ /* no user space access if system is rebooting */ -+ if (bcache_is_reboot) -+ return -EBUSY; -+ - sysfs_strtoul(data_csum, d->data_csum); - - if (attr == &sysfs_size) { -@@ -745,6 +759,10 @@ STORE(__bch_cache_set) - struct cache_set *c = container_of(kobj, struct cache_set, kobj); - ssize_t v; - -+ /* no user space access if system is rebooting */ -+ if 
(bcache_is_reboot) -+ return -EBUSY; -+ - if (attr == &sysfs_unregister) - bch_cache_set_unregister(c); - -@@ -864,6 +882,10 @@ STORE(bch_cache_set_internal) - { - struct cache_set *c = container_of(kobj, struct cache_set, internal); - -+ /* no user space access if system is rebooting */ -+ if (bcache_is_reboot) -+ return -EBUSY; -+ - return bch_cache_set_store(&c->kobj, attr, buf, size); - } - -@@ -1049,6 +1071,10 @@ STORE(__bch_cache) - struct cache *ca = container_of(kobj, struct cache, kobj); - ssize_t v; - -+ /* no user space access if system is rebooting */ -+ if (bcache_is_reboot) -+ return -EBUSY; -+ - if (attr == &sysfs_discard) { - bool v = strtoul_or_return(buf); - --- -2.16.4 - diff --git a/for-current/0024-bcache-acquire-bch_register_lock-later-in-cached_dev.patch b/for-current/0024-bcache-acquire-bch_register_lock-later-in-cached_dev.patch deleted file mode 100644 index ea0c810..0000000 --- a/for-current/0024-bcache-acquire-bch_register_lock-later-in-cached_dev.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 78af82b45144e562e87ac70f0b4710c96bec04ff Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 2 Jun 2019 01:06:12 +0800 -Subject: [PATCH 24/37] bcache: acquire bch_register_lock later in - cached_dev_detach_finish() - -Now there is variable bcache_is_reboot to prevent device register or -unregister during reboot, it is unncessary to still hold mutex lock -bch_register_lock before stopping writeback_rate_update kworker and -writeback kthread. And if the stopping kworker or kthread holding -bch_register_lock inside their routine (we used to have such problem -in writeback thread, thanks to Junhui Wang fixed it), it is very easy -to introduce deadlock during reboot/shutdown procedure. - -Therefore in this patch, the location to acquire bch_register_lock is -moved to the location before calling calc_cached_dev_sectors(). Which -is later then original location in cached_dev_detach_finish(). 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index a88238ad5da1..40d857e690f9 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1018,7 +1018,6 @@ static void cached_dev_detach_finish(struct work_struct *w) - BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); - BUG_ON(refcount_read(&dc->count)); - -- mutex_lock(&bch_register_lock); - - if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) - cancel_writeback_rate_update_dwork(dc); -@@ -1034,6 +1033,8 @@ static void cached_dev_detach_finish(struct work_struct *w) - bch_write_bdev_super(dc, &cl); - closure_sync(&cl); - -+ mutex_lock(&bch_register_lock); -+ - calc_cached_dev_sectors(dc->disk.c); - bcache_device_detach(&dc->disk); - list_move(&dc->list, &uncached_devices); --- -2.16.4 - diff --git a/for-current/0025-bcache-acquire-bch_register_lock-later-in-cached_dev.patch b/for-current/0025-bcache-acquire-bch_register_lock-later-in-cached_dev.patch deleted file mode 100644 index 0884f6a..0000000 --- a/for-current/0025-bcache-acquire-bch_register_lock-later-in-cached_dev.patch +++ /dev/null @@ -1,160 +0,0 @@ -From ee7b60589c0e89a38ded0885d5810c652f343e3e Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 12 Jun 2019 21:10:38 +0800 -Subject: [PATCH 25/37] bcache: acquire bch_register_lock later in - cached_dev_free() - -When enable lockdep engine, a lockdep warning can be observed when -reboot or shutdown system, - -[ 3142.764557][ T1] bcache: bcache_reboot() Stopping all devices: -[ 3142.776265][ T2649] -[ 3142.777159][ T2649] ====================================================== -[ 3142.780039][ T2649] WARNING: possible circular locking dependency detected -[ 3142.782869][ T2649] 5.2.0-rc4-lp151.20-default+ #1 Tainted: G W -[ 3142.785684][ T2649] ------------------------------------------------------ 
-[ 3142.788479][ T2649] kworker/3:67/2649 is trying to acquire lock: -[ 3142.790738][ T2649] 00000000aaf02291 ((wq_completion)bcache_writeback_wq){+.+.}, at: flush_workqueue+0x87/0x4c0 -[ 3142.794678][ T2649] -[ 3142.794678][ T2649] but task is already holding lock: -[ 3142.797402][ T2649] 000000004fcf89c5 (&bch_register_lock){+.+.}, at: cached_dev_free+0x17/0x120 [bcache] -[ 3142.801462][ T2649] -[ 3142.801462][ T2649] which lock already depends on the new lock. -[ 3142.801462][ T2649] -[ 3142.805277][ T2649] -[ 3142.805277][ T2649] the existing dependency chain (in reverse order) is: -[ 3142.808902][ T2649] -[ 3142.808902][ T2649] -> #2 (&bch_register_lock){+.+.}: -[ 3142.812396][ T2649] __mutex_lock+0x7a/0x9d0 -[ 3142.814184][ T2649] cached_dev_free+0x17/0x120 [bcache] -[ 3142.816415][ T2649] process_one_work+0x2a4/0x640 -[ 3142.818413][ T2649] worker_thread+0x39/0x3f0 -[ 3142.820276][ T2649] kthread+0x125/0x140 -[ 3142.822061][ T2649] ret_from_fork+0x3a/0x50 -[ 3142.823965][ T2649] -[ 3142.823965][ T2649] -> #1 ((work_completion)(&cl->work)#2){+.+.}: -[ 3142.827244][ T2649] process_one_work+0x277/0x640 -[ 3142.829160][ T2649] worker_thread+0x39/0x3f0 -[ 3142.830958][ T2649] kthread+0x125/0x140 -[ 3142.832674][ T2649] ret_from_fork+0x3a/0x50 -[ 3142.834915][ T2649] -[ 3142.834915][ T2649] -> #0 ((wq_completion)bcache_writeback_wq){+.+.}: -[ 3142.838121][ T2649] lock_acquire+0xb4/0x1c0 -[ 3142.840025][ T2649] flush_workqueue+0xae/0x4c0 -[ 3142.842035][ T2649] drain_workqueue+0xa9/0x180 -[ 3142.844042][ T2649] destroy_workqueue+0x17/0x250 -[ 3142.846142][ T2649] cached_dev_free+0x52/0x120 [bcache] -[ 3142.848530][ T2649] process_one_work+0x2a4/0x640 -[ 3142.850663][ T2649] worker_thread+0x39/0x3f0 -[ 3142.852464][ T2649] kthread+0x125/0x140 -[ 3142.854106][ T2649] ret_from_fork+0x3a/0x50 -[ 3142.855880][ T2649] -[ 3142.855880][ T2649] other info that might help us debug this: -[ 3142.855880][ T2649] -[ 3142.859663][ T2649] Chain exists of: -[ 3142.859663][ T2649] 
(wq_completion)bcache_writeback_wq --> (work_completion)(&cl->work)#2 --> &bch_register_lock -[ 3142.859663][ T2649] -[ 3142.865424][ T2649] Possible unsafe locking scenario: -[ 3142.865424][ T2649] -[ 3142.868022][ T2649] CPU0 CPU1 -[ 3142.869885][ T2649] ---- ---- -[ 3142.871751][ T2649] lock(&bch_register_lock); -[ 3142.873379][ T2649] lock((work_completion)(&cl->work)#2); -[ 3142.876399][ T2649] lock(&bch_register_lock); -[ 3142.879727][ T2649] lock((wq_completion)bcache_writeback_wq); -[ 3142.882064][ T2649] -[ 3142.882064][ T2649] *** DEADLOCK *** -[ 3142.882064][ T2649] -[ 3142.885060][ T2649] 3 locks held by kworker/3:67/2649: -[ 3142.887245][ T2649] #0: 00000000e774cdd0 ((wq_completion)events){+.+.}, at: process_one_work+0x21e/0x640 -[ 3142.890815][ T2649] #1: 00000000f7df89da ((work_completion)(&cl->work)#2){+.+.}, at: process_one_work+0x21e/0x640 -[ 3142.894884][ T2649] #2: 000000004fcf89c5 (&bch_register_lock){+.+.}, at: cached_dev_free+0x17/0x120 [bcache] -[ 3142.898797][ T2649] -[ 3142.898797][ T2649] stack backtrace: -[ 3142.900961][ T2649] CPU: 3 PID: 2649 Comm: kworker/3:67 Tainted: G W 5.2.0-rc4-lp151.20-default+ #1 -[ 3142.904789][ T2649] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 -[ 3142.909168][ T2649] Workqueue: events cached_dev_free [bcache] -[ 3142.911422][ T2649] Call Trace: -[ 3142.912656][ T2649] dump_stack+0x85/0xcb -[ 3142.914181][ T2649] print_circular_bug+0x19a/0x1f0 -[ 3142.916193][ T2649] __lock_acquire+0x16cd/0x1850 -[ 3142.917936][ T2649] ? __lock_acquire+0x6a8/0x1850 -[ 3142.919704][ T2649] ? lock_acquire+0xb4/0x1c0 -[ 3142.921335][ T2649] ? find_held_lock+0x34/0xa0 -[ 3142.923052][ T2649] lock_acquire+0xb4/0x1c0 -[ 3142.924635][ T2649] ? flush_workqueue+0x87/0x4c0 -[ 3142.926375][ T2649] flush_workqueue+0xae/0x4c0 -[ 3142.928047][ T2649] ? flush_workqueue+0x87/0x4c0 -[ 3142.929824][ T2649] ? 
drain_workqueue+0xa9/0x180 -[ 3142.931686][ T2649] drain_workqueue+0xa9/0x180 -[ 3142.933534][ T2649] destroy_workqueue+0x17/0x250 -[ 3142.935787][ T2649] cached_dev_free+0x52/0x120 [bcache] -[ 3142.937795][ T2649] process_one_work+0x2a4/0x640 -[ 3142.939803][ T2649] worker_thread+0x39/0x3f0 -[ 3142.941487][ T2649] ? process_one_work+0x640/0x640 -[ 3142.943389][ T2649] kthread+0x125/0x140 -[ 3142.944894][ T2649] ? kthread_create_worker_on_cpu+0x70/0x70 -[ 3142.947744][ T2649] ret_from_fork+0x3a/0x50 -[ 3142.970358][ T2649] bcache: bcache_device_free() bcache0 stopped - -Here is how the deadlock happens. -1) bcache_reboot() calls bcache_device_stop(), then inside - bcache_device_stop() BCACHE_DEV_CLOSING bit is set on d->flags. - Then closure_queue(&d->cl) is called to invoke cached_dev_flush(). -2) In cached_dev_flush(), cached_dev_free() is called by continue_at(). -3) In cached_dev_free(), when stopping the writeback kthread of the - cached device by kthread_stop(), dc->writeback_thread will be woken - up to quit the kthread while-loop, then cached_dev_put() is called - in bch_writeback_thread(). -4) Calling cached_dev_put() in writeback kthread may drop dc->count to - 0, then dc->detach kworker is scheduled, which is initialized as - cached_dev_detach_finish(). -5) Inside cached_dev_detach_finish(), the last line of code is to call - closure_put(&dc->disk.cl), which drops the last reference counter of - closure dc->disk.cl, then the callback cached_dev_flush() gets - called. -Now cached_dev_flush() is called for second time in the code path, the -first time is in step 2). And bch_register_lock will be acquired -again, and a A-A lock (lockdep terminology) is happening. - -The root cause of the above A-A lock is in cached_dev_free(), mutex -bch_register_lock is held before stopping writeback kthread and other -kworkers.
Fortunately now we have variable 'bcache_is_reboot', which may -prevent device registration or unregistration during reboot/shutdown -time, so it is unnecessary to hold bch_register_lock so early now. - -This is how this patch fixes the reboot/shutdown time A-A lock issue: -After moving mutex_lock(&bch_register_lock) to a later location where -before atomic_read(&dc->running) in cached_dev_free(), such A-A lock -problem can be solved without any reboot time registration race. - -Signed-off-by: Coly Li <colyli@suse.de> --- - drivers/md/bcache/super.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 40d857e690f9..8a12a8313367 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1231,8 +1231,6 @@ static void cached_dev_free(struct closure *cl) - { - struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); - -- mutex_lock(&bch_register_lock); -- - if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) - cancel_writeback_rate_update_dwork(dc); - -@@ -1243,6 +1241,8 @@ static void cached_dev_free(struct closure *cl) - if (!IS_ERR_OR_NULL(dc->status_update_thread)) - kthread_stop(dc->status_update_thread); - -+ mutex_lock(&bch_register_lock); -+ - if (atomic_read(&dc->running)) - bd_unlink_disk_holder(dc->bdev, dc->disk.disk); - bcache_device_free(&dc->disk); --- -2.16.4 - diff --git a/for-current/0026-bcache-fix-potential-deadlock-in-cached_def_free.patch b/for-current/0026-bcache-fix-potential-deadlock-in-cached_def_free.patch deleted file mode 100644 index 6178515..0000000 --- a/for-current/0026-bcache-fix-potential-deadlock-in-cached_def_free.patch +++ /dev/null @@ -1,168 +0,0 @@ -From 9076f3622e5ed9a65b67ae47bba6c3a8f5c0e5d2 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 4 Jun 2019 14:28:33 +0800 -Subject: [PATCH 26/37] bcache: fix potential deadlock in cached_def_free() - -When enable lockdep and reboot system
with a writeback mode bcache -device, the following potential deadlock warning is reported by lockdep -engine. - -[ 101.536569][ T401] kworker/2:2/401 is trying to acquire lock: -[ 101.538575][ T401] 00000000bbf6e6c7 ((wq_completion)bcache_writeback_wq){+.+.}, at: flush_workqueue+0x87/0x4c0 -[ 101.542054][ T401] -[ 101.542054][ T401] but task is already holding lock: -[ 101.544587][ T401] 00000000f5f305b3 ((work_completion)(&cl->work)#2){+.+.}, at: process_one_work+0x21e/0x640 -[ 101.548386][ T401] -[ 101.548386][ T401] which lock already depends on the new lock. -[ 101.548386][ T401] -[ 101.551874][ T401] -[ 101.551874][ T401] the existing dependency chain (in reverse order) is: -[ 101.555000][ T401] -[ 101.555000][ T401] -> #1 ((work_completion)(&cl->work)#2){+.+.}: -[ 101.557860][ T401] process_one_work+0x277/0x640 -[ 101.559661][ T401] worker_thread+0x39/0x3f0 -[ 101.561340][ T401] kthread+0x125/0x140 -[ 101.562963][ T401] ret_from_fork+0x3a/0x50 -[ 101.564718][ T401] -[ 101.564718][ T401] -> #0 ((wq_completion)bcache_writeback_wq){+.+.}: -[ 101.567701][ T401] lock_acquire+0xb4/0x1c0 -[ 101.569651][ T401] flush_workqueue+0xae/0x4c0 -[ 101.571494][ T401] drain_workqueue+0xa9/0x180 -[ 101.573234][ T401] destroy_workqueue+0x17/0x250 -[ 101.575109][ T401] cached_dev_free+0x44/0x120 [bcache] -[ 101.577304][ T401] process_one_work+0x2a4/0x640 -[ 101.579357][ T401] worker_thread+0x39/0x3f0 -[ 101.581055][ T401] kthread+0x125/0x140 -[ 101.582709][ T401] ret_from_fork+0x3a/0x50 -[ 101.584592][ T401] -[ 101.584592][ T401] other info that might help us debug this: -[ 101.584592][ T401] -[ 101.588355][ T401] Possible unsafe locking scenario: -[ 101.588355][ T401] -[ 101.590974][ T401] CPU0 CPU1 -[ 101.592889][ T401] ---- ---- -[ 101.594743][ T401] lock((work_completion)(&cl->work)#2); -[ 101.596785][ T401] lock((wq_completion)bcache_writeback_wq); -[ 101.600072][ T401] lock((work_completion)(&cl->work)#2); -[ 101.602971][ T401] lock((wq_completion)bcache_writeback_wq); -[ 
101.605255][ T401] -[ 101.605255][ T401] *** DEADLOCK *** -[ 101.605255][ T401] -[ 101.608310][ T401] 2 locks held by kworker/2:2/401: -[ 101.610208][ T401] #0: 00000000cf2c7d17 ((wq_completion)events){+.+.}, at: process_one_work+0x21e/0x640 -[ 101.613709][ T401] #1: 00000000f5f305b3 ((work_completion)(&cl->work)#2){+.+.}, at: process_one_work+0x21e/0x640 -[ 101.617480][ T401] -[ 101.617480][ T401] stack backtrace: -[ 101.619539][ T401] CPU: 2 PID: 401 Comm: kworker/2:2 Tainted: G W 5.2.0-rc4-lp151.20-default+ #1 -[ 101.623225][ T401] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 -[ 101.627210][ T401] Workqueue: events cached_dev_free [bcache] -[ 101.629239][ T401] Call Trace: -[ 101.630360][ T401] dump_stack+0x85/0xcb -[ 101.631777][ T401] print_circular_bug+0x19a/0x1f0 -[ 101.633485][ T401] __lock_acquire+0x16cd/0x1850 -[ 101.635184][ T401] ? __lock_acquire+0x6a8/0x1850 -[ 101.636863][ T401] ? lock_acquire+0xb4/0x1c0 -[ 101.638421][ T401] ? find_held_lock+0x34/0xa0 -[ 101.640015][ T401] lock_acquire+0xb4/0x1c0 -[ 101.641513][ T401] ? flush_workqueue+0x87/0x4c0 -[ 101.643248][ T401] flush_workqueue+0xae/0x4c0 -[ 101.644832][ T401] ? flush_workqueue+0x87/0x4c0 -[ 101.646476][ T401] ? drain_workqueue+0xa9/0x180 -[ 101.648303][ T401] drain_workqueue+0xa9/0x180 -[ 101.649867][ T401] destroy_workqueue+0x17/0x250 -[ 101.651503][ T401] cached_dev_free+0x44/0x120 [bcache] -[ 101.653328][ T401] process_one_work+0x2a4/0x640 -[ 101.655029][ T401] worker_thread+0x39/0x3f0 -[ 101.656693][ T401] ? process_one_work+0x640/0x640 -[ 101.658501][ T401] kthread+0x125/0x140 -[ 101.660012][ T401] ? 
kthread_create_worker_on_cpu+0x70/0x70 -[ 101.661985][ T401] ret_from_fork+0x3a/0x50 -[ 101.691318][ T401] bcache: bcache_device_free() bcache0 stopped - -Here is how the above potential deadlock may happen in reboot/shutdown -code path, -1) bcache_reboot() is called firstly in the reboot/shutdown code path, - then in bcache_reboot(), bcache_device_stop() is called. -2) bcache_device_stop() sets BCACHE_DEV_CLOSING on d->flags, then call - closure_queue(&d->cl) to invoke cached_dev_flush(). And in turn - cached_dev_flush() calls cached_dev_free() via continue_at() -3) In cached_dev_free(), after stopped writeback kthread - dc->writeback_thread, the kwork dc->writeback_write_wq is stopping by - destroy_workqueue(). -4) Inside destroy_workqueue(), drain_workqueue() is called. Inside - drain_workqueue(), flush_workqueue() is called. Then wq->lockdep_map - is acquired by lock_map_acquire() in flush_workqueue(). After the - lock acquired the rest part of flush_workqueue() just wait for the - workqueue to complete. -5) Now we look back at writeback thread routine bch_writeback_thread(), - in the main while-loop, write_dirty() is called via continue_at() in - read_dirty_submit(), which is called via continue_at() in while-loop - level called function read_dirty(). Inside write_dirty() it may be - re-called on workqueue dc->writeback_write_wq via continue_at(). - It means when the writeback kthread is stopped in cached_dev_free() - there might be still one kworker queued on dc->writeback_write_wq - to execute write_dirty() again. -6) Now this kworker is scheduled on dc->writeback_write_wq to run by - process_one_work() (which is called by worker_thread()). Before - calling the kwork routine, wq->lockdep_map is acquired. -7) But wq->lockdep_map is acquired already in step 4), so a A-A lock - (lockdep terminology) scenario happens.
- -Indeed on multiple cores system, the above deadlock is very rare to -happen, just as the code comments in process_one_work() says, -2263 * AFAICT there is no possible deadlock scenario between the -2264 * flush_work() and complete() primitives (except for - single-threaded -2265 * workqueues), so hiding them isn't a problem. - -But it is still good to fix such lockdep warning, even no one running -bcache on single core system. - -The fix is simple. This patch solves the above potential deadlock by, -- Do not destroy workqueue dc->writeback_write_wq in cached_dev_free(). -- Flush and destroy dc->writeback_write_wq in writeback kthread routine - bch_writeback_thread(), where after quit the thread main while-loop - and before cached_dev_put() is called. - -By this fix, dc->writeback_write_wq will be stopped and destroyed before -the writeback kthread stopped, so the chance for a A-A locking on -wq->lockdep_map is disappeared, such A-A deadlock won't happen -any more. - -Signed-off-by: Coly Li <colyli@suse.de> --- - drivers/md/bcache/super.c | 2 -- - drivers/md/bcache/writeback.c | 4 ++++ - 2 files changed, 4 insertions(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 8a12a8313367..a8ea4e2086a9 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1236,8 +1236,6 @@ static void cached_dev_free(struct closure *cl) - - if (!IS_ERR_OR_NULL(dc->writeback_thread)) - kthread_stop(dc->writeback_thread); -- if (dc->writeback_write_wq) -- destroy_workqueue(dc->writeback_write_wq); - if (!IS_ERR_OR_NULL(dc->status_update_thread)) - kthread_stop(dc->status_update_thread); - -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 21081febcb59..d60268fe49e1 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -738,6 +738,10 @@ static int bch_writeback_thread(void *arg) - } - } - -+ if (dc->writeback_write_wq) { -+
flush_workqueue(dc->writeback_write_wq); -+ destroy_workqueue(dc->writeback_write_wq); -+ } - cached_dev_put(dc); - wait_for_kthread_stop(); - --- -2.16.4 - diff --git a/for-current/0027-bcache-add-code-comments-for-journal_read_bucket.patch b/for-current/0027-bcache-add-code-comments-for-journal_read_bucket.patch deleted file mode 100644 index e04fec8..0000000 --- a/for-current/0027-bcache-add-code-comments-for-journal_read_bucket.patch +++ /dev/null @@ -1,72 +0,0 @@ -From 4a355725c03084500247141749a752c23fa0790d Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Thu, 30 May 2019 18:39:17 +0800 -Subject: [PATCH 27/37] bcache: add code comments for journal_read_bucket() - -This patch adds more code comments in journal_read_bucket(), this is an -effort to make the code to be more understandable. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 24 ++++++++++++++++++++++++ - 1 file changed, 24 insertions(+) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 54f8886b6177..98ee467ec3f7 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -100,6 +100,20 @@ reread: left = ca->sb.bucket_size - offset; - - blocks = set_blocks(j, block_bytes(ca->set)); - -+ /* -+ * Nodes in 'list' are in linear increasing order of -+ * i->j.seq, the node on head has the smallest (oldest) -+ * journal seq, the node on tail has the biggest -+ * (latest) journal seq. -+ */ -+ -+ /* -+ * Check from the oldest jset for last_seq. If -+ * i->j.seq < j->last_seq, it means the oldest jset -+ * in list is expired and useless, remove it from -+ * this list. Otherwise, j is a condidate jset for -+ * further following checks. 
-+ */ - while (!list_empty(list)) { - i = list_first_entry(list, - struct journal_replay, list); -@@ -109,13 +123,22 @@ reread: left = ca->sb.bucket_size - offset; - kfree(i); - } - -+ /* iterate list in reverse order (from latest jset) */ - list_for_each_entry_reverse(i, list, list) { - if (j->seq == i->j.seq) - goto next_set; - -+ /* -+ * if j->seq is less than any i->j.last_seq -+ * in list, j is an expired and useless jset. -+ */ - if (j->seq < i->j.last_seq) - goto next_set; - -+ /* -+ * 'where' points to first jset in list which -+ * is elder then j. -+ */ - if (j->seq > i->j.seq) { - where = &i->list; - goto add; -@@ -129,6 +152,7 @@ reread: left = ca->sb.bucket_size - offset; - if (!i) - return -ENOMEM; - memcpy(&i->j, j, bytes); -+ /* Add to the location after 'where' points to */ - list_add(&i->list, where); - ret = 1; - --- -2.16.4 - diff --git a/for-current/0028-bcache-set-largest-seq-to-ja-seq-bucket_index-in-jou.patch b/for-current/0028-bcache-set-largest-seq-to-ja-seq-bucket_index-in-jou.patch deleted file mode 100644 index 419dbe5..0000000 --- a/for-current/0028-bcache-set-largest-seq-to-ja-seq-bucket_index-in-jou.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 3221dc7b0f4d0dff70943b8a9a600ee5bfd17e53 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Thu, 30 May 2019 18:40:37 +0800 -Subject: [PATCH 28/37] bcache: set largest seq to ja->seq[bucket_index] in - journal_read_bucket() - -In journal_read_bucket() when setting ja->seq[bucket_index], there might -be potential case that a later non-maximum overwrites a better sequence -number to ja->seq[bucket_index]. This patch adds a check to make sure -that ja->seq[bucket_index] will be only set a new value if it is bigger -then current value. 
- -Signed-off-by: Coly Li <colyli@suse.de> --- - drivers/md/bcache/journal.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 98ee467ec3f7..3d321bffddc9 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -156,7 +156,8 @@ reread: left = ca->sb.bucket_size - offset; - list_add(&i->list, where); - ret = 1; - -- ja->seq[bucket_index] = j->seq; -+ if (j->seq > ja->seq[bucket_index]) -+ ja->seq[bucket_index] = j->seq; - next_set: - offset += blocks * ca->sb.block_size; - len -= blocks * ca->sb.block_size; --- -2.16.4 - diff --git a/for-current/0029-bcache-shrink-btree-node-cache-after-bch_btree_check.patch b/for-current/0029-bcache-shrink-btree-node-cache-after-bch_btree_check.patch deleted file mode 100644 index 4fcc994..0000000 --- a/for-current/0029-bcache-shrink-btree-node-cache-after-bch_btree_check.patch +++ /dev/null @@ -1,55 +0,0 @@ -From a041fd83337fb5ca0a1a55103c3f0f057d0980f3 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Fri, 31 May 2019 17:29:56 +0800 -Subject: [PATCH 29/37] bcache: shrink btree node cache after bch_btree_check() - -When cache set starts, bch_btree_check() will check all bkeys on cache -device by calculating the checksum. This operation will consume a huge -number of system memory if there are a lot of data cached. Since bcache -uses its own mca cache to maintain all its read-in btree nodes, and only -releases the cache space when system memory manage code starts to shrink -caches. Then before memory manager code to call the mca cache shrinker -callback, bcache mca cache will compete memory resource with user space -application, which may have negative effect to performance of user space -workloads (e.g. data base, or I/O service of distributed storage node).
- -This patch tries to call bcache mca shrinker routine to proactively -release mca cache memory, to decrease the memory pressure of system and -avoid negative effect on the overall system I/O performance. - -Signed-off-by: Coly Li <colyli@suse.de> --- - drivers/md/bcache/super.c | 17 +++++++++++++++++ - 1 file changed, 17 insertions(+) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index a8ea4e2086a9..26e374fbf57c 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1880,6 +1880,23 @@ static int run_cache_set(struct cache_set *c) - if (bch_btree_check(c)) - goto err; - -+ /* -+ * bch_btree_check() may occupy too much system memory which -+ * has negative effects to user space application (e.g. data -+ * base) performance. Shrink the mca cache memory proactively -+ * here to avoid competing memory with user space workloads.. -+ */ -+ if (!c->shrinker_disabled) { -+ struct shrink_control sc; -+ -+ sc.gfp_mask = GFP_KERNEL; -+ sc.nr_to_scan = c->btree_cache_used * c->btree_pages; -+ /* first run to clear b->accessed tag */ -+ c->shrink.scan_objects(&c->shrink, &sc); -+ /* second run to reap non-accessed nodes */ -+ c->shrink.scan_objects(&c->shrink, &sc); -+ } -+ - bch_journal_mark(c, &journal); - bch_initial_gc_finish(c); - pr_debug("btree_check() done"); --- -2.16.4 - diff --git a/for-current/0030-bcache-Revert-bcache-free-heap-cache_set-flush_btree.patch b/for-current/0030-bcache-Revert-bcache-free-heap-cache_set-flush_btree.patch deleted file mode 100644 index 9f65feb..0000000 --- a/for-current/0030-bcache-Revert-bcache-free-heap-cache_set-flush_btree.patch +++ /dev/null @@ -1,35 +0,0 @@ -From 73177d7836a9f472451c15b4498e7e0b79c46908 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 28 May 2019 21:36:56 +0800 -Subject: [PATCH 30/37] bcache: Revert "bcache: free heap - cache_set->flush_btree in bch_journal_free" - -This reverts commit 6268dc2c4703aabfb0b35681be709acf4c2826c6.
- -This patch depends on commit c4dc2497d50d ("bcache: fix high CPU -occupancy during journal") which is reverted in previous patch. So -revert this one too. - -Fixes: 6268dc2c4703 ("bcache: free heap cache_set->flush_btree in bch_journal_free") -Signed-off-by: Coly Li <colyli@suse.de> -Cc: stable@vger.kernel.org -Cc: Shenghui Wang <shhuiw@foxmail.com> --- - drivers/md/bcache/journal.c | 1 - - 1 file changed, 1 deletion(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 3d321bffddc9..11d8c93b88bb 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -884,7 +884,6 @@ void bch_journal_free(struct cache_set *c) - free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); - free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); - free_fifo(&c->journal.pin); -- free_heap(&c->flush_btree); - } - - int bch_journal_alloc(struct cache_set *c) --- -2.16.4 - diff --git a/for-current/0031-bcache-Revert-bcache-fix-high-CPU-occupancy-during-j.patch b/for-current/0031-bcache-Revert-bcache-fix-high-CPU-occupancy-during-j.patch deleted file mode 100644 index 814d7f1..0000000 --- a/for-current/0031-bcache-Revert-bcache-fix-high-CPU-occupancy-during-j.patch +++ /dev/null @@ -1,129 +0,0 @@ -From c738cc581e2658874876f29c5db4abd2fbcbfd4e Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 28 May 2019 21:19:38 +0800 -Subject: [PATCH 31/37] bcache: Revert "bcache: fix high CPU occupancy during - journal" - -This reverts commit c4dc2497d50d9c6fb16aa0d07b6a14f3b2adb1e0. - -This patch enlarges a race between normal btree flush code path and -btree_flush_write(), which causes deadlock when journal space is -exhausted. Reverting this patch shrinks the race window from 128 btree -nodes to only 1 btree node.
- -Fixes: c4dc2497d50d ("bcache: fix high CPU occupancy during journal") -Signed-off-by: Coly Li <colyli@suse.de> -Cc: stable@vger.kernel.org -Cc: Tang Junhui <tang.junhui.linux@gmail.com> ---- - drivers/md/bcache/bcache.h | 2 -- - drivers/md/bcache/journal.c | 47 +++++++++++++++------------------------------ - drivers/md/bcache/util.h | 2 -- - 3 files changed, 15 insertions(+), 36 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 73a97586a2ef..cb268d7c6cea 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -726,8 +726,6 @@ struct cache_set { - - #define BUCKET_HASH_BITS 12 - struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; -- -- DECLARE_HEAP(struct btree *, flush_btree); - }; - - struct bbio { -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 11d8c93b88bb..14a4e2c44de9 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -416,12 +416,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) - } - - /* Journalling */ --#define journal_max_cmp(l, r) \ -- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \ -- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) --#define journal_min_cmp(l, r) \ -- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \ -- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) - - static void btree_flush_write(struct cache_set *c) - { -@@ -429,35 +423,25 @@ static void btree_flush_write(struct cache_set *c) - * Try to find the btree node with that references the oldest journal - * entry, best is our current candidate and is locked if non NULL: - */ -- struct btree *b; -- int i; -+ struct btree *b, *best; -+ unsigned int i; - - atomic_long_inc(&c->flush_write); -- - retry: -- spin_lock(&c->journal.lock); -- if (heap_empty(&c->flush_btree)) { -- for_each_cached_btree(b, c, i) -- if (btree_current_write(b)->journal) { -- if (!heap_full(&c->flush_btree)) -- 
heap_add(&c->flush_btree, b, -- journal_max_cmp); -- else if (journal_max_cmp(b, -- heap_peek(&c->flush_btree))) { -- c->flush_btree.data[0] = b; -- heap_sift(&c->flush_btree, 0, -- journal_max_cmp); -- } -+ best = NULL; -+ -+ for_each_cached_btree(b, c, i) -+ if (btree_current_write(b)->journal) { -+ if (!best) -+ best = b; -+ else if (journal_pin_cmp(c, -+ btree_current_write(best)->journal, -+ btree_current_write(b)->journal)) { -+ best = b; - } -+ } - -- for (i = c->flush_btree.used / 2 - 1; i >= 0; --i) -- heap_sift(&c->flush_btree, i, journal_min_cmp); -- } -- -- b = NULL; -- heap_pop(&c->flush_btree, b, journal_min_cmp); -- spin_unlock(&c->journal.lock); -- -+ b = best; - if (b) { - mutex_lock(&b->write_lock); - if (!btree_current_write(b)->journal) { -@@ -898,8 +882,7 @@ int bch_journal_alloc(struct cache_set *c) - j->w[0].c = c; - j->w[1].c = c; - -- if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) || -- !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || -+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || - !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) - return -ENOMEM; -diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h -index 1fbced94e4cc..c029f7443190 100644 ---- a/drivers/md/bcache/util.h -+++ b/drivers/md/bcache/util.h -@@ -113,8 +113,6 @@ do { \ - - #define heap_full(h) ((h)->used == (h)->size) - --#define heap_empty(h) ((h)->used == 0) -- - #define DECLARE_FIFO(type, name) \ - struct { \ - size_t front, back, size, mask; \ --- -2.16.4 - diff --git a/for-current/0032-bcache-only-clear-BTREE_NODE_dirty-bit-when-it-is-se.patch b/for-current/0032-bcache-only-clear-BTREE_NODE_dirty-bit-when-it-is-se.patch deleted file mode 100644 index 40c8ba0..0000000 --- a/for-current/0032-bcache-only-clear-BTREE_NODE_dirty-bit-when-it-is-se.patch +++ /dev/null @@ -1,57 +0,0 @@ -From 6426472bb2a01a472329b9399df2c30ec4c7fce8 Mon Sep 17 00:00:00 2001 -From: 
Coly Li <colyli@suse.de> -Date: Mon, 24 Jun 2019 15:31:57 +0800 -Subject: [PATCH 32/37] bcache: only clear BTREE_NODE_dirty bit when it is set - -In bch_btree_cache_free() and btree_node_free(), BTREE_NODE_dirty is -always set no matter btree node is dirty or not. The code looks like -this, - if (btree_node_dirty(b)) - btree_complete_write(b, btree_current_write(b)); - clear_bit(BTREE_NODE_dirty, &b->flags); - -Indeed if btree_node_dirty(b) returns false, it means BTREE_NODE_dirty -bit is cleared, then it is unnecessary to clear the bit again. - -This patch only clears BTREE_NODE_dirty when btree_node_dirty(b) is -true (the bit is set), to save a few CPU cycles. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/btree.c | 11 ++++++----- - 1 file changed, 6 insertions(+), 5 deletions(-) - -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index cf38a1b031fa..88e5aa3fbb07 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ -782,10 +782,10 @@ void bch_btree_cache_free(struct cache_set *c) - while (!list_empty(&c->btree_cache)) { - b = list_first_entry(&c->btree_cache, struct btree, list); - -- if (btree_node_dirty(b)) -+ if (btree_node_dirty(b)) { - btree_complete_write(b, btree_current_write(b)); -- clear_bit(BTREE_NODE_dirty, &b->flags); -- -+ clear_bit(BTREE_NODE_dirty, &b->flags); -+ } - mca_data_free(b); - } - -@@ -1073,9 +1073,10 @@ static void btree_node_free(struct btree *b) - - mutex_lock(&b->write_lock); - -- if (btree_node_dirty(b)) -+ if (btree_node_dirty(b)) { - btree_complete_write(b, btree_current_write(b)); -- clear_bit(BTREE_NODE_dirty, &b->flags); -+ clear_bit(BTREE_NODE_dirty, &b->flags); -+ } - - mutex_unlock(&b->write_lock); - --- -2.16.4 - diff --git a/for-current/0033-bcache-add-comments-for-mutex_lock-b-write_lock.patch b/for-current/0033-bcache-add-comments-for-mutex_lock-b-write_lock.patch deleted file mode 100644 index 8a15da2..0000000 --- 
a/for-current/0033-bcache-add-comments-for-mutex_lock-b-write_lock.patch +++ /dev/null @@ -1,47 +0,0 @@ -From b3bb7eb417b3c4efb4241f1a940af3da1763dcdb Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Mon, 24 Jun 2019 16:10:55 +0800 -Subject: [PATCH 33/37] bcache: add comments for mutex_lock(&b->write_lock) - -When accessing or modifying BTREE_NODE_dirty bit, it is not always -necessary to acquire b->write_lock. In bch_btree_cache_free() and -mca_reap() acquiring b->write_lock is necessary, and this patch adds -comments to explain why mutex_lock(&b->write_lock) is necessary for -checking or clearing BTREE_NODE_dirty bit there. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/btree.c | 10 ++++++++++ - 1 file changed, 10 insertions(+) - -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index 88e5aa3fbb07..846306c3a887 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ -659,6 +659,11 @@ static int mca_reap(struct btree *b, unsigned int min_order, bool flush) - up(&b->io_mutex); - } - -+ /* -+ * BTREE_NODE_dirty might be cleared in btree_flush_btree() by -+ * __bch_btree_node_write(). To avoid an extra flush, acquire -+ * b->write_lock before checking BTREE_NODE_dirty bit. -+ */ - mutex_lock(&b->write_lock); - if (btree_node_dirty(b)) - __bch_btree_node_write(b, &cl); -@@ -782,6 +787,11 @@ void bch_btree_cache_free(struct cache_set *c) - while (!list_empty(&c->btree_cache)) { - b = list_first_entry(&c->btree_cache, struct btree, list); - -+ /* -+ * This function is called by cache_set_free(), no I/O -+ * request on cache now, it is unnecessary to acquire -+ * b->write_lock before clearing BTREE_NODE_dirty anymore. 
-+ */ - if (btree_node_dirty(b)) { - btree_complete_write(b, btree_current_write(b)); - clear_bit(BTREE_NODE_dirty, &b->flags); --- -2.16.4 - diff --git a/for-current/0034-bcache-remove-retry_flush_write-from-struct-cache_se.patch b/for-current/0034-bcache-remove-retry_flush_write-from-struct-cache_se.patch deleted file mode 100644 index 0f14758..0000000 --- a/for-current/0034-bcache-remove-retry_flush_write-from-struct-cache_se.patch +++ /dev/null @@ -1,75 +0,0 @@ -From 6c7913bbc396a830cd06017eb2ea570fad187fba Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 1 Jun 2019 01:58:23 +0800 -Subject: [PATCH 34/37] bcache: remove retry_flush_write from struct cache_set - -In struct cache_set, retry_flush_write is added for commit c4dc2497d50d -("bcache: fix high CPU occupancy during journal") which is reverted in -previous patch. - -Now it is useless anymore, and this patch removes it from bcache code. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bcache.h | 1 - - drivers/md/bcache/journal.c | 1 - - drivers/md/bcache/sysfs.c | 5 ----- - 3 files changed, 7 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index cb268d7c6cea..35396248a7d5 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -706,7 +706,6 @@ struct cache_set { - - atomic_long_t reclaim; - atomic_long_t flush_write; -- atomic_long_t retry_flush_write; - - enum { - ON_ERROR_UNREGISTER, -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 14a4e2c44de9..1218e3cada3c 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -447,7 +447,6 @@ static void btree_flush_write(struct cache_set *c) - if (!btree_current_write(b)->journal) { - mutex_unlock(&b->write_lock); - /* We raced */ -- atomic_long_inc(&c->retry_flush_write); - goto retry; - } - -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index d62e28643109..701a386a954c 100644 ---- 
a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -83,7 +83,6 @@ read_attribute(state); - read_attribute(cache_read_races); - read_attribute(reclaim); - read_attribute(flush_write); --read_attribute(retry_flush_write); - read_attribute(writeback_keys_done); - read_attribute(writeback_keys_failed); - read_attribute(io_errors); -@@ -709,9 +708,6 @@ SHOW(__bch_cache_set) - sysfs_print(flush_write, - atomic_long_read(&c->flush_write)); - -- sysfs_print(retry_flush_write, -- atomic_long_read(&c->retry_flush_write)); -- - sysfs_print(writeback_keys_done, - atomic_long_read(&c->writeback_keys_done)); - sysfs_print(writeback_keys_failed, -@@ -936,7 +932,6 @@ static struct attribute *bch_cache_set_internal_files[] = { - &sysfs_cache_read_races, - &sysfs_reclaim, - &sysfs_flush_write, -- &sysfs_retry_flush_write, - &sysfs_writeback_keys_done, - &sysfs_writeback_keys_failed, - --- -2.16.4 - diff --git a/for-current/0035-bcache-fix-race-in-btree_flush_write.patch b/for-current/0035-bcache-fix-race-in-btree_flush_write.patch deleted file mode 100644 index db50797..0000000 --- a/for-current/0035-bcache-fix-race-in-btree_flush_write.patch +++ /dev/null @@ -1,186 +0,0 @@ -From e79b0a3af2cad623846e90c46964761457d57741 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Thu, 27 Jun 2019 21:28:43 +0800 -Subject: [PATCH 35/37] bcache: fix race in btree_flush_write() - -There is a race between mca_reap(), btree_node_free() and journal code -btree_flush_write(), which results very rare and strange deadlock or -panic and are very hard to reproduce. - -Let me explain how the race happens. In btree_flush_write() one btree -node with oldest journal pin is selected, then it is flushed to cache -device, the select-and-flush is a two steps operation. Between these two -steps, there are something may happen inside the race window, -- The selected btree node was reaped by mca_reap() and allocated to - other requesters for other btree node. 
-- The selected btree node was selected, flushed and released by mca - shrink callback bch_mca_scan(). -When btree_flush_write() tries to flush the selected btree node, firstly -b->write_lock is held by mutex_lock(). If the race happens and the -memory of selected btree node is allocated to other btree node, if that -btree node's write_lock is held already, a deadlock very probably -happens here. A worse case is the memory of the selected btree node is -released, then all references to this btree node (e.g. b->write_lock) -will trigger NULL pointer dereference panic. - -This race was introduced in commit cafe56359144 ("bcache: A block layer -cache"), and enlarged by commit c4dc2497d50d ("bcache: fix high CPU -occupancy during journal"), which selected 128 btree nodes and flushed -them one-by-one in a quite long time period. - -Such race is not easy to reproduce before. On a Lenovo SR650 server with -48 Xeon cores, and configure 1 NVMe SSD as cache device, a MD raid0 -device assembled by 3 NVMe SSDs as backing device, this race can be -observed around every 10,000 times btree_flush_write() gets called. Both -deadlock and kernel panic all happened as aftermath of the race. - -The idea of the fix is to add a btree flag BTREE_NODE_journal_flush. It -is set when selecting btree nodes, and cleared after btree nodes -flushed. Then when mca_reap() selects a btree node with this bit set, -this btree node will be skipped. Since mca_reap() only reaps btree node -without BTREE_NODE_journal_flush flag, such race is avoided. - -One corner case should be noticed, that is btree_node_free(). It might -be called in some error handling code path.
For example the following -code piece from btree_split(), - 2149 err_free2: - 2150 bkey_put(b->c, &n2->key); - 2151 btree_node_free(n2); - 2152 rw_unlock(true, n2); - 2153 err_free1: - 2154 bkey_put(b->c, &n1->key); - 2155 btree_node_free(n1); - 2156 rw_unlock(true, n1); -At line 2151 and 2155, the btree node n2 and n1 are released without -mac_reap(), so BTREE_NODE_journal_flush also needs to be checked here. -If btree_node_free() is called directly in such error handling path, -and the selected btree node has BTREE_NODE_journal_flush bit set, just -delay for 1 us and retry again. In this case this btree node won't -be skipped, just retry until the BTREE_NODE_journal_flush bit cleared, -and free the btree node memory. - -Fixes: cafe56359144 ("bcache: A block layer cache") -Signed-off-by: Coly Li <colyli@suse.de> -Reported-and-tested-by: kbuild test robot <lkp@intel.com> -Cc: stable@vger.kernel.org ---- - drivers/md/bcache/btree.c | 28 +++++++++++++++++++++++++++- - drivers/md/bcache/btree.h | 2 ++ - drivers/md/bcache/journal.c | 7 +++++++ - 3 files changed, 36 insertions(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index 846306c3a887..ba434d9ac720 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ -35,7 +35,7 @@ - #include <linux/rcupdate.h> - #include <linux/sched/clock.h> - #include <linux/rculist.h> -- -+#include <linux/delay.h> - #include <trace/events/bcache.h> - - /* -@@ -659,12 +659,25 @@ static int mca_reap(struct btree *b, unsigned int min_order, bool flush) - up(&b->io_mutex); - } - -+retry: - /* - * BTREE_NODE_dirty might be cleared in btree_flush_btree() by - * __bch_btree_node_write(). To avoid an extra flush, acquire - * b->write_lock before checking BTREE_NODE_dirty bit. 
- */ - mutex_lock(&b->write_lock); -+ /* -+ * If this btree node is selected in btree_flush_write() by journal -+ * code, delay and retry until the node is flushed by journal code -+ * and BTREE_NODE_journal_flush bit cleared by btree_flush_write(). -+ */ -+ if (btree_node_journal_flush(b)) { -+ pr_debug("bnode %p is flushing by journal, retry", b); -+ mutex_unlock(&b->write_lock); -+ udelay(1); -+ goto retry; -+ } -+ - if (btree_node_dirty(b)) - __bch_btree_node_write(b, &cl); - mutex_unlock(&b->write_lock); -@@ -1081,7 +1094,20 @@ static void btree_node_free(struct btree *b) - - BUG_ON(b == b->c->root); - -+retry: - mutex_lock(&b->write_lock); -+ /* -+ * If the btree node is selected and flushing in btree_flush_write(), -+ * delay and retry until the BTREE_NODE_journal_flush bit cleared, -+ * then it is safe to free the btree node here. Otherwise this btree -+ * node will be in race condition. -+ */ -+ if (btree_node_journal_flush(b)) { -+ mutex_unlock(&b->write_lock); -+ pr_debug("bnode %p journal_flush set, retry", b); -+ udelay(1); -+ goto retry; -+ } - - if (btree_node_dirty(b)) { - btree_complete_write(b, btree_current_write(b)); -diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h -index d1c72ef64edf..76cfd121a486 100644 ---- a/drivers/md/bcache/btree.h -+++ b/drivers/md/bcache/btree.h -@@ -158,11 +158,13 @@ enum btree_flags { - BTREE_NODE_io_error, - BTREE_NODE_dirty, - BTREE_NODE_write_idx, -+ BTREE_NODE_journal_flush, - }; - - BTREE_FLAG(io_error); - BTREE_FLAG(dirty); - BTREE_FLAG(write_idx); -+BTREE_FLAG(journal_flush); - - static inline struct btree_write *btree_current_write(struct btree *b) - { -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 1218e3cada3c..a1e3e1fcea6e 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -430,6 +430,7 @@ static void btree_flush_write(struct cache_set *c) - retry: - best = NULL; - -+ mutex_lock(&c->bucket_lock); - for_each_cached_btree(b, c, i) 
- if (btree_current_write(b)->journal) { - if (!best) -@@ -442,15 +443,21 @@ static void btree_flush_write(struct cache_set *c) - } - - b = best; -+ if (b) -+ set_btree_node_journal_flush(b); -+ mutex_unlock(&c->bucket_lock); -+ - if (b) { - mutex_lock(&b->write_lock); - if (!btree_current_write(b)->journal) { -+ clear_bit(BTREE_NODE_journal_flush, &b->flags); - mutex_unlock(&b->write_lock); - /* We raced */ - goto retry; - } - - __bch_btree_node_write(b, NULL); -+ clear_bit(BTREE_NODE_journal_flush, &b->flags); - mutex_unlock(&b->write_lock); - } - } --- -2.16.4 - diff --git a/for-current/0036-bcache-performance-improvement-for-btree_flush_write.patch b/for-current/0036-bcache-performance-improvement-for-btree_flush_write.patch deleted file mode 100644 index e7d5207..0000000 --- a/for-current/0036-bcache-performance-improvement-for-btree_flush_write.patch +++ /dev/null @@ -1,187 +0,0 @@ -From f2b6d7b2245938b2f08daa7c7f498e439e7ae176 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Thu, 27 Jun 2019 23:07:22 +0800 -Subject: [PATCH 36/37] bcache: performance improvement for btree_flush_write() - -This patch improves performance for btree_flush_write() in following -ways, -- Use another spinlock journal.flush_write_lock to replace the very - hot journal.lock. We don't have to use journal.lock here, selecting - candidate btree nodes takes a lot of time, hold journal.lock here will - block other jouranling threads and drop the overall I/O performance. -- Only select flushing btree node from c->btree_cache list. When the - machine has a large system memory, mca cache may have a huge number of - cached btree nodes. Iterating all the cached nodes will take a lot - of CPU time, and most of the nodes on c->btree_cache_freeable and - c->btree_cache_freed lists are cleared and have need to flush. So only - travel mca list c->btree_cache to select flushing btree node should be - enough for most of the cases. 
-- Don't iterate whole c->btree_cache list, only reversely select first - BTREE_FLUSH_NR btree nodes to flush. Iterate all btree nodes from - c->btree_cache and select the oldest journal pin btree nodes consumes - huge number of CPU cycles if the list is huge (push and pop a node - into/out of a heap is expensive). The last several dirty btree nodes - on the tail of c->btree_cache list are earlest allocated and cached - btree nodes, they are relative to the oldest journal pin btree nodes. - Therefore only flushing BTREE_FLUSH_NR btree nodes from tail of - c->btree_cache probably includes the oldest journal pin btree nodes. - -In my testing, the above change decreases 50%+ CPU consumption when -journal space is full. Some times IOPS drops to 0 for 5-8 seconds, -comparing blocking I/O for 120+ seconds in previous code, this is much -better. Maybe there is room to improve in future, but at this momment -the fix looks fine and performs well in my testing. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 85 +++++++++++++++++++++++++++++++++------------ - drivers/md/bcache/journal.h | 4 +++ - 2 files changed, 67 insertions(+), 22 deletions(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index a1e3e1fcea6e..8bcd8f1bf8cb 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -419,47 +419,87 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) - - static void btree_flush_write(struct cache_set *c) - { -- /* -- * Try to find the btree node with that references the oldest journal -- * entry, best is our current candidate and is locked if non NULL: -- */ -- struct btree *b, *best; -- unsigned int i; -+ struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; -+ unsigned int i, n; -+ -+ if (c->journal.btree_flushing) -+ return; -+ -+ spin_lock(&c->journal.flush_write_lock); -+ if (c->journal.btree_flushing) { -+ spin_unlock(&c->journal.flush_write_lock); -+ return; -+ } -+ 
c->journal.btree_flushing = true; -+ spin_unlock(&c->journal.flush_write_lock); - - atomic_long_inc(&c->flush_write); --retry: -- best = NULL; -+ memset(btree_nodes, 0, sizeof(btree_nodes)); -+ n = 0; - - mutex_lock(&c->bucket_lock); -- for_each_cached_btree(b, c, i) -- if (btree_current_write(b)->journal) { -- if (!best) -- best = b; -- else if (journal_pin_cmp(c, -- btree_current_write(best)->journal, -- btree_current_write(b)->journal)) { -- best = b; -- } -+ list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { -+ if (btree_node_journal_flush(b)) -+ pr_err("BUG: flush_write bit should not be set here!"); -+ -+ mutex_lock(&b->write_lock); -+ -+ if (!btree_node_dirty(b)) { -+ mutex_unlock(&b->write_lock); -+ continue; -+ } -+ -+ if (!btree_current_write(b)->journal) { -+ mutex_unlock(&b->write_lock); -+ continue; - } - -- b = best; -- if (b) - set_btree_node_journal_flush(b); -+ -+ mutex_unlock(&b->write_lock); -+ -+ btree_nodes[n++] = b; -+ if (n == BTREE_FLUSH_NR) -+ break; -+ } - mutex_unlock(&c->bucket_lock); - -- if (b) { -+ for (i = 0; i < n; i++) { -+ b = btree_nodes[i]; -+ if (!b) { -+ pr_err("BUG: btree_nodes[%d] is NULL", i); -+ continue; -+ } -+ -+ /* safe to check without holding b->write_lock */ -+ if (!btree_node_journal_flush(b)) { -+ pr_err("BUG: bnode %p: journal_flush bit cleaned", b); -+ continue; -+ } -+ - mutex_lock(&b->write_lock); - if (!btree_current_write(b)->journal) { - clear_bit(BTREE_NODE_journal_flush, &b->flags); - mutex_unlock(&b->write_lock); -- /* We raced */ -- goto retry; -+ pr_debug("bnode %p: written by others", b); -+ continue; -+ } -+ -+ if (!btree_node_dirty(b)) { -+ clear_bit(BTREE_NODE_journal_flush, &b->flags); -+ mutex_unlock(&b->write_lock); -+ pr_debug("bnode %p: dirty bit cleaned by others", b); -+ continue; - } - - __bch_btree_node_write(b, NULL); - clear_bit(BTREE_NODE_journal_flush, &b->flags); - mutex_unlock(&b->write_lock); - } -+ -+ spin_lock(&c->journal.flush_write_lock); -+ 
c->journal.btree_flushing = false; -+ spin_unlock(&c->journal.flush_write_lock); - } - - #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) -@@ -881,6 +921,7 @@ int bch_journal_alloc(struct cache_set *c) - struct journal *j = &c->journal; - - spin_lock_init(&j->lock); -+ spin_lock_init(&j->flush_write_lock); - INIT_DELAYED_WORK(&j->work, journal_write_work); - - c->journal_delay_ms = 100; -diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h -index 66f0facff84b..f2ea34d5f431 100644 ---- a/drivers/md/bcache/journal.h -+++ b/drivers/md/bcache/journal.h -@@ -103,6 +103,8 @@ struct journal_write { - /* Embedded in struct cache_set */ - struct journal { - spinlock_t lock; -+ spinlock_t flush_write_lock; -+ bool btree_flushing; - /* used when waiting because the journal was full */ - struct closure_waitlist wait; - struct closure io; -@@ -154,6 +156,8 @@ struct journal_device { - struct bio_vec bv[8]; - }; - -+#define BTREE_FLUSH_NR 8 -+ - #define journal_pin_cmp(c, l, r) \ - (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) - --- -2.16.4 - diff --git a/for-current/0037-bcache-add-reclaimed_journal_buckets-to-struct-cache.patch b/for-current/0037-bcache-add-reclaimed_journal_buckets-to-struct-cache.patch deleted file mode 100644 index efc95f0..0000000 --- a/for-current/0037-bcache-add-reclaimed_journal_buckets-to-struct-cache.patch +++ /dev/null @@ -1,80 +0,0 @@ -From ea4bf18c9eb2ef705dce00b1bc5fde2f49ef2740 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 2 Jun 2019 00:47:23 +0800 -Subject: [PATCH 37/37] bcache: add reclaimed_journal_buckets to struct - cache_set - -Now we have counters for how many times jouranl is reclaimed, how many -times cached dirty btree nodes are flushed, but we don't know how many -jouranl buckets are really reclaimed. 
- -This patch adds reclaimed_journal_buckets into struct cache_set, this -is an increasing only counter, to tell how many journal buckets are -reclaimed since cache set runs. From all these three counters (reclaim, -reclaimed_journal_buckets, flush_write), we can have idea how well -current journal space reclaim code works. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bcache.h | 1 + - drivers/md/bcache/journal.c | 1 + - drivers/md/bcache/sysfs.c | 5 +++++ - 3 files changed, 7 insertions(+) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 35396248a7d5..013e35a9e317 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -705,6 +705,7 @@ struct cache_set { - atomic_long_t writeback_keys_failed; - - atomic_long_t reclaim; -+ atomic_long_t reclaimed_journal_buckets; - atomic_long_t flush_write; - - enum { -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 8bcd8f1bf8cb..be2a2a201603 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -614,6 +614,7 @@ static void journal_reclaim(struct cache_set *c) - k->ptr[n++] = MAKE_PTR(0, - bucket_to_sector(c, ca->sb.d[ja->cur_idx]), - ca->sb.nr_this_dev); -+ atomic_long_inc(&c->reclaimed_journal_buckets); - } - - if (n) { -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index 701a386a954c..9f0826712845 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -82,6 +82,7 @@ read_attribute(bset_tree_stats); - read_attribute(state); - read_attribute(cache_read_races); - read_attribute(reclaim); -+read_attribute(reclaimed_journal_buckets); - read_attribute(flush_write); - read_attribute(writeback_keys_done); - read_attribute(writeback_keys_failed); -@@ -705,6 +706,9 @@ SHOW(__bch_cache_set) - sysfs_print(reclaim, - atomic_long_read(&c->reclaim)); - -+ sysfs_print(reclaimed_journal_buckets, -+ atomic_long_read(&c->reclaimed_journal_buckets)); -+ - 
sysfs_print(flush_write, - atomic_long_read(&c->flush_write)); - -@@ -931,6 +935,7 @@ static struct attribute *bch_cache_set_internal_files[] = { - &sysfs_bset_tree_stats, - &sysfs_cache_read_races, - &sysfs_reclaim, -+ &sysfs_reclaimed_journal_buckets, - &sysfs_flush_write, - &sysfs_writeback_keys_done, - &sysfs_writeback_keys_failed, --- -2.16.4 - diff --git a/for-next/0001-bcache-add-cond_resched-in-__bch_cache_cmp.patch b/for-next/0001-bcache-add-cond_resched-in-__bch_cache_cmp.patch new file mode 100644 index 0000000..edd8fb6 --- /dev/null +++ b/for-next/0001-bcache-add-cond_resched-in-__bch_cache_cmp.patch @@ -0,0 +1,29 @@ +From: Shile Zhang <shile.zhang@linux.alibaba.com> +Date: Thu, 15 Aug 2019 00:51:51 +0800 +Subject: [PATCH] bcache: add cond_resched() in __bch_cache_cmp() + +Read /sys/fs/bcache/<uuid>/cacheN/priority_stats can take very long +time with huge cache after long run. + +Signed-off-by: Shile Zhang <shile.zhang@linux.alibaba.com> +Tested-by: Heitor Alves de Siqueira <halves@canonical.com> +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/sysfs.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 9f0826712845..6b29e34acf7a 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -960,6 +960,7 @@ KTYPE(bch_cache_set_internal); + + static int __bch_cache_cmp(const void *l, const void *r) + { ++ cond_resched(); + return *((uint16_t *)r) - *((uint16_t *)l); + } + +-- +2.16.4 + diff --git a/for-next/0001-closures-fix-a-race-on-wakeup-from-closure_sync.patch b/for-next/0001-closures-fix-a-race-on-wakeup-from-closure_sync.patch new file mode 100644 index 0000000..44096e4 --- /dev/null +++ b/for-next/0001-closures-fix-a-race-on-wakeup-from-closure_sync.patch @@ -0,0 +1,35 @@ +From 3c3c34a87be58548a302573dbe32b518f047db09 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet <kent.overstreet@gmail.com> +Date: Mon, 10 Jun 2019 15:14:20 -0400 +Subject: [PATCH] 
closures: fix a race on wakeup from closure_sync + +Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com> +Acked-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/closure.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c +index 73f5319295bc..c12cd809ab19 100644 +--- a/drivers/md/bcache/closure.c ++++ b/drivers/md/bcache/closure.c +@@ -105,8 +105,14 @@ struct closure_syncer { + + static void closure_sync_fn(struct closure *cl) + { +- cl->s->done = 1; +- wake_up_process(cl->s->task); ++ struct closure_syncer *s = cl->s; ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = READ_ONCE(s->task); ++ s->done = 1; ++ wake_up_process(p); ++ rcu_read_unlock(); + } + + void __sched __closure_sync(struct closure *cl) +-- +2.16.4 + diff --git a/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch b/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch new file mode 100644 index 0000000..81646a5 --- /dev/null +++ b/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch @@ -0,0 +1,149 @@ +From fb31000daa352493b206064a5e8c4bcaac0e6b6c Mon Sep 17 00:00:00 2001 +From: Andrea Righi <andrea.righi@canonical.com> +Date: Wed, 7 Aug 2019 12:38:06 +0200 +Subject: [PATCH] bcache: fix deadlock in bcache_allocator + +bcache_allocator can call the following: + + bch_allocator_thread() + -> bch_prio_write() + -> bch_bucket_alloc() + -> wait on &ca->set->bucket_wait + +But the wake up event on bucket_wait is supposed to come from +bch_allocator_thread() itself => deadlock: + +[ 1158.490744] INFO: task bcache_allocato:15861 blocked for more than 10 seconds. +[ 1158.495929] Not tainted 5.3.0-050300rc3-generic #201908042232 +[ 1158.500653] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
+[ 1158.504413] bcache_allocato D 0 15861 2 0x80004000 +[ 1158.504419] Call Trace: +[ 1158.504429] __schedule+0x2a8/0x670 +[ 1158.504432] schedule+0x2d/0x90 +[ 1158.504448] bch_bucket_alloc+0xe5/0x370 [bcache] +[ 1158.504453] ? wait_woken+0x80/0x80 +[ 1158.504466] bch_prio_write+0x1dc/0x390 [bcache] +[ 1158.504476] bch_allocator_thread+0x233/0x490 [bcache] +[ 1158.504491] kthread+0x121/0x140 +[ 1158.504503] ? invalidate_buckets+0x890/0x890 [bcache] +[ 1158.504506] ? kthread_park+0xb0/0xb0 +[ 1158.504510] ret_from_fork+0x35/0x40 + +Fix by making the call to bch_prio_write() non-blocking, so that +bch_allocator_thread() never waits on itself. + +Moreover, make sure to wake up the garbage collector thread when +bch_prio_write() is failing to allocate buckets. + +BugLink: https://bugs.launchpad.net/bugs/1784665 +BugLink: https://bugs.launchpad.net/bugs/1796292 +Signed-off-by: Andrea Righi <andrea.righi@canonical.com> +--- + drivers/md/bcache/alloc.c | 5 ++++- + drivers/md/bcache/bcache.h | 2 +- + drivers/md/bcache/super.c | 27 +++++++++++++++++++++------ + 3 files changed, 26 insertions(+), 8 deletions(-) + +diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c +index 6f776823b9ba..a1df0d95151c 100644 +--- a/drivers/md/bcache/alloc.c ++++ b/drivers/md/bcache/alloc.c +@@ -377,7 +377,10 @@ static int bch_allocator_thread(void *arg) + if (!fifo_full(&ca->free_inc)) + goto retry_invalidate; + +- bch_prio_write(ca); ++ if (bch_prio_write(ca, false) < 0) { ++ ca->invalidate_needs_gc = 1; ++ wake_up_gc(ca->set); ++ } + } + } + out: +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 013e35a9e317..deb924e1d790 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -977,7 +977,7 @@ bool bch_cached_dev_error(struct cached_dev *dc); + __printf(2, 3) + bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); + +-void bch_prio_write(struct cache *ca); ++int bch_prio_write(struct cache *ca, bool wait); + void 
bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); + + extern struct workqueue_struct *bcache_wq; +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 20ed838e9413..bd153234290d 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -529,12 +529,29 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, + closure_sync(cl); + } + +-void bch_prio_write(struct cache *ca) ++int bch_prio_write(struct cache *ca, bool wait) + { + int i; + struct bucket *b; + struct closure cl; + ++ pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu", ++ fifo_used(&ca->free[RESERVE_PRIO]), ++ fifo_used(&ca->free[RESERVE_NONE]), ++ fifo_used(&ca->free_inc)); ++ ++ /* ++ * Pre-check if there are enough free buckets. In the non-blocking ++ * scenario it's better to fail early rather than starting to allocate ++ * buckets and do a cleanup later in case of failure. ++ */ ++ if (!wait) { ++ size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) + ++ fifo_used(&ca->free[RESERVE_NONE]); ++ if (prio_buckets(ca) > avail) ++ return -ENOMEM; ++ } ++ + closure_init_stack(&cl); + + lockdep_assert_held(&ca->set->bucket_lock); +@@ -544,9 +561,6 @@ void bch_prio_write(struct cache *ca) + atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), + &ca->meta_sectors_written); + +- //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), +- // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); +- + for (i = prio_buckets(ca) - 1; i >= 0; --i) { + long bucket; + struct prio_set *p = ca->disk_buckets; +@@ -564,7 +578,7 @@ void bch_prio_write(struct cache *ca) + p->magic = pset_magic(&ca->sb); + p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); + +- bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); ++ bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait); + BUG_ON(bucket == -1); + + mutex_unlock(&ca->set->bucket_lock); +@@ -593,6 +607,7 @@ void bch_prio_write(struct cache *ca) + + ca->prio_last_buckets[i] = ca->prio_buckets[i]; + } 
++ return 0; + } + + static void prio_read(struct cache *ca, uint64_t bucket) +@@ -1954,7 +1969,7 @@ static int run_cache_set(struct cache_set *c) + + mutex_lock(&c->bucket_lock); + for_each_cache(ca, c, i) +- bch_prio_write(ca); ++ bch_prio_write(ca, true); + mutex_unlock(&c->bucket_lock); + + err = "cannot allocate new UUID bucket"; +-- +2.16.4 + diff --git a/for-test/0001-bcache-introduce-btree_cache_total_pages-into-struct.patch b/for-test/0001-bcache-introduce-btree_cache_total_pages-into-struct.patch new file mode 100644 index 0000000..3f8238f --- /dev/null +++ b/for-test/0001-bcache-introduce-btree_cache_total_pages-into-struct.patch @@ -0,0 +1,110 @@ +From db1fb64ff29474b18e07a7a3887e326dd2b891b5 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 11 Jul 2019 22:56:40 +0800 +Subject: [PATCH] bcache: introduce btree_cache_total_pages into struct + cache_set + +A new member "atomic_long_t btree_cache_total_pages" is added into +struct cache_set, to record total page numbers occupied by bcache +internal btree node cache. + +When mca_data_alloc() is called to allocate pages for btree node cache, +the allocated pages number is added to btree_cache_total_pages. When +mca_data_free() is called to free pages of a btree node cache, the +freed pages number is subtracted from btree_cache_total_pages. + +Then in sysfs.c:bch_cache_size(), when calculating the total pages +occupied by bcache btree node cache, it is unnecessary to iterate list +c->btree_cache, c->btree_cache_total_pages can be directly used now. + +Now reading /sys/fs/bcache/<cache set UUID>/btree_cache_size is faster, +and this patch is also a preparation to limit pages consumption by the +bcache internal btree node cache. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 1 + + drivers/md/bcache/btree.c | 12 ++++++++---- + drivers/md/bcache/super.c | 1 + + drivers/md/bcache/sysfs.c | 9 ++------- + 4 files changed, 12 insertions(+), 11 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 013e35a9e317..fe5ff867725c 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -570,6 +570,7 @@ struct cache_set { + struct list_head btree_cache; + struct list_head btree_cache_freeable; + struct list_head btree_cache_freed; ++ atomic_long_t btree_cache_total_pages; + + /* Number of elements in btree_cache + btree_cache_freeable lists */ + unsigned int btree_cache_used; +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index ba434d9ac720..df800e3e2dba 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -578,6 +578,8 @@ static void mca_data_free(struct btree *b) + + bch_btree_keys_free(&b->keys); + ++ atomic_long_sub(1<<b->keys.page_order, ++ &b->c->btree_cache_total_pages); + b->c->btree_cache_used--; + list_move(&b->list, &b->c->btree_cache_freed); + } +@@ -598,11 +600,13 @@ static unsigned int btree_order(struct bkey *k) + + static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) + { +- if (!bch_btree_keys_alloc(&b->keys, +- max_t(unsigned int, ++ unsigned int page_order = max_t(unsigned int, + ilog2(b->c->btree_pages), +- btree_order(k)), +- gfp)) { ++ btree_order(k)); ++ ++ if (!bch_btree_keys_alloc(&b->keys, page_order, gfp)) { ++ atomic_long_add(1 << page_order, ++ &b->c->btree_cache_total_pages); + b->c->btree_cache_used++; + list_move(&b->list, &b->c->btree_cache); + } else { +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 26e374fbf57c..c67013b116a3 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1781,6 +1781,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) + 
INIT_LIST_HEAD(&c->btree_cache); + INIT_LIST_HEAD(&c->btree_cache_freeable); + INIT_LIST_HEAD(&c->btree_cache_freed); ++ atomic_long_set(&c->btree_cache_total_pages, 0); + INIT_LIST_HEAD(&c->data_buckets); + + iter_size = (sb->bucket_size / sb->block_size + 1) * +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 9f0826712845..f5065b30c994 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -629,14 +629,9 @@ static unsigned int bch_root_usage(struct cache_set *c) + + static size_t bch_cache_size(struct cache_set *c) + { +- size_t ret = 0; +- struct btree *b; +- +- mutex_lock(&c->bucket_lock); +- list_for_each_entry(b, &c->btree_cache, list) +- ret += 1 << (b->keys.page_order + PAGE_SHIFT); ++ size_t ret; + +- mutex_unlock(&c->bucket_lock); ++ ret = atomic_long_read(&c->btree_cache_total_pages) << PAGE_SHIFT; + return ret; + } + +-- +2.16.4 + diff --git a/for-test/0001-bcache-only-set-b-accessed-1-for-dirty-btree-node-ca.patch b/for-test/0001-bcache-only-set-b-accessed-1-for-dirty-btree-node-ca.patch new file mode 100644 index 0000000..7ccd838 --- /dev/null +++ b/for-test/0001-bcache-only-set-b-accessed-1-for-dirty-btree-node-ca.patch @@ -0,0 +1,28 @@ +From 779bada095ec02a9bd400bc0a46039c4ead6c00d Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 2 Jul 2019 22:30:29 +0800 +Subject: [PATCH] bcache: only set b->accessed = 1 for dirty btree node cache + +--- + drivers/md/bcache/btree.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index ba434d9ac720..1497f1114b10 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -1058,7 +1058,10 @@ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, + BUG_ON(!b->written); + + b->parent = parent; +- b->accessed = 1; ++ ++ /* make clean btree node more easier to be reclaim */ ++ if (!write) ++ b->accessed = 1; + + for (; i <= b->keys.nsets 
&& b->keys.set[i].size; i++) { + prefetch(b->keys.set[i].tree); +-- +2.16.4 + diff --git a/for-test/mca_limit/0001-bcache-introduce-btree_cache_total_pages-into-struct.patch b/for-test/mca_limit/0001-bcache-introduce-btree_cache_total_pages-into-struct.patch new file mode 100644 index 0000000..3f8238f --- /dev/null +++ b/for-test/mca_limit/0001-bcache-introduce-btree_cache_total_pages-into-struct.patch @@ -0,0 +1,110 @@ +From db1fb64ff29474b18e07a7a3887e326dd2b891b5 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 11 Jul 2019 22:56:40 +0800 +Subject: [PATCH] bcache: introduce btree_cache_total_pages into struct + cache_set + +A new member "atomic_long_t btree_cache_total_pages" is added into +struct cache_set, to record total page numbers occupied by bcache +internal btree node cache. + +When mca_data_alloc() is called to allocate pages for btree node cache, +the allocated pages number is added to btree_cache_total_pages. When +mca_data_free() is called to free pages of a btree node cache, the +freed pages number is subtracted from btree_cache_total_pages. + +Then in sysfs.c:bch_cache_size(), when calculating the total pages +occupied by bcache btree node cache, it is unnecessary to iterate list +c->btree_cache, c->btree_cache_total_pages can be directly used now. + +Now reading /sys/fs/bcache/<cache set UUID>/btree_cache_size is faster, +and this patch is also a preparation to limit pages consumption by the +bcache internal btree node cache. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 1 + + drivers/md/bcache/btree.c | 12 ++++++++---- + drivers/md/bcache/super.c | 1 + + drivers/md/bcache/sysfs.c | 9 ++------- + 4 files changed, 12 insertions(+), 11 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 013e35a9e317..fe5ff867725c 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -570,6 +570,7 @@ struct cache_set { + struct list_head btree_cache; + struct list_head btree_cache_freeable; + struct list_head btree_cache_freed; ++ atomic_long_t btree_cache_total_pages; + + /* Number of elements in btree_cache + btree_cache_freeable lists */ + unsigned int btree_cache_used; +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index ba434d9ac720..df800e3e2dba 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -578,6 +578,8 @@ static void mca_data_free(struct btree *b) + + bch_btree_keys_free(&b->keys); + ++ atomic_long_sub(1<<b->keys.page_order, ++ &b->c->btree_cache_total_pages); + b->c->btree_cache_used--; + list_move(&b->list, &b->c->btree_cache_freed); + } +@@ -598,11 +600,13 @@ static unsigned int btree_order(struct bkey *k) + + static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) + { +- if (!bch_btree_keys_alloc(&b->keys, +- max_t(unsigned int, ++ unsigned int page_order = max_t(unsigned int, + ilog2(b->c->btree_pages), +- btree_order(k)), +- gfp)) { ++ btree_order(k)); ++ ++ if (!bch_btree_keys_alloc(&b->keys, page_order, gfp)) { ++ atomic_long_add(1 << page_order, ++ &b->c->btree_cache_total_pages); + b->c->btree_cache_used++; + list_move(&b->list, &b->c->btree_cache); + } else { +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 26e374fbf57c..c67013b116a3 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1781,6 +1781,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) + 
INIT_LIST_HEAD(&c->btree_cache); + INIT_LIST_HEAD(&c->btree_cache_freeable); + INIT_LIST_HEAD(&c->btree_cache_freed); ++ atomic_long_set(&c->btree_cache_total_pages, 0); + INIT_LIST_HEAD(&c->data_buckets); + + iter_size = (sb->bucket_size / sb->block_size + 1) * +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 9f0826712845..f5065b30c994 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -629,14 +629,9 @@ static unsigned int bch_root_usage(struct cache_set *c) + + static size_t bch_cache_size(struct cache_set *c) + { +- size_t ret = 0; +- struct btree *b; +- +- mutex_lock(&c->bucket_lock); +- list_for_each_entry(b, &c->btree_cache, list) +- ret += 1 << (b->keys.page_order + PAGE_SHIFT); ++ size_t ret; + +- mutex_unlock(&c->bucket_lock); ++ ret = atomic_long_read(&c->btree_cache_total_pages) << PAGE_SHIFT; + return ret; + } + +-- +2.16.4 + diff --git a/for-test/mca_limit/0001-bcache-restrict-mca-pages-consumption.patch b/for-test/mca_limit/0001-bcache-restrict-mca-pages-consumption.patch new file mode 100644 index 0000000..7996bd4 --- /dev/null +++ b/for-test/mca_limit/0001-bcache-restrict-mca-pages-consumption.patch @@ -0,0 +1,149 @@ +From 82cfcce9f62cfb5c4a00d774525a33531a6e7091 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 23 Jul 2019 01:11:28 +0800 +Subject: [PATCH] bcache: restrict mca pages consumption + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 4 +-- + drivers/md/bcache/btree.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/btree.h | 6 +++++ + drivers/md/bcache/super.c | 2 +- + 4 files changed, 72 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index fe5ff867725c..c245b656a057 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -498,7 +498,7 @@ struct gc_stat { + #define CACHE_SET_STOPPING 1 + #define CACHE_SET_RUNNING 2 + #define 
CACHE_SET_IO_DISABLE 3 +- ++#define CACHE_SET_MCA_SHRINKING 4 + struct cache_set { + struct closure cl; + +@@ -571,9 +571,9 @@ struct cache_set { + struct list_head btree_cache_freeable; + struct list_head btree_cache_freed; + atomic_long_t btree_cache_total_pages; +- + /* Number of elements in btree_cache + btree_cache_freeable lists */ + unsigned int btree_cache_used; ++ struct work_struct btree_cache_shrink_work; + + /* + * If we need to allocate memory for a new btree node and that +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index df800e3e2dba..29d33d42096b 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -827,6 +827,69 @@ void bch_btree_cache_free(struct cache_set *c) + mutex_unlock(&c->bucket_lock); + } + ++static inline unsigned int get_mca_consume_percent(struct cache_set *c) ++{ ++ return (unsigned int) ++ (atomic_long_read(&c->btree_cache_total_pages) * 100) / ++ totalram_pages(); ++} ++ ++void bch_mca_shrink_work(struct work_struct *w) ++{ ++ unsigned int mca_consume_percent, shrink_target_percent; ++ unsigned int loop_nr = 0; ++ struct cache_set *c = ++ container_of(w, struct cache_set, btree_cache_shrink_work); ++ ++ if (c->shrinker_disabled) { ++ pr_info_ratelimited("shrinker_disabled set, no shrink"); ++ goto out; ++ } ++ ++ mca_consume_percent = get_mca_consume_percent(c); ++ ++ if (mca_consume_percent >= BCH_MCA_WMARK_HIGH) ++ shrink_target_percent = ++ BCH_MCA_WMARK_LOW - BCH_MCA_SHRINK_HYSTERESIS; ++ else if (mca_consume_percent >= BCH_MCA_WMARK_LOW) ++ shrink_target_percent = ++ mca_consume_percent - BCH_MCA_SHRINK_HYSTERESIS; ++ else { ++ pr_info("mca_consume_percent: %u%%, not shrink", ++ mca_consume_percent); ++ goto out; ++ } ++ ++ while (mca_consume_percent > shrink_target_percent) { ++ struct shrink_control sc; ++ ++ sc.gfp_mask = GFP_KERNEL; ++ sc.nr_to_scan = c->btree_cache_used * c->btree_pages / 10; ++ c->shrink.scan_objects(&c->shrink, &sc); ++ mca_consume_percent = 
get_mca_consume_percent(c); ++ loop_nr++; ++ } ++ ++ pr_info("loop_nr: %u for consum_percent(%u):target_percent(%u)", ++ loop_nr, mca_consume_percent, shrink_target_percent); ++ ++out: ++ if(!test_and_clear_bit(CACHE_SET_MCA_SHRINKING, &c->flags)) ++ WARN(1, "CACHE_SET_MCA_SHRINKING cleared already"); ++ ++ closure_put(&c->cl); ++} ++ ++/* Proactively shrink mca pages for low watermark */ ++void bch_mca_cache_shrink(struct cache_set *c) ++{ ++ if (test_and_set_bit(CACHE_SET_MCA_SHRINKING, &c->flags)) ++ return; ++ ++ closure_get(&c->cl); ++ queue_work(system_wq, &c->btree_cache_shrink_work); ++} ++ + int bch_btree_cache_alloc(struct cache_set *c) + { + unsigned int i; +diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h +index 76cfd121a486..665a111ad9ce 100644 +--- a/drivers/md/bcache/btree.h ++++ b/drivers/md/bcache/btree.h +@@ -102,6 +102,10 @@ + #include "bset.h" + #include "debug.h" + ++#define BCH_MCA_WMARK_HIGH 80 ++#define BCH_MCA_WMARK_LOW 60 ++#define BCH_MCA_SHRINK_HYSTERESIS 10 ++ + struct btree_write { + atomic_t *journal; + +@@ -331,4 +335,6 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, + struct bkey *end, + keybuf_pred_fn *pred); + void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats); ++void bch_mca_cache_shrink(struct cache_set *c); ++void bch_mca_shrink_work(struct work_struct *w); + #endif +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index c67013b116a3..87c44e3475cd 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1783,7 +1783,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) + INIT_LIST_HEAD(&c->btree_cache_freed); + atomic_long_set(&c->btree_cache_total_pages, 0); + INIT_LIST_HEAD(&c->data_buckets); +- ++ INIT_WORK(&c->btree_cache_shrink_work, bch_mca_shrink_work); + iter_size = (sb->bucket_size / sb->block_size + 1) * + sizeof(struct btree_iter_set); + +-- +2.16.4 + |