diff options
author | Coly Li <colyli@suse.de> | 2019-06-11 21:40:40 +0800 |
---|---|---|
committer | Coly Li <colyli@suse.de> | 2019-06-11 21:40:40 +0800 |
commit | f520123a62a6ff495d60cc22babd77c3685f3029 (patch) | |
tree | 45867ee465e0472e36e972f627549fffdc312f3f | |
parent | ab6809b961f320d7bc80af3b81a10fe20e894c54 (diff) | |
download | bcache-patches-f520123a62a6ff495d60cc22babd77c3685f3029.tar.gz |
for-test: remove out of date patches
28 files changed, 0 insertions, 3096 deletions
diff --git a/for-test/0001-bcache-add-w_data_avg.patch b/for-test/0001-bcache-add-w_data_avg.patch deleted file mode 100644 index 3b6e089..0000000 --- a/for-test/0001-bcache-add-w_data_avg.patch +++ /dev/null @@ -1,91 +0,0 @@ -From 47e164ffcae5dc3e03bff72a0787652fa5aaf057 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Fri, 7 Dec 2018 23:52:39 +0800 -Subject: [PATCH] bcache: add w_data_avg - -To record average write size for journal w[]->data. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 8 +++++++- - drivers/md/bcache/journal.h | 1 + - drivers/md/bcache/sysfs.c | 4 ++++ - 3 files changed, 12 insertions(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 522c7426f3a0..569aa1484ac0 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -613,7 +613,7 @@ static void journal_write_unlocked(struct closure *cl) - struct bkey *k = &c->journal.key; - unsigned int i, sectors = set_blocks(w->data, block_bytes(c)) * - c->sb.block_size; -- -+ int w_data_avg; - struct bio *bio; - struct bio_list list; - -@@ -671,6 +671,11 @@ static void journal_write_unlocked(struct closure *cl) - ca->journal.seq[ca->journal.cur_idx] = w->data->seq; - } - -+ /* record average size of written w->data in sectors */ -+ w_data_avg = atomic_read(&c->journal.w_data_avg); -+ w_data_avg = ewma_add(w_data_avg, sectors, 8, 4); -+ atomic_set(&c->journal.w_data_avg, w_data_avg); -+ - atomic_dec_bug(&fifo_back(&c->journal.pin)); - bch_journal_next(&c->journal); - journal_reclaim(c); -@@ -845,6 +850,7 @@ int bch_journal_alloc(struct cache_set *c) - - j->w[0].c = c; - j->w[1].c = c; -+ atomic_set(&j->w_data_avg, 0); - - if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) || - !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || -diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h -index 66f0facff84b..3be9d7f72d5a 100644 ---- a/drivers/md/bcache/journal.h -+++ 
b/drivers/md/bcache/journal.h -@@ -117,6 +117,7 @@ struct journal { - BKEY_PADDED(key); - - struct journal_write w[2], *cur; -+ atomic_t w_data_avg; - }; - - /* -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index 26f035a0c5b9..d3b56cd3b794 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -67,6 +67,7 @@ read_attribute(written); - read_attribute(btree_written); - read_attribute(metadata_written); - read_attribute(active_journal_entries); -+read_attribute(w_data_avg); - - sysfs_time_stats_attribute(btree_gc, sec, ms); - sysfs_time_stats_attribute(btree_split, sec, us); -@@ -669,6 +670,8 @@ SHOW(__bch_cache_set) - c->congested_write_threshold_us); - - sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); -+ sysfs_print(w_data_avg, -+ atomic_read(&c->journal.w_data_avg)); - sysfs_printf(verify, "%i", c->verify); - sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); - sysfs_printf(expensive_debug_checks, -@@ -841,6 +844,7 @@ KTYPE(bch_cache_set); - - static struct attribute *bch_cache_set_internal_files[] = { - &sysfs_active_journal_entries, -+ &sysfs_w_data_avg, - - sysfs_time_stats_attribute_list(btree_gc, sec, ms) - sysfs_time_stats_attribute_list(btree_split, sec, us) --- -2.16.4 - diff --git a/for-test/0023-bcache-use-bcache_mod_wq-to-replace-system-wide-syst.patch b/for-test/0023-bcache-use-bcache_mod_wq-to-replace-system-wide-syst.patch deleted file mode 100644 index ecb5500..0000000 --- a/for-test/0023-bcache-use-bcache_mod_wq-to-replace-system-wide-syst.patch +++ /dev/null @@ -1,107 +0,0 @@ -From fa0d3525fd1572c44f2568513670dc7742c62ccd Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 2 Jun 2019 00:36:18 +0800 -Subject: [PATCH 23/32] bcache: use bcache_mod_wq to replace system wide - system_wq - -to avoid blocking happens in bcache worker blocks other kernel -subsystem kworker (e.g. network). 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 21 ++++++++++++++------- - 1 file changed, 14 insertions(+), 7 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 97dbe3151a9c..915ff9365ec6 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -47,6 +47,7 @@ static LIST_HEAD(uncached_devices); - static int bcache_major; - static DEFINE_IDA(bcache_device_idx); - static wait_queue_head_t unregister_wait; -+struct workqueue_struct *bcache_mod_wq; - struct workqueue_struct *bcache_wq; - struct workqueue_struct *bch_journal_wq; - -@@ -1260,7 +1261,7 @@ static void cached_dev_flush(struct closure *cl) - bch_cache_accounting_destroy(&dc->accounting); - kobject_del(&d->kobj); - -- continue_at(cl, cached_dev_free, system_wq); -+ continue_at(cl, cached_dev_free, bcache_mod_wq); - } - - static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) -@@ -1272,7 +1273,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) - __module_get(THIS_MODULE); - INIT_LIST_HEAD(&dc->list); - closure_init(&dc->disk.cl, NULL); -- set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); -+ set_closure_fn(&dc->disk.cl, cached_dev_flush, bcache_mod_wq); - kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); - INIT_WORK(&dc->detach, cached_dev_detach_finish); - sema_init(&dc->sb_write_mutex, 1); -@@ -1395,7 +1396,7 @@ static void flash_dev_flush(struct closure *cl) - bcache_device_unlink(d); - mutex_unlock(&bch_register_lock); - kobject_del(&d->kobj); -- continue_at(cl, flash_dev_free, system_wq); -+ continue_at(cl, flash_dev_free, bcache_mod_wq); - } - - static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) -@@ -1406,7 +1407,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) - return -ENOMEM; - - closure_init(&d->cl, NULL); -- set_closure_fn(&d->cl, flash_dev_flush, system_wq); -+ set_closure_fn(&d->cl, flash_dev_flush, 
bcache_mod_wq); - - kobject_init(&d->kobj, &bch_flash_dev_ktype); - -@@ -1714,7 +1715,7 @@ static void __cache_set_unregister(struct closure *cl) - - mutex_unlock(&bch_register_lock); - -- continue_at(cl, cache_set_flush, system_wq); -+ continue_at(cl, cache_set_flush, bcache_mod_wq); - } - - void bch_cache_set_stop(struct cache_set *c) -@@ -1743,10 +1744,10 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) - - __module_get(THIS_MODULE); - closure_init(&c->cl, NULL); -- set_closure_fn(&c->cl, cache_set_free, system_wq); -+ set_closure_fn(&c->cl, cache_set_free, bcache_mod_wq); - - closure_init(&c->caching, &c->cl); -- set_closure_fn(&c->caching, __cache_set_unregister, system_wq); -+ set_closure_fn(&c->caching, __cache_set_unregister, bcache_mod_wq); - - /* Maybe create continue_at_noreturn() and use it here? */ - closure_set_stopped(&c->cl); -@@ -2583,6 +2584,8 @@ static void bcache_exit(void) - bch_request_exit(); - if (bcache_kobj) - kobject_put(bcache_kobj); -+ if (bcache_mod_wq) -+ destroy_workqueue(bcache_mod_wq); - if (bcache_wq) - destroy_workqueue(bcache_wq); - if (bch_journal_wq) -@@ -2642,6 +2645,10 @@ static int __init bcache_init(void) - return bcache_major; - } - -+ bcache_mod_wq = alloc_workqueue("bcache_mod_wq", WQ_MEM_RECLAIM, 0); -+ if (!bcache_mod_wq) -+ goto err; -+ - bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0); - if (!bcache_wq) - goto err; --- -2.16.4 - diff --git a/for-test/0026-bcache-move-dc-io_disable-into-dc-flags.patch b/for-test/0026-bcache-move-dc-io_disable-into-dc-flags.patch deleted file mode 100644 index 705b89a..0000000 --- a/for-test/0026-bcache-move-dc-io_disable-into-dc-flags.patch +++ /dev/null @@ -1,170 +0,0 @@ -From 3153d5b784eb8a6008cbd7a6087d8eaf1e8f9fe8 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 2 Jun 2019 01:41:01 +0800 -Subject: [PATCH 26/32] bcache: move dc->io_disable into dc->flags - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bcache.h | 3 ++- - 
drivers/md/bcache/request.c | 4 ++-- - drivers/md/bcache/super.c | 36 ++++++++++++++++++++++-------------- - drivers/md/bcache/sysfs.c | 9 +++++++-- - 4 files changed, 33 insertions(+), 19 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 013e35a9e317..ccfc3b245462 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -362,7 +362,8 @@ struct cached_dev { - unsigned int sequential_cutoff; - unsigned int readahead; - -- unsigned int io_disable:1; -+#define CACHED_DEV_IO_DISABLED 0 -+ unsigned long flags; - unsigned int verify:1; - unsigned int bypass_torture_test:1; - -diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c -index 41adcd1546f1..4bdf5be04c0a 100644 ---- a/drivers/md/bcache/request.c -+++ b/drivers/md/bcache/request.c -@@ -1175,7 +1175,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, - int rw = bio_data_dir(bio); - - if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || -- dc->io_disable)) { -+ test_bit(CACHED_DEV_IO_DISABLED, &dc->flags))) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - return BLK_QC_T_NONE; -@@ -1236,7 +1236,7 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, - { - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - -- if (dc->io_disable) -+ if (test_bit(CACHED_DEV_IO_DISABLED, &dc->flags)) - return -EIO; - - return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index bf28a51dbdea..c219a1aeef02 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -888,10 +888,11 @@ static int cached_dev_status_update(void *arg) - - /* - * If this delayed worker is stopping outside, directly quit here. -- * dc->io_disable might be set via sysfs interface, so check it -- * here too. -+ * CACHED_DEV_IO_DISABLED might be set via sysfs interface, so check -+ * it here too. 
- */ -- while (!kthread_should_stop() && !dc->io_disable) { -+ while (!kthread_should_stop() && -+ !test_bit(CACHED_DEV_IO_DISABLED, &dc->flags)) { - q = bdev_get_queue(dc->bdev); - if (blk_queue_dying(q)) - dc->offline_seconds++; -@@ -904,8 +905,11 @@ static int cached_dev_status_update(void *arg) - BACKING_DEV_OFFLINE_TIMEOUT); - pr_err("%s: disable I/O request due to backing " - "device offline", dc->disk.name); -- dc->io_disable = true; -- /* let others know earlier that io_disable is true */ -+ set_bit(CACHED_DEV_IO_DISABLED, &dc->flags); -+ /* -+ * let others know earlier that CACHED_DEV_IO_DISABLED -+ * is set. -+ */ - smp_mb(); - bcache_device_stop(&dc->disk); - break; -@@ -929,7 +933,7 @@ int bch_cached_dev_run(struct cached_dev *dc) - NULL, - }; - -- if (dc->io_disable) { -+ if (test_bit(CACHED_DEV_IO_DISABLED, &dc->flags)) { - pr_err("I/O disabled on cached dev %s", - dc->backing_dev_name); - return -EIO; -@@ -1305,7 +1309,11 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) - q->backing_dev_info->ra_pages); - - atomic_set(&dc->io_errors, 0); -- dc->io_disable = false; -+ /* -+ * Clear following bit position in dc->flags -+ * - CACHED_DEV_IO_DISABLED -+ */ -+ dc->flags = 0; - dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; - /* default to auto */ - dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO; -@@ -1480,8 +1488,8 @@ bool bch_cached_dev_error(struct cached_dev *dc) - if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) - return false; - -- dc->io_disable = true; -- /* make others know io_disable is true earlier */ -+ set_bit(CACHED_DEV_IO_DISABLED, &dc->flags); -+ /* make others know CACHED_DEV_IO_DISABLED is set earlier */ - smp_mb(); - - pr_err("stop %s: too many IO errors on backing device %s\n", -@@ -1489,7 +1497,7 @@ bool bch_cached_dev_error(struct cached_dev *dc) - - /* - * If the cached device is still attached to a cache set, -- * even dc->io_disable is true and no more I/O requests -+ * even 
CACHED_DEV_IO_DISABLED is set and no more I/O requests - * accepted, cache device internal I/O (writeback scan or - * garbage collection) may still prevent bcache device from - * being stopped. So here CACHE_SET_IO_DISABLE should be -@@ -1672,11 +1680,11 @@ static void conditional_stop_bcache_device(struct cache_set *c, - * behavior may also introduce potential inconsistence - * data in writeback mode while cache is dirty. - * Therefore before calling bcache_device_stop() due -- * to a broken cache device, dc->io_disable should be -- * explicitly set to true. -+ * to a broken cache device, CACHED_DEV_IO_DISABLED should -+ * be explicitly set. - */ -- dc->io_disable = true; -- /* make others know io_disable is true earlier */ -+ set_bit(CACHED_DEV_IO_DISABLED, &dc->flags); -+ /* make others know CACHED_DEV_IO_DISABLED is set earlier */ - smp_mb(); - bcache_device_stop(d); - } else { -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index 4ab15442cab5..4bb1592270b1 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -180,7 +180,8 @@ SHOW(__bch_cached_dev) - wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0); - sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); - sysfs_printf(io_error_limit, "%i", dc->error_limit); -- sysfs_printf(io_disable, "%i", dc->io_disable); -+ sysfs_printf(io_disable, "%i", -+ (int)test_bit(CACHED_DEV_IO_DISABLED, &dc->flags)); - var_print(writeback_rate_update_seconds); - var_print(writeback_rate_i_term_inverse); - var_print(writeback_rate_p_term_inverse); -@@ -319,7 +320,11 @@ STORE(__cached_dev) - if (attr == &sysfs_io_disable) { - int v = strtoul_or_return(buf); - -- dc->io_disable = v ? 
1 : 0; -+ if (v > 0) -+ set_bit(CACHED_DEV_IO_DISABLED, &dc->flags); -+ else -+ clear_bit(CACHED_DEV_IO_DISABLED, &dc->flags); -+ return size; - } - - sysfs_strtoul_clamp(sequential_cutoff, --- -2.16.4 - diff --git a/for-test/0029-bcache-replace-system_wq-to-bcache_mod_wq.patch b/for-test/0029-bcache-replace-system_wq-to-bcache_mod_wq.patch deleted file mode 100644 index 4897e85..0000000 --- a/for-test/0029-bcache-replace-system_wq-to-bcache_mod_wq.patch +++ /dev/null @@ -1,104 +0,0 @@ -From b51fb8f54a265b7734d916016c20889a92ca0882 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 2 Jun 2019 18:55:09 +0800 -Subject: [PATCH 29/32] bcache: replace system_wq to bcache_mod_wq - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/bcache.h | 3 ++- - drivers/md/bcache/btree.c | 4 ++-- - drivers/md/bcache/journal.c | 2 +- - drivers/md/bcache/sysfs.c | 2 +- - drivers/md/bcache/writeback.c | 4 ++-- - 5 files changed, 8 insertions(+), 7 deletions(-) - -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index aae69060db7a..e7f0c42ab234 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -870,10 +870,11 @@ do { \ - for (b = (ca)->buckets + (ca)->sb.first_bucket; \ - b < (ca)->buckets + (ca)->sb.nbuckets; b++) - -+extern struct workqueue_struct *bcache_mod_wq; - static inline void cached_dev_put(struct cached_dev *dc) - { - if (refcount_dec_and_test(&dc->count)) -- schedule_work(&dc->detach); -+ queue_work(bcache_mod_wq, &dc->detach); - } - - static inline bool cached_dev_get(struct cached_dev *dc) -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index c0dd8fde37af..8325a2d11717 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ -366,7 +366,7 @@ static void __btree_node_write_done(struct closure *cl) - btree_complete_write(b, w); - - if (btree_node_dirty(b)) -- schedule_delayed_work(&b->work, 30 * HZ); -+ queue_delayed_work(bcache_mod_wq, &b->work, 30 * 
HZ); - - closure_return_with_destructor(cl, btree_node_write_unlock); - } -@@ -539,7 +539,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) - BUG_ON(!i->keys); - - if (!btree_node_dirty(b)) -- schedule_delayed_work(&b->work, 30 * HZ); -+ queue_delayed_work(bcache_mod_wq, &b->work, 30 * HZ); - - set_btree_node_dirty(b); - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index ef4142c623fe..646e0386de4a 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -887,7 +887,7 @@ atomic_t *bch_journal(struct cache_set *c, - journal_try_write(c); - } else if (!w->dirty) { - w->dirty = true; -- schedule_delayed_work(&c->journal.work, -+ queue_delayed_work(bcache_mod_wq, &c->journal.work, - msecs_to_jiffies(c->journal_delay_ms)); - spin_unlock(&c->journal.lock); - } else { -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index 4bb1592270b1..849146d539c9 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -447,7 +447,7 @@ STORE(bch_cached_dev) - - if (attr == &sysfs_writeback_percent) - if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -- schedule_delayed_work(&dc->writeback_rate_update, -+ queue_delayed_work(bcache_mod_wq, &dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - - mutex_unlock(&bch_register_lock); -diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c -index 73f0efac2b9f..54f68ae9d343 100644 ---- a/drivers/md/bcache/writeback.c -+++ b/drivers/md/bcache/writeback.c -@@ -212,7 +212,7 @@ static void update_writeback_rate(struct work_struct *work) - */ - if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && - !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { -- schedule_delayed_work(&dc->writeback_rate_update, -+ queue_delayed_work(bcache_mod_wq, &dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - } - -@@ -835,7 +835,7 @@ int bch_cached_dev_writeback_start(struct cached_dev 
*dc) - dc->writeback_running = true; - - WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); -- schedule_delayed_work(&dc->writeback_rate_update, -+ queue_delayed_work(bcache_mod_wq, &dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - - bch_writeback_queue(dc); --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v1/v1-0000-cover-letter.patch b/for-test/jouranl-deadlock/v1/v1-0000-cover-letter.patch deleted file mode 100644 index d2e85ad..0000000 --- a/for-test/jouranl-deadlock/v1/v1-0000-cover-letter.patch +++ /dev/null @@ -1,59 +0,0 @@ -From 60b326d839c8df0528c9567db590173a8d11060b Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 2 Mar 2019 21:22:28 +0800 -Subject: [RFC PATCH v1 0/6] bcache: fix journal no-space deadlock - -Hi folks, - -Sorry for taking such a long time to compose a fix patch set for bcache -journal deadlock issue since the first time it was reported. - -The initial issue was know as several kthreads or kworkers were reported -by kernel to hang for quite long time. The reason was a deadlock happened -when there is no more journal space avialable for new coming journal -request. - -Finally it turns out to be two conditions that the jouranl no-space -deadlock may happen, one is in jouranl replay time, one is in normal -journal runtime. - -During my testing, I find deadlock still exists with my fixes, after -a lot of effort other three related bugs are explored and fixed. - -Now the patch set is testing on two machines for 3+ hours, the journal -deadlock does not appear yet. In my previous testings, it may show up -within 30 minutes on my machine. (In order to make the jouranl space to -be more easier exhuasted, I change SB_JOURNAL_BUCKETS from 256U to 3U -both in kernel and bcache-tools code). - -More testings on more machines will start soon, and the patches are not -stable enough for production environment usage. Bbut I think it is time -to post the patch set for your review and comments. 
- -I will continue to improve the fixes, e.g. remove some BUG_ON() once -I am sure they won't happen indeed. If you may help to test the patch -set, that will be really cool. - -Hope we may make this patch set stable soon. - -Thanks in advance for your help. - -Coly Li ---- - -Coly Li (6): - bcache: acquire c->journal.lock in bch_btree_leaf_dirty() - bcache: move definition of 'int ret' out of macro read_bucket() - bcache: never set 0 to KEY_PTRS of jouranl key in journal_reclaim() - bcache: reload jouranl key information during journal replay - bcache: fix journal deadlock during jouranl replay - bcache: reserve space for journal_meta() in run time - - drivers/md/bcache/btree.c | 2 + - drivers/md/bcache/journal.c | 244 +++++++++++++++++++++++++++++++++++++++++--- - drivers/md/bcache/journal.h | 5 + - 3 files changed, 238 insertions(+), 13 deletions(-) - --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v1/v1-0001-bcache-acquire-c-journal.lock-in-bch_btree_leaf_d.patch b/for-test/jouranl-deadlock/v1/v1-0001-bcache-acquire-c-journal.lock-in-bch_btree_leaf_d.patch deleted file mode 100644 index 12c7da4..0000000 --- a/for-test/jouranl-deadlock/v1/v1-0001-bcache-acquire-c-journal.lock-in-bch_btree_leaf_d.patch +++ /dev/null @@ -1,52 +0,0 @@ -From 3c7e66546d18ead01bd821fa07f3ca2c73a9d964 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 2 Mar 2019 18:19:08 +0800 -Subject: [RFC PATCH v1 1/6] bcache: acquire c->journal.lock in - bch_btree_leaf_dirty() - -In bch_btree_leaf_dirty() when increase bcache journal pin counter, -current code uses atomic_inc(w->journal) directly. This is problematic -indeed, which may cause following code in journal.c:journal_reclaim() -not work properly, - 610 while (!atomic_read(&fifo_front(&c->journal.pin))) - 611 fifo_pop(&c->journal.pin, p); - -The above code piece is protected by spinlock c->journal.lock, and -the atomic counter w->journal in btree.c:bch_btree_leaf_dirty() is one -of the nodes from c->journal.pin. 
If the above while() loop just happens -to reach a fifo node which is w->journal in bch_btree_leaf_dirty(), -it is possible that the between line 610 and 611 the counter w->journal -is increased but poped off in journal_reclaim(). Then the journal jset -which w->journal referenced in bch_btree_leaf_dirty() gets lost. - -If system crashes or reboots before bkeys of the lost jset flushing back -to bcache btree node, journal_replay() after the reboot may complains -some journal entries lost and fail to register cache set. - -Such race condition is very rare to happen, I observe such issue when -I modify the journal buckets number to 3, which makes only a limited -number of jset being available. Then it is possible to observe journal -replay failure due to lost journal jset(s). - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/btree.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index 23cb1dc7296b..ac1b9159402e 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ -551,7 +551,9 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) - - if (!w->journal) { - w->journal = journal_ref; -+ spin_lock(&b->c->journal.lock); - atomic_inc(w->journal); -+ spin_unlock(&b->c->journal.lock); - } - } - --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v1/v1-0002-bcache-move-definition-of-int-ret-out-of-macro-re.patch b/for-test/jouranl-deadlock/v1/v1-0002-bcache-move-definition-of-int-ret-out-of-macro-re.patch deleted file mode 100644 index 4ddb4fa..0000000 --- a/for-test/jouranl-deadlock/v1/v1-0002-bcache-move-definition-of-int-ret-out-of-macro-re.patch +++ /dev/null @@ -1,50 +0,0 @@ -From da41d81e0abd211d2990d848cd33744ff335cd43 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 27 Feb 2019 18:10:48 +0800 -Subject: [RFC PATCH v1 2/6] bcache: move definition of 'int ret' out of macro - read_bucket() - -'int ret' is defined as a local 
variable inside macro read_bucket(). -Since this macro is called multiple times, and following patches will -use a 'int ret' variable in bch_journal_read(), this patch moves -definition of 'int ret' from macro read_bucket() to range of function -bch_journal_read(). - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index b2fd412715b1..6e18057d1d82 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -147,7 +147,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) - { - #define read_bucket(b) \ - ({ \ -- int ret = journal_read_bucket(ca, list, b); \ -+ ret = journal_read_bucket(ca, list, b); \ - __set_bit(b, bitmap); \ - if (ret < 0) \ - return ret; \ -@@ -156,6 +156,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) - - struct cache *ca; - unsigned int iter; -+ int ret = 0; - - for_each_cache(ca, c, iter) { - struct journal_device *ja = &ca->journal; -@@ -267,7 +268,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) - struct journal_replay, - list)->j.seq; - -- return 0; -+ return ret; - #undef read_bucket - } - --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v1/v1-0003-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch b/for-test/jouranl-deadlock/v1/v1-0003-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch deleted file mode 100644 index 1e1e476..0000000 --- a/for-test/jouranl-deadlock/v1/v1-0003-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch +++ /dev/null @@ -1,93 +0,0 @@ -From d8c81f7cdb63bc3a2b00a8a9e5e9b4783e42c702 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 27 Feb 2019 20:22:23 +0800 -Subject: [RFC PATCH v1 3/6] bcache: never set 0 to KEY_PTRS of jouranl key in - journal_reclaim() - -In journal_reclaim() ja->cur_idx of each cache will be update to -reclaim available 
journal buckets. Variable 'int n' is used to count how -many cache is successfully reclaimed, then n is set to c->journal.key -by SET_KEY_PTRS(). Later in journal_write_unlocked(), a for_each_cache() -loop will write the jset data onto each cache. - -The problem is, if all journal buckets on each cache is full, the -following code in journal_reclaim(), - -529 for_each_cache(ca, c, iter) { -530 struct journal_device *ja = &ca->journal; -531 unsigned int next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; -532 -533 /* No space available on this device */ -534 if (next == ja->discard_idx) -535 continue; -536 -537 ja->cur_idx = next; -538 k->ptr[n++] = MAKE_PTR(0, -539 bucket_to_sector(c, ca->sb.d[ja->cur_idx]), -540 ca->sb.nr_this_dev); -541 } -542 -543 bkey_init(k); -544 SET_KEY_PTRS(k, n); - -If there is no available bucket to reclaim, the if() condition at line -534 will always be true, and n remains 0. Then at line 544, SET_KEY_PTRS() -will set 0 to KEY_PTRS field of c->journal.key. - -Setting KEY_PTRS field of c->journal.key to 0 is wrong. Because in -journal_write_unlocked() the journal data is written in following loop, - -649 for (i = 0; i < KEY_PTRS(k); i++) { -650-671 submit journal data to cache device -672 } - -If KEY_PTRS field is set to 0 in journal_reclaim(), the journal data -won't be written to cache device here. If system crashed or rebooted -before bkeys of the lost journal entries written into btree nodes, data -corruption will be reported during bcache reload after rebooting the -system. - -Indeed there is only one cache in a cache set, there is no need to set -KEY_PTRS field in journal_reclaim() at all. But in order to keep the -for_each_cache() logic consistent for now, this patch fixes the above -problem by not setting 0 KEY_PTRS of journal key, if there is no bucket -available to reclaim. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 11 +++++++---- - 1 file changed, 7 insertions(+), 4 deletions(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 6e18057d1d82..5180bed911ef 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -541,11 +541,11 @@ static void journal_reclaim(struct cache_set *c) - ca->sb.nr_this_dev); - } - -- bkey_init(k); -- SET_KEY_PTRS(k, n); -- -- if (n) -+ if (n) { -+ bkey_init(k); -+ SET_KEY_PTRS(k, n); - c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; -+ } - out: - if (!journal_full(&c->journal)) - __closure_wake_up(&c->journal.wait); -@@ -672,6 +672,9 @@ static void journal_write_unlocked(struct closure *cl) - ca->journal.seq[ca->journal.cur_idx] = w->data->seq; - } - -+ /* If KEY_PTRS(k) == 0, this jset gets lost in air */ -+ BUG_ON(i == 0); -+ - atomic_dec_bug(&fifo_back(&c->journal.pin)); - bch_journal_next(&c->journal); - journal_reclaim(c); --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v1/v1-0004-bcache-reload-jouranl-key-information-during-jour.patch b/for-test/jouranl-deadlock/v1/v1-0004-bcache-reload-jouranl-key-information-during-jour.patch deleted file mode 100644 index 7877cfa..0000000 --- a/for-test/jouranl-deadlock/v1/v1-0004-bcache-reload-jouranl-key-information-during-jour.patch +++ /dev/null @@ -1,160 +0,0 @@ -From 1ff320546f894a6067c6a73bfaa937fca20308de Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 27 Feb 2019 20:32:22 +0800 -Subject: [RFC PATCH v1 4/6] bcache: reload jouranl key information during - journal replay - -When bcache journal initiates during running cache set, cache set -journal.blocks_free is initiated as 0. Then during journal replay if -journal_meta() is called and an empty jset is written to cache device, -journal_reclaim() is called. 
If there is available journal bucket to -reclaim, c->journal.blocks_free is set to numbers of blocks of a journal -bucket, which is c->sb.bucket_size >> c->block_bits. - -Most of time the above process works correctly, except the condition -when journal space is almost full. "Almost full" means there is no free -journal bucket, but there are still free blocks in last available -bucket indexed by ja->cur_idx. - -If system crashes or reboots when journal space is almost full, problem -comes. During cache set reload after the reboot, c->journal.blocks_free -is initialized as 0, when journal replay process writes bcache journal, -journal_reclaim() will be called to reclaim available journal bucket and -set c->journal.blocks_free to c->sb.bucket_size >> c->block_bits. But -there is no fully free bucket to reclaim in journal_reclaim(), so value -of c->journal.blocks_free will keep 0. If the first journal entry -processed by journal_replay() causes btree split and requires writing -journal space by journal_meta(), journal_meta() has to go into an -infinite loop to reclaim journal bucket, and blocks the whole cache set -to run. - -Such buggy situation can be solved if we do following things before -journal replay starts, -- Recover previous value of c->journal.blocks_free in last run time, - and set it to current c->journal.blocks_free as initial value. -- Recover previous value of ja->cur_idx in last run time, and set it to - KEY_PTR of current c->journal.key as initial value. - -After c->journal.blocks_free and c->journal.key are recovered, in -condition when journal space is almost full and cache set is reloaded, -meta journal entry from journal replay can be written into free blocks of -the last available journal bucket, then old journal entries can be -replayed and reclaimed for further journaling request. - -This patch adds bch_journal_key_reload() to recover journal blocks_free -and key ptr value for above purpose. 
bch_journal_key_reload() is called -in bch_journal_read() before replying journal by bch_journal_replay(). - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 87 +++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 87 insertions(+) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 5180bed911ef..a6deb16c15c8 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -143,6 +143,89 @@ reread: left = ca->sb.bucket_size - offset; - return ret; - } - -+static int bch_journal_key_reload(struct cache_set *c) -+{ -+ struct cache *ca; -+ unsigned int iter, n = 0; -+ struct bkey *k = &c->journal.key; -+ int ret = 0; -+ -+ for_each_cache(ca, c, iter) { -+ struct journal_device *ja = &ca->journal; -+ struct bio *bio = &ja->bio; -+ struct jset *j, *data = c->journal.w[0].data; -+ struct closure cl; -+ unsigned int len, left; -+ unsigned int offset = 0, used_blocks = 0; -+ sector_t bucket = bucket_to_sector(c, ca->sb.d[ja->cur_idx]); -+ -+ closure_init_stack(&cl); -+ -+ while (offset < ca->sb.bucket_size) { -+reread: left = ca->sb.bucket_size - offset; -+ len = min_t(unsigned int, -+ left, PAGE_SECTORS << JSET_BITS); -+ -+ bio_reset(bio); -+ bio->bi_iter.bi_sector = bucket + offset; -+ bio_set_dev(bio, ca->bdev); -+ bio->bi_iter.bi_size = len << 9; -+ -+ bio->bi_end_io = journal_read_endio; -+ bio->bi_private = &cl; -+ bio_set_op_attrs(bio, REQ_OP_READ, 0); -+ bch_bio_map(bio, data); -+ -+ closure_bio_submit(c, bio, &cl); -+ closure_sync(&cl); -+ -+ j = data; -+ while (len) { -+ size_t blocks, bytes = set_bytes(j); -+ -+ if (j->magic != jset_magic(&ca->sb)) -+ goto out; -+ -+ if (bytes > left << 9 || -+ bytes > PAGE_SIZE << JSET_BITS) { -+ pr_err("jset may be correpted: too big"); -+ ret = -EIO; -+ goto err; -+ } -+ -+ if (bytes > len << 9) -+ goto reread; -+ -+ if (j->csum != csum_set(j)) { -+ pr_err("jset may be corrupted: bad csum"); -+ ret = -EIO; -+ goto err; -+ } -+ -+ blocks = 
set_blocks(j, block_bytes(c)); -+ used_blocks += blocks; -+ -+ offset += blocks * ca->sb.block_size; -+ len -= blocks * ca->sb.block_size; -+ j = ((void *) j) + blocks * block_bytes(ca); -+ } -+ } -+out: -+ c->journal.blocks_free = -+ (c->sb.bucket_size >> c->block_bits) - -+ used_blocks; -+ -+ k->ptr[n++] = MAKE_PTR(0, bucket, ca->sb.nr_this_dev); -+ } -+ -+ BUG_ON(n == 0); -+ bkey_init(k); -+ SET_KEY_PTRS(k, n); -+ -+err: -+ return ret; -+} -+ - int bch_journal_read(struct cache_set *c, struct list_head *list) - { - #define read_bucket(b) \ -@@ -268,6 +351,10 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) - struct journal_replay, - list)->j.seq; - -+ /* Initial value of c->journal.blocks_free should be 0 */ -+ BUG_ON(c->journal.blocks_free != 0); -+ ret = bch_journal_key_reload(c); -+ - return ret; - #undef read_bucket - } --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v1/v1-0005-bcache-fix-journal-deadlock-during-jouranl-replay.patch b/for-test/jouranl-deadlock/v1/v1-0005-bcache-fix-journal-deadlock-during-jouranl-replay.patch deleted file mode 100644 index ba2a763..0000000 --- a/for-test/jouranl-deadlock/v1/v1-0005-bcache-fix-journal-deadlock-during-jouranl-replay.patch +++ /dev/null @@ -1,275 +0,0 @@ -From ee8cbff3518dcaf67c16cff0cefe2a4424573bff Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 27 Feb 2019 20:35:02 +0800 -Subject: [RFC PATCH v1 5/6] bcache: fix journal deadlock during jouranl replay - -A deadlock of bcache jouranling may happen during journal replay. Such -deadlock happens when, -- Journal space is totally full (no any free blocks) and system crashes - or reboots. -- After reboot, the first journal entry handled by jouranl replay causes - btree split and jouranl_meta() is called to write an empty jset to - journal space. -- There is no journal space to write and journal_reclaim() fails to get - any available bucket because this is the first replayed journal entry - to be blocked. 
-Then the whole cache set is blocked from running. - -This patch is an effort to fix such journal replay deadlock in a simpler -way, -- Add a bool variable 'in_replay' in struct journal, set it to true when - journal replay starts, and set it to false when journal replay - completes. in_replay is initialized to be false. -- Reserve 6 sectors in journal bucket, do not use them in normal bcache - runtime. These sectors are only permitted to be used during journal - replay (when c->journal.in_replay is true) - -Then in normal bcache runtime, journal space won't be totally full and -6 sectors are always reserved for journal replay time. After -system reboots, if bch_btree_insert() in bch_journal_replay() causes -btree split and bch_journal_meta() gets called to require 1 sector -from journal buckets to write an empty jset, there is enough reserved -space to serve. - -The reason to reserve 6 sectors is, we should choose a number that won't -fit into a bucket size. If the reserved space happens to be a whole -bucket, more logic has to be added in journal_replay() to handle -journal.blocks_free with reserved spaces in journal replay time. This is -why 6 sectors is chosen, it is 3KB and won't be any proper block size -or bucket size. - -The bcache btree node size is quite large, so btree node split won't be -a frequent event. And when btree node split happens, newly added keys will -be inserted directly into upper level or neighbor nodes and won't go into -journal again, only bch_journal_meta() is called to write jset metadata -which occupies 1 block in journal space. If blocksize is set to 4K size, -reserve 6 sectors indeed is 2 blocks, so two consecutive -btree splits can happen during journal replay, this is very very rare in -practice. As default blocksize is set to sector size, that equals to -6 blocks reserved. 
Contiously splitting the btree for 6 times in journal -replay is almost impossible, so the reserved space seems to be enough -in my humble opinion. - -If in future the reserved space turns out to be not enough, let's extend -it then. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 100 ++++++++++++++++++++++++++++++++++++++++---- - drivers/md/bcache/journal.h | 4 ++ - 2 files changed, 97 insertions(+), 7 deletions(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index a6deb16c15c8..c60a702f53a9 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -415,6 +415,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) - uint64_t start = i->j.last_seq, end = i->j.seq, n = start; - struct keylist keylist; - -+ s->journal.in_replay = true; -+ - list_for_each_entry(i, list, list) { - BUG_ON(i->pin && atomic_read(i->pin) != 1); - -@@ -448,6 +450,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) - pr_info("journal replay done, %i keys in %i entries, seq %llu", - keys, entries, end); - err: -+ s->journal.in_replay = false; - while (!list_empty(list)) { - i = list_first_entry(list, struct journal_replay, list); - list_del(&i->list); -@@ -577,6 +580,22 @@ static void do_journal_discard(struct cache *ca) - } - } - -+static inline bool last_available_journal_bucket(struct cache_set *c) -+{ -+ struct cache *ca; -+ unsigned int iter; -+ struct journal_device *ja; -+ -+ for_each_cache(ca, c, iter) { -+ ja = &ca->journal; -+ if (unlikely((ja->cur_idx + 1) % ca->sb.njournal_buckets == -+ ja->last_idx)) -+ return true; -+ } -+ -+ return false; -+} -+ - static void journal_reclaim(struct cache_set *c) - { - struct bkey *k = &c->journal.key; -@@ -584,6 +603,7 @@ static void journal_reclaim(struct cache_set *c) - uint64_t last_seq; - unsigned int iter, n = 0; - atomic_t p __maybe_unused; -+ bool last, do_wakeup = false; - - atomic_long_inc(&c->reclaim); - -@@ 
-606,8 +626,13 @@ static void journal_reclaim(struct cache_set *c) - for_each_cache(ca, c, iter) - do_journal_discard(ca); - -- if (c->journal.blocks_free) -+ last = last_available_journal_bucket(c); -+ if ((!last && c->journal.blocks_free) || -+ (last && (c->journal.blocks_free * c->sb.block_size) > -+ BCH_JOURNAL_RPLY_RESERVE)) { -+ do_wakeup = true; - goto out; -+ } - - /* - * Allocate: -@@ -632,9 +657,10 @@ static void journal_reclaim(struct cache_set *c) - bkey_init(k); - SET_KEY_PTRS(k, n); - c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; -+ do_wakeup = true; - } - out: -- if (!journal_full(&c->journal)) -+ if (do_wakeup && !journal_full(&c->journal)) - __closure_wake_up(&c->journal.wait); - } - -@@ -692,6 +718,21 @@ static void journal_write_unlock(struct closure *cl) - spin_unlock(&c->journal.lock); - } - -+static bool should_reclaim(struct cache_set *c, -+ struct journal_write *w) -+{ -+ if (unlikely(journal_full(&c->journal))) -+ return true; -+ -+ if (unlikely(last_available_journal_bucket(c) && -+ (!c->journal.in_replay) && -+ (c->journal.blocks_free * c->sb.block_size <= -+ BCH_JOURNAL_RPLY_RESERVE))) -+ return true; -+ -+ return false; -+} -+ - static void journal_write_unlocked(struct closure *cl) - __releases(c->journal.lock) - { -@@ -710,7 +751,7 @@ static void journal_write_unlocked(struct closure *cl) - if (!w->need_write) { - closure_return_with_destructor(cl, journal_write_unlock); - return; -- } else if (journal_full(&c->journal)) { -+ } else if (should_reclaim(c, w)) { - journal_reclaim(c); - spin_unlock(&c->journal.lock); - -@@ -798,6 +839,52 @@ static void journal_try_write(struct cache_set *c) - } - } - -+static bool no_journal_wait(struct cache_set *c, -+ size_t sectors) -+{ -+ bool last = last_available_journal_bucket(c); -+ size_t reserved_sectors = 0; -+ size_t n = min_t(size_t, -+ c->journal.blocks_free * c->sb.block_size, -+ PAGE_SECTORS << JSET_BITS); -+ -+ if (last && !c->journal.in_replay) -+ reserved_sectors = 
BCH_JOURNAL_RPLY_RESERVE; -+ -+ if (sectors <= (n - reserved_sectors)) -+ return true; -+ -+ return false; -+} -+ -+static bool should_try_write(struct cache_set *c, -+ struct journal_write *w) -+{ -+ size_t reserved_sectors, n, sectors; -+ -+ if (journal_full(&c->journal)) -+ return false; -+ -+ if (!last_available_journal_bucket(c)) -+ return true; -+ -+ /* the check in no_journal_wait exceeds BCH_JOURNAL_RPLY_RESERVE */ -+ if (w->data->keys == 0) -+ return false; -+ -+ reserved_sectors = BCH_JOURNAL_RPLY_RESERVE; -+ n = min_t(size_t, -+ (c->journal.blocks_free * c->sb.block_size), -+ PAGE_SECTORS << JSET_BITS); -+ sectors = __set_blocks(w->data, w->data->keys, -+ block_bytes(c)) * c->sb.block_size; -+ if (sectors <= (n - reserved_sectors)) -+ return true; -+ -+ return false; -+} -+ -+ - static struct journal_write *journal_wait_for_write(struct cache_set *c, - unsigned int nkeys) - __acquires(&c->journal.lock) -@@ -816,15 +903,13 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, - sectors = __set_blocks(w->data, w->data->keys + nkeys, - block_bytes(c)) * c->sb.block_size; - -- if (sectors <= min_t(size_t, -- c->journal.blocks_free * c->sb.block_size, -- PAGE_SECTORS << JSET_BITS)) -+ if (no_journal_wait(c, sectors)) - return w; - - if (wait) - closure_wait(&c->journal.wait, &cl); - -- if (!journal_full(&c->journal)) { -+ if (should_try_write(c, w)) { - if (wait) - trace_bcache_journal_entry_full(c); - -@@ -933,6 +1018,7 @@ int bch_journal_alloc(struct cache_set *c) - INIT_DELAYED_WORK(&j->work, journal_write_work); - - c->journal_delay_ms = 100; -+ j->in_replay = false; - - j->w[0].c = c; - j->w[1].c = c; -diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h -index 66f0facff84b..54408e248a39 100644 ---- a/drivers/md/bcache/journal.h -+++ b/drivers/md/bcache/journal.h -@@ -108,6 +108,7 @@ struct journal { - struct closure io; - int io_in_flight; - struct delayed_work work; -+ bool in_replay; - - /* Number of blocks free 
in the bucket(s) we're currently writing to */ - unsigned int blocks_free; -@@ -159,6 +160,9 @@ struct journal_device { - - #define JOURNAL_PIN 20000 - -+/* Reserved jouranl space in sectors */ -+#define BCH_JOURNAL_RPLY_RESERVE 6U -+ - #define journal_full(j) \ - (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) - --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v1/v1-0006-bcache-reserve-space-for-journal_meta-in-run-time.patch b/for-test/jouranl-deadlock/v1/v1-0006-bcache-reserve-space-for-journal_meta-in-run-time.patch deleted file mode 100644 index 04d0a83..0000000 --- a/for-test/jouranl-deadlock/v1/v1-0006-bcache-reserve-space-for-journal_meta-in-run-time.patch +++ /dev/null @@ -1,241 +0,0 @@ -From 60b326d839c8df0528c9567db590173a8d11060b Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Thu, 28 Feb 2019 20:29:00 +0800 -Subject: [RFC PATCH v1 6/6] bcache: reserve space for journal_meta() in run - time - -Another journal deadlock of bcache jouranling can happen in normal -bcache runtime. It is very rare to happen but there are people report -bkey insert work queue blocked which caused by such deadlock. - -This is how such jouranling deadlock in runtime happens, -- Journal space is totally full and no free space to reclaim, jouranling - tasks waiting for space to write in journal_wait_for_write(). -- In order to have free journal space, btree_flush_write() is called to - flush earlest journaled in-memory btree key into btree node. Then all - journaled bkey in early used journal buckets are flushed to on-disk - btree, this journal bucket can be reclaimed for new coming jouranl - request. -- But if the earlest jouranled bkey causes a btree node split during - insert it into btree node, finally journal_meta() will be called to - journal btree root (and other information) into the journal space. -- Unfortunately the journal space is full, and the jouranl entries has - to be flushed in linear turn. 
So bch_journal_meta() from bkey insert - is blocked too. -Then journaling deadlock during bcache run time happens. - -A method to fix such deadlock is to reserve some journal space too. The -reserved space can only be used when, -- Current journal bucket is the last journal bucket which has available - space to write into. -- When calling bch_journal(), current jset is empty and there is no key - in the inserting key list. This means the journal request is from - bch_journal_meta() and no non-reserved space can be used. - -Then if such journaling request is from bch_journal_meta() of inserting -the earliest journaled bkey back into btree, the deadlock condition won't -happen any more because the reserved space can be used for such -scenario. - -Since there are already 6 sectors reserved for journal replay, here we -reserve 7 sectors for runtime meta journal from btree split caused by -flushing journal entries back to btree node. Depending on block size from -1 sector to 4KB, the reserved space can serve from 7 to 2 journal -blocks. Indeed only one journal block reserved for such journal deadlock -scenario is enough, 2 continuous btree splits caused by two adjoining bkeys -flushing from journal is very very rare to happen. So reserving 7 sectors -should work. - -Another reason for reserving 7 sectors is, there are already 6 sectors -reserved for journal replay, so in total there are 13 sectors reserved in -last available journal bucket. 13 sectors won't be a proper bucket size, -so we don't need to add more code to handle journal.blocks_free -initialization for whole reserved journal bucket. Even though such code logic -is simple, less code is better in my humble opinion. - -Again, if in future the reserved space turns out to be not enough, let's -extend it then. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 89 +++++++++++++++++++++++++++++++++------------ - drivers/md/bcache/journal.h | 1 + - 2 files changed, 66 insertions(+), 24 deletions(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index c60a702f53a9..6aa68ab7cd78 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -629,7 +629,7 @@ static void journal_reclaim(struct cache_set *c) - last = last_available_journal_bucket(c); - if ((!last && c->journal.blocks_free) || - (last && (c->journal.blocks_free * c->sb.block_size) > -- BCH_JOURNAL_RPLY_RESERVE)) { -+ (BCH_JOURNAL_RESERVE + BCH_JOURNAL_RPLY_RESERVE))) { - do_wakeup = true; - goto out; - } -@@ -718,18 +718,27 @@ static void journal_write_unlock(struct closure *cl) - spin_unlock(&c->journal.lock); - } - --static bool should_reclaim(struct cache_set *c, -- struct journal_write *w) -+static inline bool should_reclaim(struct cache_set *c, -+ struct journal_write *w) - { -- if (unlikely(journal_full(&c->journal))) -- return true; -+ bool last = last_available_journal_bucket(c); - -- if (unlikely(last_available_journal_bucket(c) && -- (!c->journal.in_replay) && -- (c->journal.blocks_free * c->sb.block_size <= -- BCH_JOURNAL_RPLY_RESERVE))) -+ if (!last && journal_full(&c->journal)) - return true; - -+ if (unlikely(last)) { -+ size_t n = c->journal.blocks_free * c->sb.block_size; -+ -+ if (!c->journal.in_replay) { -+ if (n <= BCH_JOURNAL_RESERVE + -+ BCH_JOURNAL_RPLY_RESERVE) -+ return true; -+ } else { -+ if (n <= BCH_JOURNAL_RPLY_RESERVE) -+ return true; -+ } -+ } -+ - return false; - } - -@@ -751,7 +760,9 @@ static void journal_write_unlocked(struct closure *cl) - if (!w->need_write) { - closure_return_with_destructor(cl, journal_write_unlock); - return; -- } else if (should_reclaim(c, w)) { -+ } -+ -+ if (should_reclaim(c, w)) { - journal_reclaim(c); - spin_unlock(&c->journal.lock); - -@@ -840,16 +851,26 @@ static void 
journal_try_write(struct cache_set *c) - } - - static bool no_journal_wait(struct cache_set *c, -- size_t sectors) -+ size_t sectors, -+ int nkeys) - { -+ bool is_journal_meta = (nkeys == 0) ? true : false; - bool last = last_available_journal_bucket(c); - size_t reserved_sectors = 0; -- size_t n = min_t(size_t, -- c->journal.blocks_free * c->sb.block_size, -- PAGE_SECTORS << JSET_BITS); -+ size_t n; -+ -+ if (unlikely(last)) { -+ if (!is_journal_meta) -+ reserved_sectors = BCH_JOURNAL_RESERVE + -+ BCH_JOURNAL_RPLY_RESERVE; -+ else -+ reserved_sectors = (!c->journal.in_replay) ? -+ BCH_JOURNAL_RPLY_RESERVE : 0; -+ } - -- if (last && !c->journal.in_replay) -- reserved_sectors = BCH_JOURNAL_RPLY_RESERVE; -+ n = min_t(size_t, -+ c->journal.blocks_free * c->sb.block_size, -+ PAGE_SECTORS << JSET_BITS); - - if (sectors <= (n - reserved_sectors)) - return true; -@@ -858,26 +879,46 @@ static bool no_journal_wait(struct cache_set *c, - } - - static bool should_try_write(struct cache_set *c, -- struct journal_write *w) -+ struct journal_write *w, -+ int nkeys) - { - size_t reserved_sectors, n, sectors; -+ bool last, empty_jset; - - if (journal_full(&c->journal)) - return false; - -- if (!last_available_journal_bucket(c)) -+ last = last_available_journal_bucket(c); -+ empty_jset = (w->data->keys == 0) ? true : false; -+ -+ if (!last) { -+ /* -+ * Not last available journal bucket, no reserved journal -+ * space restriction, an empty jset should not be here. -+ */ -+ BUG_ON(empty_jset); - return true; -+ } - -- /* the check in no_journal_wait exceeds BCH_JOURNAL_RPLY_RESERVE */ -- if (w->data->keys == 0) -+ if (empty_jset) { -+ /* -+ * If nkeys is 0 it means the journaling request is for meta -+ * data, which should be returned in journal_wait_for_write() -+ * by checking no_journal_wait(), and won't get here. 
-+ */ -+ BUG_ON(nkeys == 0); - return false; -+ } - -- reserved_sectors = BCH_JOURNAL_RPLY_RESERVE; -+ reserved_sectors = BCH_JOURNAL_RESERVE + -+ BCH_JOURNAL_RPLY_RESERVE; - n = min_t(size_t, - (c->journal.blocks_free * c->sb.block_size), - PAGE_SECTORS << JSET_BITS); -- sectors = __set_blocks(w->data, w->data->keys, -+ sectors = __set_blocks(w->data, -+ w->data->keys, - block_bytes(c)) * c->sb.block_size; -+ - if (sectors <= (n - reserved_sectors)) - return true; - -@@ -903,13 +944,13 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, - sectors = __set_blocks(w->data, w->data->keys + nkeys, - block_bytes(c)) * c->sb.block_size; - -- if (no_journal_wait(c, sectors)) -+ if (no_journal_wait(c, sectors, nkeys)) - return w; - - if (wait) - closure_wait(&c->journal.wait, &cl); - -- if (should_try_write(c, w)) { -+ if (should_try_write(c, w, nkeys)) { - if (wait) - trace_bcache_journal_entry_full(c); - -diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h -index 54408e248a39..55f81443f304 100644 ---- a/drivers/md/bcache/journal.h -+++ b/drivers/md/bcache/journal.h -@@ -162,6 +162,7 @@ struct journal_device { - - /* Reserved jouranl space in sectors */ - #define BCH_JOURNAL_RPLY_RESERVE 6U -+#define BCH_JOURNAL_RESERVE 7U - - #define journal_full(j) \ - (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0000-cover-letter.patch b/for-test/jouranl-deadlock/v2/v2-0000-cover-letter.patch deleted file mode 100644 index 19d3c21..0000000 --- a/for-test/jouranl-deadlock/v2/v2-0000-cover-letter.patch +++ /dev/null @@ -1,87 +0,0 @@ -From 24539bb78565d784ddabb81f24968c13835eb000 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Fri, 19 Apr 2019 00:37:27 +0800 -Subject: [RFC PATCH v2 00/16] bcache: fix journal no-space deadlock - -The initial journal no-space deadlock issue was known as several -kthreads or kworkers were reported by kernel to hang for quite long -time. 
The reason was a deadlock that happened when there is no more journal -space available for new coming journal request. - -In v1 RFC series, I thought the journal no-space deadlock was from two -conditions, which was not the truth. After long time testing and -debugging, I realize the journal deadlock was a result of a series of -problems hidden in current code. - -Now I make progress in v2 series, and all known problems related to the -journal no-space deadlock are fixed. I don't observe journal deadlock -and related I/O hang warning any more. - -Unfortunately we can not apply this whole series at this moment, because -after fixing the journal no-space deadlock issue, I find a race in dirty -btree node flushing. Besides normal dirty btree node flushing, when there -is no journal space, btree_flush_write() will be called to write down -the oldest dirty btree node. Once the oldest dirty btree node is written -from memory into cache device, its associated journal reference will be -released, this operation is necessary to reclaim oldest busy journal -bucket when there is no space in journal buckets. - -The problem of this race is, when building c->flush_btree heap, all -dirty btree nodes from for_each_cached_btree() are not protected or -referenced, so there is a race that after the heap c->flush_btree is -built and before the oldest node is selected from the heap, the oldest -node is already written in normal code path, and the memory is -released/reused. - -From my testing, a kernel panic triggered by wild pointer dereference or -un-paired mutex_lock/unlock can be observed from btree_flush_write(), -this is because the selected btree node was written and released -already, btree_flush_write() just references invalid memory object. - -So far I don't have a good idea to fix such race without hurting I/O -performance, and IMHO the bcache I/O hang by journal is kind of better -than kernel panic. 
Therefore before the race of dirty btree nodes -writing gets fixed, I won't apply the whole series. - -But there are still some helpful and non-major fixes which can go into -upstream, to reduce the whole patch set and avoid huge changes in a -single kernel merge. - -The patch 'bcache: acquire c->journal.lock in bch_btree_leaf_dirty()' in -v1 series was removed from v2 series. I still feel this is a problem to -access journal fifo without any protection, but this fix is limited and -I need to think about a more thoughtful way to fix. - -Any review comments or suggestions are warmly welcome. - -Thanks in advance for your help. - -Coly Li ---- - -Coly Li (16): - bcache: move definition of 'int ret' out of macro read_bucket() - bcache: never set 0 to KEY_PTRS of jouranl key in journal_reclaim() - bcache: reload jouranl key information during journal replay - bcache: fix journal deadlock during jouranl replay - bcache: reserve space for journal_meta() in run time - bcache: add failure check to run_cache_set() for journal replay - bcache: add comments for kobj release callback routine - bcache: return error immediately in bch_journal_replay() - bcache: add error check for calling register_bdev() - bcache: Add comments for blkdev_put() in registration code path - bcache: add comments for closure_fn to be called in closure_queue() - bcache: add pendings_cleanup to stop pending bcache device - bcache: fix fifo index swapping condition in btree_flush_write() - bcache: try to flush btree nodes as many as possible - bcache: improve bcache_reboot() - bcache: introduce spinlock_t flush_write_lock in struct journal - - drivers/md/bcache/journal.c | 312 ++++++++++++++++++++++++++++++++++++++++---- - drivers/md/bcache/journal.h | 8 +- - drivers/md/bcache/super.c | 112 ++++++++++++++-- - 3 files changed, 393 insertions(+), 39 deletions(-) - --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0001-bcache-move-definition-of-int-ret-out-of-macro-re.patch 
b/for-test/jouranl-deadlock/v2/v2-0001-bcache-move-definition-of-int-ret-out-of-macro-re.patch deleted file mode 100644 index 6f5e2da..0000000 --- a/for-test/jouranl-deadlock/v2/v2-0001-bcache-move-definition-of-int-ret-out-of-macro-re.patch +++ /dev/null @@ -1,50 +0,0 @@ -From b6bbfb503e206f65196dc44c7f3ca7f77458b8e0 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 27 Feb 2019 18:10:48 +0800 -Subject: [RFC PATCH v2 01/16] bcache: move definition of 'int ret' out of - macro read_bucket() - -'int ret' is defined as a local variable inside macro read_bucket(). -Since this macro is called multiple times, and following patches will -use a 'int ret' variable in bch_journal_read(), this patch moves -definition of 'int ret' from macro read_bucket() to range of function -bch_journal_read(). - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index b2fd412715b1..6e18057d1d82 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -147,7 +147,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) - { - #define read_bucket(b) \ - ({ \ -- int ret = journal_read_bucket(ca, list, b); \ -+ ret = journal_read_bucket(ca, list, b); \ - __set_bit(b, bitmap); \ - if (ret < 0) \ - return ret; \ -@@ -156,6 +156,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) - - struct cache *ca; - unsigned int iter; -+ int ret = 0; - - for_each_cache(ca, c, iter) { - struct journal_device *ja = &ca->journal; -@@ -267,7 +268,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) - struct journal_replay, - list)->j.seq; - -- return 0; -+ return ret; - #undef read_bucket - } - --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0002-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch 
b/for-test/jouranl-deadlock/v2/v2-0002-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch deleted file mode 100644 index fcb490d..0000000 --- a/for-test/jouranl-deadlock/v2/v2-0002-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch +++ /dev/null @@ -1,94 +0,0 @@ -From dc171a41dbbac4a43cd9503a18c92c7a31185ac7 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 27 Feb 2019 20:22:23 +0800 -Subject: [RFC PATCH v2 02/16] bcache: never set 0 to KEY_PTRS of jouranl key - in journal_reclaim() - -In journal_reclaim() ja->cur_idx of each cache will be update to -reclaim available journal buckets. Variable 'int n' is used to count how -many cache is successfully reclaimed, then n is set to c->journal.key -by SET_KEY_PTRS(). Later in journal_write_unlocked(), a for_each_cache() -loop will write the jset data onto each cache. - -The problem is, if all jouranl buckets on each cache is full, the -following code in journal_reclaim(), - -529 for_each_cache(ca, c, iter) { -530 struct journal_device *ja = &ca->journal; -531 unsigned int next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; -532 -533 /* No space available on this device */ -534 if (next == ja->discard_idx) -535 continue; -536 -537 ja->cur_idx = next; -538 k->ptr[n++] = MAKE_PTR(0, -539 bucket_to_sector(c, ca->sb.d[ja->cur_idx]), -540 ca->sb.nr_this_dev); -541 } -542 -543 bkey_init(k); -544 SET_KEY_PTRS(k, n); - -If there is no available bucket to reclaim, the if() condition at line -534 will always true, and n remains 0. Then at line 544, SET_KEY_PTRS() -will set KEY_PTRS field of c->journal.key to 0. - -Setting KEY_PTRS field of c->journal.key to 0 is wrong. Because in -journal_write_unlocked() the journal data is written in following loop, - -649 for (i = 0; i < KEY_PTRS(k); i++) { -650-671 submit journal data to cache device -672 } - -If KEY_PTRS field is set to 0 in jouranl_reclaim(), the journal data -won't be written to cache device here. 
If system crahed or rebooted -before bkeys of the lost journal entries written into btree nodes, data -corruption will be reported during bcache reload after rebooting the -system. - -Indeed there is only one cache in a cache set, there is no need to set -KEY_PTRS field in journal_reclaim() at all. But in order to keep the -for_each_cache() logic consistent for now, this patch fixes the above -problem by not setting 0 KEY_PTRS of journal key, if there is no bucket -available to reclaim. - -Cc: stable@vger.kernel.org -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 11 +++++++---- - 1 file changed, 7 insertions(+), 4 deletions(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 6e18057d1d82..5180bed911ef 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -541,11 +541,11 @@ static void journal_reclaim(struct cache_set *c) - ca->sb.nr_this_dev); - } - -- bkey_init(k); -- SET_KEY_PTRS(k, n); -- -- if (n) -+ if (n) { -+ bkey_init(k); -+ SET_KEY_PTRS(k, n); - c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; -+ } - out: - if (!journal_full(&c->journal)) - __closure_wake_up(&c->journal.wait); -@@ -672,6 +672,9 @@ static void journal_write_unlocked(struct closure *cl) - ca->journal.seq[ca->journal.cur_idx] = w->data->seq; - } - -+ /* If KEY_PTRS(k) == 0, this jset gets lost in air */ -+ BUG_ON(i == 0); -+ - atomic_dec_bug(&fifo_back(&c->journal.pin)); - bch_journal_next(&c->journal); - journal_reclaim(c); --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch b/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch deleted file mode 100644 index cfe5323..0000000 --- a/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch +++ /dev/null @@ -1,161 +0,0 @@ -From e3c194808a99446e9bf69ac0707c7d3f473be518 Mon Sep 17 00:00:00 2001 -From: Coly 
Li <colyli@suse.de> -Date: Wed, 27 Feb 2019 20:32:22 +0800 -Subject: [RFC PATCH v2 03/16] bcache: reload jouranl key information during - journal replay - -When bcache journal initiates during running cache set, cache set -journal.blocks_free is initiated as 0. Then during journal replay if -journal_meta() is called and an empty jset is written to cache device, -journal_reclaim() is called. If there is an available journal bucket to -reclaim, c->journal.blocks_free is set to the number of blocks of a journal -bucket, which is c->sb.bucket_size >> c->block_bits. - -Most of the time the above process works correctly, except in the condition -when journal space is almost full. "Almost full" means there is no free -journal bucket, but there are still free blocks in the last available -bucket indexed by ja->cur_idx. - -If the system crashes or reboots when journal space is almost full, a problem -comes. During cache set reload after the reboot, c->journal.blocks_free -is initialized as 0, when the journal replay process writes bcache journal, -journal_reclaim() will be called to reclaim an available journal bucket and -set c->journal.blocks_free to c->sb.bucket_size >> c->block_bits. But -there is no fully free bucket to reclaim in journal_reclaim(), so the value -of c->journal.blocks_free will stay 0. If the first journal entry -processed by journal_replay() causes btree split and requires writing -journal space by journal_meta(), journal_meta() has to go into an -infinite loop to reclaim a journal bucket, and blocks the whole cache set -from running. - -Such buggy situation can be solved if we do the following things before -journal replay starts, -- Recover previous value of c->journal.blocks_free in last run time, - and set it to current c->journal.blocks_free as initial value. -- Recover previous value of ja->cur_idx in last run time, and set it to - KEY_PTR of current c->journal.key as initial value. 
- -After c->journal.blocks_free and c->journal.key are recovered, in -condition when jouranl space is almost full and cache set is reloaded, -meta journal entry from journal reply can be written into free blocks of -the last available journal bucket, then old jouranl entries can be -replayed and reclaimed for further journaling request. - -This patch adds bch_journal_key_reload() to recover journal blocks_free -and key ptr value for above purpose. bch_journal_key_reload() is called -in bch_journal_read() before replying journal by bch_journal_replay(). - -Cc: stable@vger.kernel.org -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 87 +++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 87 insertions(+) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 5180bed911ef..a6deb16c15c8 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -143,6 +143,89 @@ reread: left = ca->sb.bucket_size - offset; - return ret; - } - -+static int bch_journal_key_reload(struct cache_set *c) -+{ -+ struct cache *ca; -+ unsigned int iter, n = 0; -+ struct bkey *k = &c->journal.key; -+ int ret = 0; -+ -+ for_each_cache(ca, c, iter) { -+ struct journal_device *ja = &ca->journal; -+ struct bio *bio = &ja->bio; -+ struct jset *j, *data = c->journal.w[0].data; -+ struct closure cl; -+ unsigned int len, left; -+ unsigned int offset = 0, used_blocks = 0; -+ sector_t bucket = bucket_to_sector(c, ca->sb.d[ja->cur_idx]); -+ -+ closure_init_stack(&cl); -+ -+ while (offset < ca->sb.bucket_size) { -+reread: left = ca->sb.bucket_size - offset; -+ len = min_t(unsigned int, -+ left, PAGE_SECTORS << JSET_BITS); -+ -+ bio_reset(bio); -+ bio->bi_iter.bi_sector = bucket + offset; -+ bio_set_dev(bio, ca->bdev); -+ bio->bi_iter.bi_size = len << 9; -+ -+ bio->bi_end_io = journal_read_endio; -+ bio->bi_private = &cl; -+ bio_set_op_attrs(bio, REQ_OP_READ, 0); -+ bch_bio_map(bio, data); -+ -+ closure_bio_submit(c, bio, 
&cl); -+ closure_sync(&cl); -+ -+ j = data; -+ while (len) { -+ size_t blocks, bytes = set_bytes(j); -+ -+ if (j->magic != jset_magic(&ca->sb)) -+ goto out; -+ -+ if (bytes > left << 9 || -+ bytes > PAGE_SIZE << JSET_BITS) { -+ pr_err("jset may be correpted: too big"); -+ ret = -EIO; -+ goto err; -+ } -+ -+ if (bytes > len << 9) -+ goto reread; -+ -+ if (j->csum != csum_set(j)) { -+ pr_err("jset may be corrupted: bad csum"); -+ ret = -EIO; -+ goto err; -+ } -+ -+ blocks = set_blocks(j, block_bytes(c)); -+ used_blocks += blocks; -+ -+ offset += blocks * ca->sb.block_size; -+ len -= blocks * ca->sb.block_size; -+ j = ((void *) j) + blocks * block_bytes(ca); -+ } -+ } -+out: -+ c->journal.blocks_free = -+ (c->sb.bucket_size >> c->block_bits) - -+ used_blocks; -+ -+ k->ptr[n++] = MAKE_PTR(0, bucket, ca->sb.nr_this_dev); -+ } -+ -+ BUG_ON(n == 0); -+ bkey_init(k); -+ SET_KEY_PTRS(k, n); -+ -+err: -+ return ret; -+} -+ - int bch_journal_read(struct cache_set *c, struct list_head *list) - { - #define read_bucket(b) \ -@@ -268,6 +351,10 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) - struct journal_replay, - list)->j.seq; - -+ /* Initial value of c->journal.blocks_free should be 0 */ -+ BUG_ON(c->journal.blocks_free != 0); -+ ret = bch_journal_key_reload(c); -+ - return ret; - #undef read_bucket - } --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch b/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch deleted file mode 100644 index 39b9873..0000000 --- a/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch +++ /dev/null @@ -1,276 +0,0 @@ -From 97898c33b4126381cb08f8560623325cc23291e5 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 27 Feb 2019 20:35:02 +0800 -Subject: [RFC PATCH v2 04/16] bcache: fix journal deadlock during jouranl - replay - -A deadlock of bcache jouranling may 
happen during journal replay. Such -deadlock happens when, -- Journal space is totally full (no free blocks at all) and system crashes - or reboots. -- After reboot, the first journal entry handled by journal replay causes - btree split and journal_meta() is called to write an empty jset to - journal space. -- There is no journal space to write and journal_reclaim() fails to get - any available bucket because this is the first replayed journal entry - to be blocked. -Then the whole cache set is blocked from running. - -This patch is an effort to fix such journal replay deadlock in a simpler -way, -- Add a bool variable 'in_replay' in struct journal, set it to true when - journal replay starts, and set it to false when journal replay - completed. in_replay is initialized to be false. -- Reserve 6 sectors in journal bucket, do not use them in normal bcache - runtime. These sectors are only permitted to use during journal - replay (when c->journal.in_replay is true) - -Then in normal bcache runtime, journal space won't be totally full and -6 sectors are always reserved for journal replay time. After -system reboots, if bch_btree_insert() in bch_journal_replay() causes -btree split and bch_journal_meta() gets called to require 1 sector -from journal buckets to write an empty jset, there are enough reserved -space to serve. - -The reason to reserve 6 sectors is, we should choose a number that won't -fit into a bucket size. If the reserved space happens to be a whole -bucket, more logic has to be added in journal_replay() to handle -journal.blocks_free with reserved spaces in journal replay time. This is -why 6 sectors is chosen, it is 3KB and won't be any proper block size -or bucket size. - -The bcache btree node size is quite large, so btree node split won't be -a frequent event. 
And when btree node split happens, new added key will -be insert directly into uppper level or neighbor nodes and won't go into -journal again, only bch_journal_meta() is called to write jset metadata -which occupies 1 block in journal space. If blocksize is set to 4K size, -reserve 6 sectors indeed is 2 blocks, so there can be two continuously -btree splitting happen during journal replay, this is very very rare in -practice. As default blocksize is set to sector size, that equals to -6 blocks reserved. Contiously splitting the btree for 6 times in journal -replay is almost impossible, so the reserved space seems to be enough -in my humble opinion. - -If in future the reserved space turns out to be not enough, let's extend -it then. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 100 ++++++++++++++++++++++++++++++++++++++++---- - drivers/md/bcache/journal.h | 4 ++ - 2 files changed, 97 insertions(+), 7 deletions(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index a6deb16c15c8..c60a702f53a9 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -415,6 +415,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) - uint64_t start = i->j.last_seq, end = i->j.seq, n = start; - struct keylist keylist; - -+ s->journal.in_replay = true; -+ - list_for_each_entry(i, list, list) { - BUG_ON(i->pin && atomic_read(i->pin) != 1); - -@@ -448,6 +450,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) - pr_info("journal replay done, %i keys in %i entries, seq %llu", - keys, entries, end); - err: -+ s->journal.in_replay = false; - while (!list_empty(list)) { - i = list_first_entry(list, struct journal_replay, list); - list_del(&i->list); -@@ -577,6 +580,22 @@ static void do_journal_discard(struct cache *ca) - } - } - -+static inline bool last_available_journal_bucket(struct cache_set *c) -+{ -+ struct cache *ca; -+ unsigned int iter; -+ struct 
journal_device *ja; -+ -+ for_each_cache(ca, c, iter) { -+ ja = &ca->journal; -+ if (unlikely((ja->cur_idx + 1) % ca->sb.njournal_buckets == -+ ja->last_idx)) -+ return true; -+ } -+ -+ return false; -+} -+ - static void journal_reclaim(struct cache_set *c) - { - struct bkey *k = &c->journal.key; -@@ -584,6 +603,7 @@ static void journal_reclaim(struct cache_set *c) - uint64_t last_seq; - unsigned int iter, n = 0; - atomic_t p __maybe_unused; -+ bool last, do_wakeup = false; - - atomic_long_inc(&c->reclaim); - -@@ -606,8 +626,13 @@ static void journal_reclaim(struct cache_set *c) - for_each_cache(ca, c, iter) - do_journal_discard(ca); - -- if (c->journal.blocks_free) -+ last = last_available_journal_bucket(c); -+ if ((!last && c->journal.blocks_free) || -+ (last && (c->journal.blocks_free * c->sb.block_size) > -+ BCH_JOURNAL_RPLY_RESERVE)) { -+ do_wakeup = true; - goto out; -+ } - - /* - * Allocate: -@@ -632,9 +657,10 @@ static void journal_reclaim(struct cache_set *c) - bkey_init(k); - SET_KEY_PTRS(k, n); - c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; -+ do_wakeup = true; - } - out: -- if (!journal_full(&c->journal)) -+ if (do_wakeup && !journal_full(&c->journal)) - __closure_wake_up(&c->journal.wait); - } - -@@ -692,6 +718,21 @@ static void journal_write_unlock(struct closure *cl) - spin_unlock(&c->journal.lock); - } - -+static bool should_reclaim(struct cache_set *c, -+ struct journal_write *w) -+{ -+ if (unlikely(journal_full(&c->journal))) -+ return true; -+ -+ if (unlikely(last_available_journal_bucket(c) && -+ (!c->journal.in_replay) && -+ (c->journal.blocks_free * c->sb.block_size <= -+ BCH_JOURNAL_RPLY_RESERVE))) -+ return true; -+ -+ return false; -+} -+ - static void journal_write_unlocked(struct closure *cl) - __releases(c->journal.lock) - { -@@ -710,7 +751,7 @@ static void journal_write_unlocked(struct closure *cl) - if (!w->need_write) { - closure_return_with_destructor(cl, journal_write_unlock); - return; -- } else if 
(journal_full(&c->journal)) { -+ } else if (should_reclaim(c, w)) { - journal_reclaim(c); - spin_unlock(&c->journal.lock); - -@@ -798,6 +839,52 @@ static void journal_try_write(struct cache_set *c) - } - } - -+static bool no_journal_wait(struct cache_set *c, -+ size_t sectors) -+{ -+ bool last = last_available_journal_bucket(c); -+ size_t reserved_sectors = 0; -+ size_t n = min_t(size_t, -+ c->journal.blocks_free * c->sb.block_size, -+ PAGE_SECTORS << JSET_BITS); -+ -+ if (last && !c->journal.in_replay) -+ reserved_sectors = BCH_JOURNAL_RPLY_RESERVE; -+ -+ if (sectors <= (n - reserved_sectors)) -+ return true; -+ -+ return false; -+} -+ -+static bool should_try_write(struct cache_set *c, -+ struct journal_write *w) -+{ -+ size_t reserved_sectors, n, sectors; -+ -+ if (journal_full(&c->journal)) -+ return false; -+ -+ if (!last_available_journal_bucket(c)) -+ return true; -+ -+ /* the check in no_journal_wait exceeds BCH_JOURNAL_RPLY_RESERVE */ -+ if (w->data->keys == 0) -+ return false; -+ -+ reserved_sectors = BCH_JOURNAL_RPLY_RESERVE; -+ n = min_t(size_t, -+ (c->journal.blocks_free * c->sb.block_size), -+ PAGE_SECTORS << JSET_BITS); -+ sectors = __set_blocks(w->data, w->data->keys, -+ block_bytes(c)) * c->sb.block_size; -+ if (sectors <= (n - reserved_sectors)) -+ return true; -+ -+ return false; -+} -+ -+ - static struct journal_write *journal_wait_for_write(struct cache_set *c, - unsigned int nkeys) - __acquires(&c->journal.lock) -@@ -816,15 +903,13 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, - sectors = __set_blocks(w->data, w->data->keys + nkeys, - block_bytes(c)) * c->sb.block_size; - -- if (sectors <= min_t(size_t, -- c->journal.blocks_free * c->sb.block_size, -- PAGE_SECTORS << JSET_BITS)) -+ if (no_journal_wait(c, sectors)) - return w; - - if (wait) - closure_wait(&c->journal.wait, &cl); - -- if (!journal_full(&c->journal)) { -+ if (should_try_write(c, w)) { - if (wait) - trace_bcache_journal_entry_full(c); - -@@ -933,6 
+1018,7 @@ int bch_journal_alloc(struct cache_set *c) - INIT_DELAYED_WORK(&j->work, journal_write_work); - - c->journal_delay_ms = 100; -+ j->in_replay = false; - - j->w[0].c = c; - j->w[1].c = c; -diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h -index 66f0facff84b..54408e248a39 100644 ---- a/drivers/md/bcache/journal.h -+++ b/drivers/md/bcache/journal.h -@@ -108,6 +108,7 @@ struct journal { - struct closure io; - int io_in_flight; - struct delayed_work work; -+ bool in_replay; - - /* Number of blocks free in the bucket(s) we're currently writing to */ - unsigned int blocks_free; -@@ -159,6 +160,9 @@ struct journal_device { - - #define JOURNAL_PIN 20000 - -+/* Reserved jouranl space in sectors */ -+#define BCH_JOURNAL_RPLY_RESERVE 6U -+ - #define journal_full(j) \ - (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) - --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch b/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch deleted file mode 100644 index 07050e9..0000000 --- a/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch +++ /dev/null @@ -1,241 +0,0 @@ -From 4d3d26818916654397a930e8ce082b650dc809eb Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Thu, 28 Feb 2019 20:29:00 +0800 -Subject: [RFC PATCH v2 05/16] bcache: reserve space for journal_meta() in run - time - -Another journal deadlock of bcache jouranling can happen in normal -bcache runtime. It is very rare to happen but there are people report -bkey insert work queue blocked which caused by such deadlock. - -This is how such jouranling deadlock in runtime happens, -- Journal space is totally full and no free space to reclaim, jouranling - tasks waiting for space to write in journal_wait_for_write(). 
-- In order to have free journal space, btree_flush_write() is called to - flush earliest journaled in-memory btree key into btree node. Then all - journaled bkey in early used journal buckets are flushed to on-disk - btree, this journal bucket can be reclaimed for new coming journal - request. -- But if the earliest journaled bkey causes a btree node split during - insert it into btree node, finally journal_meta() will be called to - journal btree root (and other information) into the journal space. -- Unfortunately the journal space is full, and the journal entries have - to be flushed in linear turn. So bch_journal_meta() from bkey insert - is blocked too. -Then journaling deadlock during bcache run time happens. - -A method to fix such deadlock is to reserve some journal space too. The -reserved space can only be used when, -- Current journal bucket is the last journal bucket which has available - space to write into. -- When calling bch_journal(), current jset is empty and there is no key - in the inserting key list. This means the journal request is from - bch_journal_meta() and no non-reserved space can be used. - -Then if such journaling request is from bch_journal_meta() of inserting -the earliest journaled bkey back into btree, the deadlock condition won't -happen any more because the reserved space can be used for such -scenario. - -Since there are already 6 sectors reserved for journal replay, here we -reserve 7 sectors for runtime meta journal from btree split caused by -flushing journal entries back to btree node. Depends on block size from -1 sector to 4KB, the reserved space can serve from 7 to 2 journal -blocks. Indeed only one journal block reserved for such journal deadlock -scenario is enough, 2 continuous btree splits caused by two adjoining bkey -flushing from journal is very very rare to happen. So reserve 7 sectors -should work. 
- -Another reason for reserving 7 sectors is, there are already 6 sectors -reserved fo journal repley, so in total there are 13 sectors reserved in -last available journal bucket. 13 sectors won't be a proper bucket size, -so we don't need to add more code to handle journal.blocks_free -initialization for whole reserved jouranl bucket. Even such code logic -is simple, less code is better in my humble opinion. - -Again, if in future the reserved space turns out to be not enough, let's -extend it then. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 89 +++++++++++++++++++++++++++++++++------------ - drivers/md/bcache/journal.h | 1 + - 2 files changed, 66 insertions(+), 24 deletions(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index c60a702f53a9..6aa68ab7cd78 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -629,7 +629,7 @@ static void journal_reclaim(struct cache_set *c) - last = last_available_journal_bucket(c); - if ((!last && c->journal.blocks_free) || - (last && (c->journal.blocks_free * c->sb.block_size) > -- BCH_JOURNAL_RPLY_RESERVE)) { -+ (BCH_JOURNAL_RESERVE + BCH_JOURNAL_RPLY_RESERVE))) { - do_wakeup = true; - goto out; - } -@@ -718,18 +718,27 @@ static void journal_write_unlock(struct closure *cl) - spin_unlock(&c->journal.lock); - } - --static bool should_reclaim(struct cache_set *c, -- struct journal_write *w) -+static inline bool should_reclaim(struct cache_set *c, -+ struct journal_write *w) - { -- if (unlikely(journal_full(&c->journal))) -- return true; -+ bool last = last_available_journal_bucket(c); - -- if (unlikely(last_available_journal_bucket(c) && -- (!c->journal.in_replay) && -- (c->journal.blocks_free * c->sb.block_size <= -- BCH_JOURNAL_RPLY_RESERVE))) -+ if (!last && journal_full(&c->journal)) - return true; - -+ if (unlikely(last)) { -+ size_t n = c->journal.blocks_free * c->sb.block_size; -+ -+ if (!c->journal.in_replay) { -+ if (n <= 
BCH_JOURNAL_RESERVE + -+ BCH_JOURNAL_RPLY_RESERVE) -+ return true; -+ } else { -+ if (n <= BCH_JOURNAL_RPLY_RESERVE) -+ return true; -+ } -+ } -+ - return false; - } - -@@ -751,7 +760,9 @@ static void journal_write_unlocked(struct closure *cl) - if (!w->need_write) { - closure_return_with_destructor(cl, journal_write_unlock); - return; -- } else if (should_reclaim(c, w)) { -+ } -+ -+ if (should_reclaim(c, w)) { - journal_reclaim(c); - spin_unlock(&c->journal.lock); - -@@ -840,16 +851,26 @@ static void journal_try_write(struct cache_set *c) - } - - static bool no_journal_wait(struct cache_set *c, -- size_t sectors) -+ size_t sectors, -+ int nkeys) - { -+ bool is_journal_meta = (nkeys == 0) ? true : false; - bool last = last_available_journal_bucket(c); - size_t reserved_sectors = 0; -- size_t n = min_t(size_t, -- c->journal.blocks_free * c->sb.block_size, -- PAGE_SECTORS << JSET_BITS); -+ size_t n; -+ -+ if (unlikely(last)) { -+ if (!is_journal_meta) -+ reserved_sectors = BCH_JOURNAL_RESERVE + -+ BCH_JOURNAL_RPLY_RESERVE; -+ else -+ reserved_sectors = (!c->journal.in_replay) ? -+ BCH_JOURNAL_RPLY_RESERVE : 0; -+ } - -- if (last && !c->journal.in_replay) -- reserved_sectors = BCH_JOURNAL_RPLY_RESERVE; -+ n = min_t(size_t, -+ c->journal.blocks_free * c->sb.block_size, -+ PAGE_SECTORS << JSET_BITS); - - if (sectors <= (n - reserved_sectors)) - return true; -@@ -858,26 +879,46 @@ static bool no_journal_wait(struct cache_set *c, - } - - static bool should_try_write(struct cache_set *c, -- struct journal_write *w) -+ struct journal_write *w, -+ int nkeys) - { - size_t reserved_sectors, n, sectors; -+ bool last, empty_jset; - - if (journal_full(&c->journal)) - return false; - -- if (!last_available_journal_bucket(c)) -+ last = last_available_journal_bucket(c); -+ empty_jset = (w->data->keys == 0) ? true : false; -+ -+ if (!last) { -+ /* -+ * Not last available journal bucket, no reserved journal -+ * space restriction, an empty jset should not be here. 
-+ */ -+ BUG_ON(empty_jset); - return true; -+ } - -- /* the check in no_journal_wait exceeds BCH_JOURNAL_RPLY_RESERVE */ -- if (w->data->keys == 0) -+ if (empty_jset) { -+ /* -+ * If nkeys is 0 it means the journaling request is for meta -+ * data, which should be returned in journal_wait_for_write() -+ * by checking no_journal_wait(), and won't get here. -+ */ -+ BUG_ON(nkeys == 0); - return false; -+ } - -- reserved_sectors = BCH_JOURNAL_RPLY_RESERVE; -+ reserved_sectors = BCH_JOURNAL_RESERVE + -+ BCH_JOURNAL_RPLY_RESERVE; - n = min_t(size_t, - (c->journal.blocks_free * c->sb.block_size), - PAGE_SECTORS << JSET_BITS); -- sectors = __set_blocks(w->data, w->data->keys, -+ sectors = __set_blocks(w->data, -+ w->data->keys, - block_bytes(c)) * c->sb.block_size; -+ - if (sectors <= (n - reserved_sectors)) - return true; - -@@ -903,13 +944,13 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, - sectors = __set_blocks(w->data, w->data->keys + nkeys, - block_bytes(c)) * c->sb.block_size; - -- if (no_journal_wait(c, sectors)) -+ if (no_journal_wait(c, sectors, nkeys)) - return w; - - if (wait) - closure_wait(&c->journal.wait, &cl); - -- if (should_try_write(c, w)) { -+ if (should_try_write(c, w, nkeys)) { - if (wait) - trace_bcache_journal_entry_full(c); - -diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h -index 54408e248a39..55f81443f304 100644 ---- a/drivers/md/bcache/journal.h -+++ b/drivers/md/bcache/journal.h -@@ -162,6 +162,7 @@ struct journal_device { - - /* Reserved jouranl space in sectors */ - #define BCH_JOURNAL_RPLY_RESERVE 6U -+#define BCH_JOURNAL_RESERVE 7U - - #define journal_full(j) \ - (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0006-bcache-add-failure-check-to-run_cache_set-for-jou.patch b/for-test/jouranl-deadlock/v2/v2-0006-bcache-add-failure-check-to-run_cache_set-for-jou.patch deleted file mode 100644 index 47fee81..0000000 --- 
a/for-test/jouranl-deadlock/v2/v2-0006-bcache-add-failure-check-to-run_cache_set-for-jou.patch +++ /dev/null @@ -1,88 +0,0 @@ -From 7d1f183bf68623c2bea6ec5c41d091a65e426e47 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 13 Mar 2019 21:57:18 +0800 -Subject: [RFC PATCH v2 06/16] bcache: add failure check to run_cache_set() for - journal replay - -Currently run_cache_set() has no return value, if there is failure in -bch_journal_replay(), the caller of run_cache_set() has no idea about -such failure and just continue to execute following code after -run_cache_set(). The internal failure is triggered inside -bch_journal_replay() and being handled in async way. This behavior is -inefficient, while failure handling inside bch_journal_replay(), cache -register code is still running to start the cache set. Registering and -unregistering code running as same time may introduce some rare race -condition, and make the code to be more hard to be understood. - -This patch adds return value to run_cache_set(), and returns -EIO if -bch_journal_rreplay() fails. Then caller of run_cache_set() may detect -such failure and stop registering code flow immedidately inside -register_cache_set(). - -If journal replay fails, run_cache_set() can report error immediately -to register_cache_set(). This patch makes the failure handling for -bch_journal_replay() be in synchronized way, easier to understand and -debug, and avoid poetential race condition for register-and-unregister -in same time. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 17 ++++++++++++----- - 1 file changed, 12 insertions(+), 5 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index a697a3a923cd..036bffad0bfe 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1775,7 +1775,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) - return NULL; - } - --static void run_cache_set(struct cache_set *c) -+static int run_cache_set(struct cache_set *c) - { - const char *err = "cannot allocate memory"; - struct cached_dev *dc, *t; -@@ -1869,7 +1869,9 @@ static void run_cache_set(struct cache_set *c) - if (j->version < BCACHE_JSET_VERSION_UUID) - __uuid_write(c); - -- bch_journal_replay(c, &journal); -+ err = "bcache: replay journal failed"; -+ if (bch_journal_replay(c, &journal)) -+ goto err; - } else { - pr_notice("invalidating existing data"); - -@@ -1937,11 +1939,13 @@ static void run_cache_set(struct cache_set *c) - flash_devs_run(c); - - set_bit(CACHE_SET_RUNNING, &c->flags); -- return; -+ return 0; - err: - closure_sync(&cl); - /* XXX: test this, it's broken */ - bch_cache_set_error(c, "%s", err); -+ -+ return -EIO; - } - - static bool can_attach_cache(struct cache *ca, struct cache_set *c) -@@ -2005,8 +2009,11 @@ static const char *register_cache_set(struct cache *ca) - ca->set->cache[ca->sb.nr_this_dev] = ca; - c->cache_by_alloc[c->caches_loaded++] = ca; - -- if (c->caches_loaded == c->sb.nr_in_set) -- run_cache_set(c); -+ if (c->caches_loaded == c->sb.nr_in_set) { -+ err = "failed to run cache set"; -+ if (run_cache_set(c) < 0) -+ goto err; -+ } - - return NULL; - err: --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0007-bcache-add-comments-for-kobj-release-callback-rou.patch b/for-test/jouranl-deadlock/v2/v2-0007-bcache-add-comments-for-kobj-release-callback-rou.patch deleted file mode 100644 index c675a6d..0000000 --- 
a/for-test/jouranl-deadlock/v2/v2-0007-bcache-add-comments-for-kobj-release-callback-rou.patch +++ /dev/null @@ -1,62 +0,0 @@ -From 79d3266fac98e11fab0d044f82decc1491344f74 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 13 Mar 2019 22:39:37 +0800 -Subject: [RFC PATCH v2 07/16] bcache: add comments for kobj release callback - routine - -Bcache has several routines to release resources in implicit way, they -are called when the associated kobj released. This patch adds code -comments to notice when and which release callback will be called, -- When dc->disk.kobj released: - void bch_cached_dev_release(struct kobject *kobj) -- When d->kobj released: - void bch_flash_dev_release(struct kobject *kobj) -- When c->kobj released: - void bch_cache_set_release(struct kobject *kobj) -- When ca->kobj released - void bch_cache_release(struct kobject *kobj) - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 036bffad0bfe..400af446c372 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1174,6 +1174,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, - return 0; - } - -+/* when dc->disk.kobj released */ - void bch_cached_dev_release(struct kobject *kobj) - { - struct cached_dev *dc = container_of(kobj, struct cached_dev, -@@ -1326,6 +1327,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, - - /* Flash only volumes */ - -+/* When d->kobj released */ - void bch_flash_dev_release(struct kobject *kobj) - { - struct bcache_device *d = container_of(kobj, struct bcache_device, -@@ -1496,6 +1498,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) 
- return true; - } - -+/* When c->kobj released */ - void bch_cache_set_release(struct kobject *kobj) - { - struct cache_set *c = container_of(kobj, struct cache_set, kobj); -@@ -2023,6 +2026,7 @@ static const char *register_cache_set(struct cache *ca) - - /* Cache device */ - -+/* When ca->kobj released */ - void bch_cache_release(struct kobject *kobj) - { - struct cache *ca = container_of(kobj, struct cache, kobj); --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0008-bcache-return-error-immediately-in-bch_journal_re.patch b/for-test/jouranl-deadlock/v2/v2-0008-bcache-return-error-immediately-in-bch_journal_re.patch deleted file mode 100644 index 01f188c..0000000 --- a/for-test/jouranl-deadlock/v2/v2-0008-bcache-return-error-immediately-in-bch_journal_re.patch +++ /dev/null @@ -1,48 +0,0 @@ -From 4bec08de9304ae05a5a934708813bdc61dc41f1e Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 13 Mar 2019 22:52:31 +0800 -Subject: [RFC PATCH v2 08/16] bcache: return error immediately in - bch_journal_replay() - -When failure happens inside bch_journal_replay(), calling -cache_set_err_on() and handling the failure in async way is not a good -idea. Because after bch_journal_replay() returns, registering code will -continue to execute following steps, and unregistering code triggered -by cache_set_err_on() is running at the same time. First it is unnecessary -to handle failure and unregister cache set in an async way, second there -might be potential race condition to run register and unregister code -for same cache set. - -So in this patch, if failure happens in bch_journal_replay(), we don't -call cache_set_err_on(), and just print out the same error message to -kernel message buffer, then return -EIO immediately to the caller. Then caller -can detect such failure and handle it in synchronized way. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 9 ++++++--- - 1 file changed, 6 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 6aa68ab7cd78..bdb6f9cefe48 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -420,9 +420,12 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) - list_for_each_entry(i, list, list) { - BUG_ON(i->pin && atomic_read(i->pin) != 1); - -- cache_set_err_on(n != i->j.seq, s, --"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", -- n, i->j.seq - 1, start, end); -+ if (n != i->j.seq) { -+ pr_err("bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", -+ n, i->j.seq - 1, start, end); -+ ret = -EIO; -+ goto err; -+ } - - for (k = i->j.start; - k < bset_bkey_last(&i->j); --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0009-bcache-add-error-check-for-calling-register_bdev.patch b/for-test/jouranl-deadlock/v2/v2-0009-bcache-add-error-check-for-calling-register_bdev.patch deleted file mode 100644 index 4d342e2..0000000 --- a/for-test/jouranl-deadlock/v2/v2-0009-bcache-add-error-check-for-calling-register_bdev.patch +++ /dev/null @@ -1,91 +0,0 @@ -From bb554ecefc017bdaa6aeb717010a8fa97036da51 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 19 Mar 2019 12:27:53 +0800 -Subject: [RFC PATCH v2 09/16] bcache: add error check for calling - register_bdev() - -This patch adds return value to register_bdev(). Then if failure happens -inside register_bdev(), its caller register_bcache() may detect and -handle the failure more properly. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 16 ++++++++++------ - 1 file changed, 10 insertions(+), 6 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 400af446c372..a435c506edba 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -1281,7 +1281,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) - - /* Cached device - bcache superblock */ - --static void register_bdev(struct cache_sb *sb, struct page *sb_page, -+static int register_bdev(struct cache_sb *sb, struct page *sb_page, - struct block_device *bdev, - struct cached_dev *dc) - { -@@ -1319,10 +1319,11 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, - BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) - bch_cached_dev_run(dc); - -- return; -+ return 0; - err: - pr_notice("error %s: %s", dc->backing_dev_name, err); - bcache_device_stop(&dc->disk); -+ return -EIO; - } - - /* Flash only volumes */ -@@ -2273,7 +2274,7 @@ static bool bch_is_open(struct block_device *bdev) - static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, - const char *buffer, size_t size) - { -- ssize_t ret = size; -+ ssize_t ret = -EINVAL; - const char *err = "cannot allocate memory"; - char *path = NULL; - struct cache_sb *sb = NULL; -@@ -2307,7 +2308,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, - if (!IS_ERR(bdev)) - bdput(bdev); - if (attr == &ksysfs_register_quiet) -- goto out; -+ goto quiet_out; - } - goto err; - } -@@ -2328,8 +2329,10 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, - goto err_close; - - mutex_lock(&bch_register_lock); -- register_bdev(sb, sb_page, bdev, dc); -+ ret = register_bdev(sb, sb_page, bdev, dc); - mutex_unlock(&bch_register_lock); -+ if (ret < 0) -+ goto err; - } else { - struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); - -@@ -2339,6 +2342,8 @@ static ssize_t 
register_bcache(struct kobject *k, struct kobj_attribute *attr, - if (register_cache(sb, sb_page, bdev, ca) != 0) - goto err; - } -+quiet_out: -+ ret = size; - out: - if (sb_page) - put_page(sb_page); -@@ -2351,7 +2356,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); - err: - pr_info("error %s: %s", path, err); -- ret = -EINVAL; - goto out; - } - --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0010-bcache-Add-comments-for-blkdev_put-in-registratio.patch b/for-test/jouranl-deadlock/v2/v2-0010-bcache-Add-comments-for-blkdev_put-in-registratio.patch deleted file mode 100644 index 191177d..0000000 --- a/for-test/jouranl-deadlock/v2/v2-0010-bcache-Add-comments-for-blkdev_put-in-registratio.patch +++ /dev/null @@ -1,51 +0,0 @@ -From f4a737b08d573035889cbf3c70cdde528117a2cd Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 19 Mar 2019 12:29:52 +0800 -Subject: [RFC PATCH v2 10/16] bcache: Add comments for blkdev_put() in - registration code path - -Add comments to explain why in register_bcache() blkdev_put() won't -be called in two location. Add comments to explain why blkdev_put() -must be called in register_cache() when cache_alloc() failed. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 8 ++++++++ - 1 file changed, 8 insertions(+) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index a435c506edba..83a7cb0e0e45 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -2191,6 +2191,12 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, - - ret = cache_alloc(ca); - if (ret != 0) { -+ /* -+ * If we failed here, it means ca->kobj is not initialzed yet, -+ * kobject_put() won't be called and there is no chance to -+ * call blkdev_put() to bdev in bch_cache_release(). So we -+ * explictly call blkdev_put() here. 
-+ */ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); - if (ret == -ENOMEM) - err = "cache_alloc(): -ENOMEM"; -@@ -2331,6 +2337,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, - mutex_lock(&bch_register_lock); - ret = register_bdev(sb, sb_page, bdev, dc); - mutex_unlock(&bch_register_lock); -+ /* blkdev_put() will be called in cached_dev_free() */ - if (ret < 0) - goto err; - } else { -@@ -2339,6 +2346,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, - if (!ca) - goto err_close; - -+ /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(sb, sb_page, bdev, ca) != 0) - goto err; - } --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0011-bcache-add-comments-for-closure_fn-to-be-called-i.patch b/for-test/jouranl-deadlock/v2/v2-0011-bcache-add-comments-for-closure_fn-to-be-called-i.patch deleted file mode 100644 index 3b0c2e3..0000000 --- a/for-test/jouranl-deadlock/v2/v2-0011-bcache-add-comments-for-closure_fn-to-be-called-i.patch +++ /dev/null @@ -1,42 +0,0 @@ -From ca49b08f0e1e634bb5082413ee34b4d8080e0d38 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 19 Mar 2019 18:58:47 +0800 -Subject: [RFC PATCH v2 11/16] bcache: add comments for closure_fn to be called - in closure_queue() - -Add code comments to explain which call back function might be called -for the closure_queue(). This is an effort to make code to be more -understandable for readers. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 83a7cb0e0e45..9b41e0b62cc0 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -662,6 +662,11 @@ static const struct block_device_operations bcache_ops = { - void bcache_device_stop(struct bcache_device *d) - { - if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) -+ /* -+ * closure_fn set to -+ * - cached device: cached_dev_flush() -+ * - flash dev: flash_dev_flush() -+ */ - closure_queue(&d->cl); - } - -@@ -1677,6 +1682,7 @@ static void __cache_set_unregister(struct closure *cl) - void bch_cache_set_stop(struct cache_set *c) - { - if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags)) -+ /* closure_fn set to __cache_set_unregister() */ - closure_queue(&c->caching); - } - --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0012-bcache-add-pendings_cleanup-to-stop-pending-bcach.patch b/for-test/jouranl-deadlock/v2/v2-0012-bcache-add-pendings_cleanup-to-stop-pending-bcach.patch deleted file mode 100644 index d81c648..0000000 --- a/for-test/jouranl-deadlock/v2/v2-0012-bcache-add-pendings_cleanup-to-stop-pending-bcach.patch +++ /dev/null @@ -1,107 +0,0 @@ -From 6da8faaaf5e2ecd2fb3d11ae6bd8ab8ee19b39bc Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Wed, 20 Mar 2019 23:11:59 +0800 -Subject: [RFC PATCH v2 12/16] bcache: add pendings_cleanup to stop pending - bcache device - -If a bcache device is in dirty state and its cache set is not -registered, this bcache device will not appear in /dev/bcache<N>, -and there is no way to stop it or remove the bcache kernel module. - -This is an as-designed behavior, but sometimes people have to reboot -whole system to release or stop the pending backing device. - -This sysfs interface may remove such pending bcache devices when -write anything into the sysfs file manually. 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 55 insertions(+) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 9b41e0b62cc0..e988e46a6479 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -2246,9 +2246,13 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, - - static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, - const char *buffer, size_t size); -+static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, -+ struct kobj_attribute *attr, -+ const char *buffer, size_t size); - - kobj_attribute_write(register, register_bcache); - kobj_attribute_write(register_quiet, register_bcache); -+kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); - - static bool bch_is_open_backing(struct block_device *bdev) - { -@@ -2373,6 +2377,56 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, - goto out; - } - -+ -+struct pdev { -+ struct list_head list; -+ struct cached_dev *dc; -+}; -+ -+static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, -+ struct kobj_attribute *attr, -+ const char *buffer, -+ size_t size) -+{ -+ LIST_HEAD(pending_devs); -+ ssize_t ret = size; -+ struct cached_dev *dc, *tdc; -+ struct pdev *pdev, *tpdev; -+ struct cache_set *c, *tc; -+ -+ mutex_lock(&bch_register_lock); -+ list_for_each_entry_safe(dc, tdc, &uncached_devices, list) { -+ pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL); -+ if (!pdev) -+ break; -+ pdev->dc = dc; -+ list_add(&pdev->list, &pending_devs); -+ } -+ -+ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { -+ list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { -+ char *pdev_set_uuid = pdev->dc->sb.set_uuid; -+ char *set_uuid = c->sb.uuid; -+ -+ if (!memcmp(pdev_set_uuid, set_uuid, 16)) { -+ list_del(&pdev->list); -+ kfree(pdev); -+ break; -+ } -+ } -+ } -+ 
mutex_unlock(&bch_register_lock); -+ -+ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { -+ pr_info("delete pdev %p", pdev); -+ list_del(&pdev->list); -+ bcache_device_stop(&pdev->dc->disk); -+ kfree(pdev); -+ } -+ -+ return ret; -+} -+ - static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) - { - if (code == SYS_DOWN || -@@ -2483,6 +2537,7 @@ static int __init bcache_init(void) - static const struct attribute *files[] = { - &ksysfs_register.attr, - &ksysfs_register_quiet.attr, -+ &ksysfs_pendings_cleanup.attr, - NULL - }; - --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0013-bcache-fix-fifo-index-swapping-condition-in-btree.patch b/for-test/jouranl-deadlock/v2/v2-0013-bcache-fix-fifo-index-swapping-condition-in-btree.patch deleted file mode 100644 index d76c955..0000000 --- a/for-test/jouranl-deadlock/v2/v2-0013-bcache-fix-fifo-index-swapping-condition-in-btree.patch +++ /dev/null @@ -1,90 +0,0 @@ -From e6ac565cfb5676a9e833e62570fb8a9d786eda47 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sat, 23 Mar 2019 22:54:35 +0800 -Subject: [RFC PATCH v2 13/16] bcache: fix fifo index swapping condition in - btree_flush_write() - -Current journal_max_cmp() and journal_min_cmp() assume that smaller fifo -index indicating elder journal entries, but this is only true when fifo -index is not swapped. - -Fifo structure journal.pin is implemented by a cycle buffer, if the head -index reaches highest location of the cycle buffer, it will be swapped -to 0. Once the swapping happens, it means a smaller fifo index might be -associated to a newer journal entry. So the btree node with oldest -journal entry won't be selected by btree_flush_write() to flush out to -cache device. The result is, the oldest journal entries may always has -no chance to be written into cache device, and after a reboot -bch_journal_replay() may complain some journal entries are missing. 
- -This patch handles the fifo index swapping conditions properly, then in -btree_flush_write() the btree node with oldest journal entry can be -slected from c->flush_btree correctly. - -Cc: stable@vger.kernel.org -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 47 +++++++++++++++++++++++++++++++++++++++------ - 1 file changed, 41 insertions(+), 6 deletions(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index bdb6f9cefe48..bc0e01151155 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -464,12 +464,47 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) - } - - /* Journalling */ --#define journal_max_cmp(l, r) \ -- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \ -- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) --#define journal_min_cmp(l, r) \ -- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \ -- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) -+#define journal_max_cmp(l, r) \ -+({ \ -+ int l_idx, r_idx, f_idx, b_idx; \ -+ bool _ret = true; \ -+ \ -+ l_idx = fifo_idx(&c->journal.pin, btree_current_write(l)->journal); \ -+ r_idx = fifo_idx(&c->journal.pin, btree_current_write(r)->journal); \ -+ f_idx = c->journal.pin.front; \ -+ b_idx = c->journal.pin.back; \ -+ \ -+ _ret = (l_idx < r_idx); \ -+ /* in case fifo back pointer is swapped */ \ -+ if (b_idx < f_idx) { \ -+ if (l_idx <= b_idx && r_idx >= f_idx) \ -+ _ret = false; \ -+ else if (l_idx >= f_idx && r_idx <= b_idx) \ -+ _ret = true; \ -+ } \ -+ _ret; \ -+}) -+ -+#define journal_min_cmp(l, r) \ -+({ \ -+ int l_idx, r_idx, f_idx, b_idx; \ -+ bool _ret = true; \ -+ \ -+ l_idx = fifo_idx(&c->journal.pin, btree_current_write(l)->journal); \ -+ r_idx = fifo_idx(&c->journal.pin, btree_current_write(r)->journal); \ -+ f_idx = c->journal.pin.front; \ -+ b_idx = c->journal.pin.back; \ -+ \ -+ _ret = (l_idx > r_idx); \ -+ /* in case fifo back pointer is 
swapped */ \ -+ if (b_idx < f_idx) { \ -+ if (l_idx <= b_idx && r_idx >= f_idx) \ -+ _ret = true; \ -+ else if (l_idx >= f_idx && r_idx <= b_idx) \ -+ _ret = false; \ -+ } \ -+ _ret; \ -+}) - - static void btree_flush_write(struct cache_set *c) - { --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0014-bcache-try-to-flush-btree-nodes-as-many-as-possib.patch b/for-test/jouranl-deadlock/v2/v2-0014-bcache-try-to-flush-btree-nodes-as-many-as-possib.patch deleted file mode 100644 index 4955ef8..0000000 --- a/for-test/jouranl-deadlock/v2/v2-0014-bcache-try-to-flush-btree-nodes-as-many-as-possib.patch +++ /dev/null @@ -1,82 +0,0 @@ -From d5786e57fca69b65b4b334e34d9ec8033ed6721f Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 24 Mar 2019 00:06:05 +0800 -Subject: [RFC PATCH v2 14/16] bcache: try to flush btree nodes as many as - possible - -When btree_flush_write() is called, it means the journal space is -exhausted already. Current code only selects a single btree node to -write out, which may introduce huge cache bounce from the spinlock on -multiple cpu cores, when a lot of kworkers on journaling code path to -call btree_flush_write() for journal space reclaiming. - -This patch tries to flush as many btree nodes as possible inside -a single call to btree_flush_write(), then the frequency of calling -btree_flush_write() can be reduced, which in turn reduces the cache -bounce from spinlock on multiple cpu cores. Please notice that this -patch does not reduce the total times of acquiring spinlock, a spin -lock is still acquired when select every single btree node to write -out, but this patch will try best to hold the spinlock on same cpu -core, which avoids the cache bounce where the spinlock is acquired by -multiple different cpu cores. - -After the patch applied, in my pressure testing, 'top' shows more than -50% sys cpu time reduced from the kworkers which competing spinlock -inside btree_flush_write(). 
- -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 7 ++++++- - drivers/md/bcache/journal.h | 4 ++-- - 2 files changed, 8 insertions(+), 3 deletions(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index bc0e01151155..8536e76fcac9 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -514,6 +514,7 @@ static void btree_flush_write(struct cache_set *c) - */ - struct btree *b; - int i; -+ int n = FLUSH_BTREE_HEAP; - - atomic_long_inc(&c->flush_write); - -@@ -552,6 +553,10 @@ static void btree_flush_write(struct cache_set *c) - - __bch_btree_node_write(b, NULL); - mutex_unlock(&b->write_lock); -+ -+ /* try to flush btree nodes as many as possible */ -+ if (--n > 0) -+ goto retry; - } - } - -@@ -1102,7 +1107,7 @@ int bch_journal_alloc(struct cache_set *c) - j->w[0].c = c; - j->w[1].c = c; - -- if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) || -+ if (!(init_heap(&c->flush_btree, FLUSH_BTREE_HEAP, GFP_KERNEL)) || - !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || - !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) -diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h -index 55f81443f304..a8be14c6f6d9 100644 ---- a/drivers/md/bcache/journal.h -+++ b/drivers/md/bcache/journal.h -@@ -158,8 +158,8 @@ struct journal_device { - #define journal_pin_cmp(c, l, r) \ - (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) - --#define JOURNAL_PIN 20000 -- -+#define FLUSH_BTREE_HEAP 128 -+#define JOURNAL_PIN 20000 - /* Reserved jouranl space in sectors */ - #define BCH_JOURNAL_RPLY_RESERVE 6U - #define BCH_JOURNAL_RESERVE 7U --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0015-bcache-improve-bcache_reboot.patch b/for-test/jouranl-deadlock/v2/v2-0015-bcache-improve-bcache_reboot.patch deleted file mode 100644 index 3c92f1d..0000000 --- 
a/for-test/jouranl-deadlock/v2/v2-0015-bcache-improve-bcache_reboot.patch +++ /dev/null @@ -1,50 +0,0 @@ -From a2b3bb8c5d68a17ee630a75dc4cf81df8eb7ef97 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 24 Mar 2019 12:50:50 +0800 -Subject: [RFC PATCH v2 15/16] bcache: improve bcache_reboot() - -This patch tries to release mutex bch_register_lock early, to give -chance to stop cache set and bcache device early. - -This patch also extends the timeout for stopping all bcache devices from -2 seconds to 10 seconds, because stopping writeback rate update worker -may delay for 5 seconds, 2 seconds is not enough. - -After this patch applied, stopping bcache devices during system reboot -or shutdown is very hard to be observed any more. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/super.c | 6 ++++-- - 1 file changed, 4 insertions(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index e988e46a6479..2d377a4a182f 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -2453,10 +2453,13 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) - list_for_each_entry_safe(dc, tdc, &uncached_devices, list) - bcache_device_stop(&dc->disk); - -+ mutex_unlock(&bch_register_lock); -+ - /* What's a condition variable? 
*/ - while (1) { -- long timeout = start + 2 * HZ - jiffies; -+ long timeout = start + 10 * HZ - jiffies; - -+ mutex_lock(&bch_register_lock); - stopped = list_empty(&bch_cache_sets) && - list_empty(&uncached_devices); - -@@ -2468,7 +2471,6 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) - - mutex_unlock(&bch_register_lock); - schedule_timeout(timeout); -- mutex_lock(&bch_register_lock); - } - - finish_wait(&unregister_wait, &wait); --- -2.16.4 - diff --git a/for-test/jouranl-deadlock/v2/v2-0016-bcache-introduce-spinlock_t-flush_write_lock-in-s.patch b/for-test/jouranl-deadlock/v2/v2-0016-bcache-introduce-spinlock_t-flush_write_lock-in-s.patch deleted file mode 100644 index a3d6691..0000000 --- a/for-test/jouranl-deadlock/v2/v2-0016-bcache-introduce-spinlock_t-flush_write_lock-in-s.patch +++ /dev/null @@ -1,74 +0,0 @@ -From 24539bb78565d784ddabb81f24968c13835eb000 Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Sun, 24 Mar 2019 23:55:27 +0800 -Subject: [RFC PATCH v2 16/16] bcache: introduce spinlock_t flush_write_lock in - struct journal - -In btree_flush_write(), iterating all cached btree nodes and adding them -into ordered heap c->flush_btree takes quite long time. In order to -protect ordered heap c->flush_btree, spin lock c->journal.lock is held -for all the iteration and heap ordering. When journal space is fully -occupied, btree_flush_write() might be called frequently, if the cached -btree node iteration takes too much time, kernel will complain that -normal journal kworkers are blocked too long. Of course write performance -drops at this moment. - -This patch introduces a new spin lock member in struct journal, named -flush_write_lock. This lock is only used in btree_flush_write() and -protects the ordered heap c->flush_btree during all the cached btree node -iteration. Then there won't be lock contention on c->journal.lock. 
- -After this fix, when journal space is fully occupied, it is very rare to -observe the journal kworker blocking timeout warning. - -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/journal.c | 5 +++-- - drivers/md/bcache/journal.h | 1 + - 2 files changed, 4 insertions(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c -index 8536e76fcac9..6e38470f6924 100644 ---- a/drivers/md/bcache/journal.c -+++ b/drivers/md/bcache/journal.c -@@ -519,7 +519,7 @@ static void btree_flush_write(struct cache_set *c) - atomic_long_inc(&c->flush_write); - - retry: -- spin_lock(&c->journal.lock); -+ spin_lock(&c->journal.flush_write_lock); - if (heap_empty(&c->flush_btree)) { - for_each_cached_btree(b, c, i) - if (btree_current_write(b)->journal) { -@@ -540,7 +540,7 @@ static void btree_flush_write(struct cache_set *c) - - b = NULL; - heap_pop(&c->flush_btree, b, journal_min_cmp); -- spin_unlock(&c->journal.lock); -+ spin_unlock(&c->journal.flush_write_lock); - - if (b) { - mutex_lock(&b->write_lock); -@@ -1099,6 +1099,7 @@ int bch_journal_alloc(struct cache_set *c) - struct journal *j = &c->journal; - - spin_lock_init(&j->lock); -+ spin_lock_init(&j->flush_write_lock); - INIT_DELAYED_WORK(&j->work, journal_write_work); - - c->journal_delay_ms = 100; -diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h -index a8be14c6f6d9..d8ad99f6191b 100644 ---- a/drivers/md/bcache/journal.h -+++ b/drivers/md/bcache/journal.h -@@ -103,6 +103,7 @@ struct journal_write { - /* Embedded in struct cache_set */ - struct journal { - spinlock_t lock; -+ spinlock_t flush_write_lock; - /* used when waiting because the journal was full */ - struct closure_waitlist wait; - struct closure io; --- -2.16.4 - |