author    Coly Li <colyli@suse.de>    2022-05-22 00:50:52 +0800
committer Coly Li <colyli@suse.de>    2022-05-22 00:50:52 +0800
commit    41347a6d6406e1297ae11c7eb003c0b284a25720 (patch)
tree      8cb4c47e1ed9ed66babe5ebde0d684b4f0c92145
parent    995eb52153c879646c1dedb21ff4d2683aa4966d (diff)
download  bcache-patches-41347a6d6406e1297ae11c7eb003c0b284a25720.tar.gz
update for-test and for-next
-rw-r--r--  for-next/0001-bcache-improve-multithreaded-bch_btree_check.patch | 140
-rw-r--r--  for-next/0002-bcache-improve-multithreaded-bch_sectors_dirty_init.patch | 132
-rw-r--r--  for-next/0003-bcache-remove-incremental-dirty-sector-counting-for-.patch | 138
-rw-r--r--  for-next/0004-bcache-avoid-journal-no-space-deadlock-by-reserving-.patch | 148
-rw-r--r--  for-next/nvmpg-bcache-btree/draft/0001-bcache-add-initial-data-structures-for-nvm-pages.patch | 343
-rw-r--r--  for-next/nvmpg-bcache-btree/draft/0002-bcache-initialize-the-nvm-pages-allocator.patch | 543
-rw-r--r--  for-next/nvmpg-bcache-btree/draft/0003-bcache-initialization-of-the-buddy.patch | 359
-rw-r--r--  for-next/nvmpg-bcache-btree/draft/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch | 309
-rw-r--r--  for-next/nvmpg-bcache-btree/draft/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch | 252
-rw-r--r--  for-next/nvmpg-bcache-btree/draft/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch | 67
-rw-r--r--  for-next/nvmpg-bcache-btree/draft/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch | 48
-rw-r--r--  for-next/nvmpg-bcache-btree/draft/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch | 60
-rw-r--r--  for-next/nvmpg-bcache-btree/draft/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch | 255
-rw-r--r--  for-next/nvmpg-bcache-btree/draft/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch | 231
-rw-r--r--  for-next/nvmpg-bcache-btree/draft/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch | 181
-rw-r--r--  for-next/nvmpg-bcache-btree/draft/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch | 84
-rw-r--r--  for-next/nvmpg-bcache-btree/draft/0013-store-btree-node-in-nvdimm.patch | 489
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/old/0001-bcache-add-initial-data-structures-for-nvm-pages.patch | 343
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/old/0002-bcache-initialize-the-nvm-pages-allocator.patch | 542
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/old/0003-bcache-initialization-of-the-buddy.patch | 359
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/old/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch | 308
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/old/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch | 251
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/old/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch | 66
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/old/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch | 48
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/old/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch | 60
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/old/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch | 255
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/old/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch | 231
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/old/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch | 181
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/old/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch | 84
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/v13-0000-cover-letter.patch | 125
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/v13-0001-bcache-add-initial-data-structures-for-nvm-pages.patch | 343
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/v13-0002-bcache-initialize-the-nvm-pages-allocator.patch | 542
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/v13-0003-bcache-initialization-of-the-buddy.patch | 359
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/v13-0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch | 308
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/v13-0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocat.patch | 252
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/v13-0006-bcache-get-recs-list-head-for-allocated-pages-by.patch | 66
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/v13-0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-.patch | 48
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/v13-0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into.patch | 60
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/v13-0009-bcache-initialize-bcache-journal-for-NVDIMM-meta.patch | 255
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/v13-0010-bcache-support-storing-bcache-journal-into-NVDIM.patch | 231
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/v13-0011-bcache-read-jset-from-NVDIMM-pages-for-journal-r.patch | 182
-rw-r--r--  for-next/nvmpg-bcache-journaling-v13/v13-0012-bcache-add-sysfs-interface-register_nvdimm_meta-.patch | 84
-rw-r--r--  for-test/0001-bcache-avoid-unnecessary-soft-lockup-in-kworker-upda.patch | 166
-rw-r--r--  for-test/badblocks/v4/backup/0001-badblocks-add-more-helper-structure-and-routines-in-.patch | 92
-rw-r--r--  for-test/badblocks/v4/backup/0002-badblocks-add-helper-routines-for-badblock-ranges-ha.patch | 456
-rw-r--r--  for-test/badblocks/v4/backup/0003-badblocks-improvement-badblocks_set-for-multiple-ran.patch | 662
-rw-r--r--  for-test/badblocks/v4/backup/0004-badblocks-improve-badblocks_clear-for-multiple-range.patch | 401
-rw-r--r--  for-test/badblocks/v4/backup/0005-badblocks-improve-badblocks_check-for-multiple-range.patch | 177
-rw-r--r--  for-test/badblocks/v4/backup/0006-badblocks-switch-to-the-improved-badblock-handling-c.patch | 364
-rw-r--r--  for-test/badblocks/v4/v4-0000-cover-letter.patch | 70
-rw-r--r--  for-test/badblocks/v4/v4-0001-badblocks-add-more-helper-structure-and-routines-.patch | 91
-rw-r--r--  for-test/badblocks/v4/v4-0002-badblocks-add-helper-routines-for-badblock-ranges.patch | 457
-rw-r--r--  for-test/badblocks/v4/v4-0003-badblocks-improvement-badblocks_set-for-multiple-.patch | 661
-rw-r--r--  for-test/badblocks/v4/v4-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch | 399
-rw-r--r--  for-test/badblocks/v4/v4-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch | 175
-rw-r--r--  for-test/badblocks/v4/v4-0006-badblocks-switch-to-the-improved-badblock-handlin.patch | 365
-rw-r--r--  for-test/badblocks/v4/v4-0007-test-user-space-code-to-test-badblocks-APIs.patch (renamed from for-test/badblocks/v2/v2-0007-test-user-space-code-to-test-badblocks-APIs.patch) | 32
-rw-r--r--  for-test/badblocks/v5/v5-0000-cover-letter.patch | 70
-rw-r--r--  for-test/badblocks/v5/v5-0001-badblocks-add-more-helper-structure-and-routines-.patch | 91
-rw-r--r--  for-test/badblocks/v5/v5-0002-badblocks-add-helper-routines-for-badblock-ranges.patch | 459
-rw-r--r--  for-test/badblocks/v5/v5-0003-badblocks-improve-badblocks_set-for-multiple-rang.patch | 663
-rw-r--r--  for-test/badblocks/v5/v5-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch | 399
-rw-r--r--  for-test/badblocks/v5/v5-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch | 175
-rw-r--r--  for-test/badblocks/v5/v5-0006-badblocks-switch-to-the-improved-badblock-handlin.patch | 365
-rw-r--r--  for-test/badblocks/v5/v5-0007-test-user-space-code-to-test-badblocks-APIs.patch | 2303
-rw-r--r--  for-test/jouranl-deadlock/0001-reserve-journal-space.patch | 369
-rw-r--r--  for-test/jouranl-deadlock/0002-more-fixes.patch | 131
-rw-r--r--  for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch (renamed from for-test/jouranl-deadlock/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch) | 0
-rw-r--r--  for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch (renamed from for-test/jouranl-deadlock/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch) | 0
-rw-r--r--  for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch (renamed from for-test/jouranl-deadlock/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch) | 0
70 files changed, 18939 insertions, 16 deletions
diff --git a/for-next/0001-bcache-improve-multithreaded-bch_btree_check.patch b/for-next/0001-bcache-improve-multithreaded-bch_btree_check.patch
new file mode 100644
index 0000000..9fb59df
--- /dev/null
+++ b/for-next/0001-bcache-improve-multithreaded-bch_btree_check.patch
@@ -0,0 +1,140 @@
+From ead990f754571c9492943b437014abab6894955c Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 21 May 2022 13:08:58 +0800
+Subject: [PATCH 1/4] bcache: improve multithreaded bch_btree_check()
+
+Commit 8e7102273f59 ("bcache: make bch_btree_check() to be
+multithreaded") makes bch_btree_check() to be much faster when checking
+all btree nodes during cache device registration. But it isn't in ideal
+shape yet and still can be improved.
+
+This patch does the following things to improve the current parallel btree
+node checking by multiple threads in bch_btree_check(),
+- Add read lock to root node while checking all the btree nodes with
+ multiple threads. Although it is not currently mandatory, it is
+ good to have a read lock in the code logic.
+- Remove local variable 'char name[32]', and generate kernel thread name
+ string directly when calling kthread_run().
+- Allocate local variable "struct btree_check_state check_state" on the
+ stack and avoid unnecessary dynamic memory allocation for it.
+- Increase check_state->started to count a created kernel thread only after
+ it is created successfully.
+- When waiting for all checking kernel threads to finish, use wait_event()
+ to replace wait_event_interruptible().
+
+With this change, the code is clearer, and some potential error
+conditions are avoided.
+
+Fixes: 8e7102273f59 ("bcache: make bch_btree_check() to be multithreaded")
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: stable@vger.kernel.org
+---
+ drivers/md/bcache/btree.c | 58 ++++++++++++++++++---------------------
+ 1 file changed, 26 insertions(+), 32 deletions(-)
+
+diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
+index ad9f16689419..2362bb8ef6d1 100644
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -2006,8 +2006,7 @@ int bch_btree_check(struct cache_set *c)
+ int i;
+ struct bkey *k = NULL;
+ struct btree_iter iter;
+- struct btree_check_state *check_state;
+- char name[32];
++ struct btree_check_state check_state;
+
+ /* check and mark root node keys */
+ for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
+@@ -2018,63 +2017,58 @@ int bch_btree_check(struct cache_set *c)
+ if (c->root->level == 0)
+ return 0;
+
+- check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL);
+- if (!check_state)
+- return -ENOMEM;
+-
+- check_state->c = c;
+- check_state->total_threads = bch_btree_chkthread_nr();
+- check_state->key_idx = 0;
+- spin_lock_init(&check_state->idx_lock);
+- atomic_set(&check_state->started, 0);
+- atomic_set(&check_state->enough, 0);
+- init_waitqueue_head(&check_state->wait);
++ check_state.c = c;
++ check_state.total_threads = bch_btree_chkthread_nr();
++ check_state.key_idx = 0;
++ spin_lock_init(&check_state.idx_lock);
++ atomic_set(&check_state.started, 0);
++ atomic_set(&check_state.enough, 0);
++ init_waitqueue_head(&check_state.wait);
+
++ rw_lock(0, c->root, c->root->level);
+ /*
+ * Run multiple threads to check btree nodes in parallel,
+- * if check_state->enough is non-zero, it means current
++ * if check_state.enough is non-zero, it means current
+ * running check threads are enough, unncessary to create
+ * more.
+ */
+- for (i = 0; i < check_state->total_threads; i++) {
+- /* fetch latest check_state->enough earlier */
++ for (i = 0; i < check_state.total_threads; i++) {
++ /* fetch latest check_state.enough earlier */
+ smp_mb__before_atomic();
+- if (atomic_read(&check_state->enough))
++ if (atomic_read(&check_state.enough))
+ break;
+
+- check_state->infos[i].result = 0;
+- check_state->infos[i].state = check_state;
+- snprintf(name, sizeof(name), "bch_btrchk[%u]", i);
+- atomic_inc(&check_state->started);
++ check_state.infos[i].result = 0;
++ check_state.infos[i].state = &check_state;
+
+- check_state->infos[i].thread =
++ check_state.infos[i].thread =
+ kthread_run(bch_btree_check_thread,
+- &check_state->infos[i],
+- name);
+- if (IS_ERR(check_state->infos[i].thread)) {
++ &check_state.infos[i],
++ "bch_btrchk[%d]", i);
++ if (IS_ERR(check_state.infos[i].thread)) {
+ pr_err("fails to run thread bch_btrchk[%d]\n", i);
+ for (--i; i >= 0; i--)
+- kthread_stop(check_state->infos[i].thread);
++ kthread_stop(check_state.infos[i].thread);
+ ret = -ENOMEM;
+ goto out;
+ }
++ atomic_inc(&check_state.started);
+ }
+
+ /*
+ * Must wait for all threads to stop.
+ */
+- wait_event_interruptible(check_state->wait,
+- atomic_read(&check_state->started) == 0);
++ wait_event(check_state.wait, atomic_read(&check_state.started) == 0);
+
+- for (i = 0; i < check_state->total_threads; i++) {
+- if (check_state->infos[i].result) {
+- ret = check_state->infos[i].result;
++ for (i = 0; i < check_state.total_threads; i++) {
++ if (check_state.infos[i].result) {
++ ret = check_state.infos[i].result;
+ goto out;
+ }
+ }
+
+ out:
+- kfree(check_state);
++ rw_unlock(0, c->root);
+ return ret;
+ }
+
+--
+2.35.3
+
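For readers unfamiliar with the pattern the patch above settles on (on-stack state, printf-style kthread_run() naming, counting a worker only after successful creation, and a non-interruptible wait), here is a minimal, self-contained kernel-style sketch. Everything named demo_* is hypothetical and only mirrors the shape of the bcache code; it is not part of the patch.

#include <linux/atomic.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/wait.h>

struct demo_state {
	atomic_t started;
	wait_queue_head_t wait;
	struct task_struct *threads[8];
};

static int demo_thread_fn(void *arg)
{
	struct demo_state *state = arg;

	/* ... per-thread work goes here ... */

	/* The last worker to finish wakes up the waiter. */
	if (atomic_dec_and_test(&state->started))
		wake_up(&state->wait);
	return 0;
}

static int demo_run(void)
{
	struct demo_state state;	/* on stack, no kzalloc()/kfree() */
	int i;

	atomic_set(&state.started, 0);
	init_waitqueue_head(&state.wait);

	for (i = 0; i < 8; i++) {
		/* printf-style name, no local "char name[32]" buffer */
		state.threads[i] = kthread_run(demo_thread_fn, &state,
					       "demo_worker[%d]", i);
		if (IS_ERR(state.threads[i])) {
			for (--i; i >= 0; i--)
				kthread_stop(state.threads[i]);
			return -ENOMEM;
		}
		/* count only threads that were actually created */
		atomic_inc(&state.started);
	}

	/*
	 * wait_event(), not wait_event_interruptible(): a signal must not
	 * make the waiter return while workers still reference "state".
	 */
	wait_event(state.wait, atomic_read(&state.started) == 0);
	return 0;
}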
diff --git a/for-next/0002-bcache-improve-multithreaded-bch_sectors_dirty_init.patch b/for-next/0002-bcache-improve-multithreaded-bch_sectors_dirty_init.patch
new file mode 100644
index 0000000..2a05768
--- /dev/null
+++ b/for-next/0002-bcache-improve-multithreaded-bch_sectors_dirty_init.patch
@@ -0,0 +1,132 @@
+From 7ff9ba24404e797a53fd44ae4c21b2234d46ca39 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 21 May 2022 14:14:17 +0800
+Subject: [PATCH 2/4] bcache: improve multithreaded bch_sectors_dirty_init()
+
+Commit b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be
+multithreaded") makes bch_sectors_dirty_init() to be much faster
+when counting dirty sectors by iterating all dirty keys in the btree.
+But it isn't in ideal shape yet and still can be improved.
+
+This patch does the following changes to improve current parallel dirty
+keys iteration on the btree,
+- Add read lock to root node when multiple threads iterating the btree,
+ to prevent the root node gets split by I/Os from other registered
+ bcache devices.
+- Remove local variable "char name[32]" and generate kernel thread name
+ string directly when calling kthread_run().
+- Allocate "struct bch_dirty_init_state state" directly on stack and
+ avoid the unnecessary dynamic memory allocation for it.
+- Increase &state->started to count a created kernel thread only after
+ it is created successfully.
+- When waiting for all dirty key counting threads to finish, use
+ wait_event() to replace wait_event_interruptible().
+
+With the above changes, the code is clearer, and some potential error
+conditions are avoided.
+
+Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded")
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: stable@vger.kernel.org
+---
+ drivers/md/bcache/writeback.c | 62 ++++++++++++++---------------------
+ 1 file changed, 25 insertions(+), 37 deletions(-)
+
+diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
+index 9ee0005874cd..d24c09490f8e 100644
+--- a/drivers/md/bcache/writeback.c
++++ b/drivers/md/bcache/writeback.c
+@@ -948,10 +948,10 @@ void bch_sectors_dirty_init(struct bcache_device *d)
+ struct btree_iter iter;
+ struct sectors_dirty_init op;
+ struct cache_set *c = d->c;
+- struct bch_dirty_init_state *state;
+- char name[32];
++ struct bch_dirty_init_state state;
+
+ /* Just count root keys if no leaf node */
++ rw_lock(0, c->root, c->root->level);
+ if (c->root->level == 0) {
+ bch_btree_op_init(&op.op, -1);
+ op.inode = d->id;
+@@ -961,54 +961,42 @@ void bch_sectors_dirty_init(struct bcache_device *d)
+ for_each_key_filter(&c->root->keys,
+ k, &iter, bch_ptr_invalid)
+ sectors_dirty_init_fn(&op.op, c->root, k);
++ rw_unlock(0, c->root);
+ return;
+ }
+
+- state = kzalloc(sizeof(struct bch_dirty_init_state), GFP_KERNEL);
+- if (!state) {
+- pr_warn("sectors dirty init failed: cannot allocate memory\n");
+- return;
+- }
+-
+- state->c = c;
+- state->d = d;
+- state->total_threads = bch_btre_dirty_init_thread_nr();
+- state->key_idx = 0;
+- spin_lock_init(&state->idx_lock);
+- atomic_set(&state->started, 0);
+- atomic_set(&state->enough, 0);
+- init_waitqueue_head(&state->wait);
+-
+- for (i = 0; i < state->total_threads; i++) {
+- /* Fetch latest state->enough earlier */
++ state.c = c;
++ state.d = d;
++ state.total_threads = bch_btre_dirty_init_thread_nr();
++ state.key_idx = 0;
++ spin_lock_init(&state.idx_lock);
++ atomic_set(&state.started, 0);
++ atomic_set(&state.enough, 0);
++ init_waitqueue_head(&state.wait);
++
++ for (i = 0; i < state.total_threads; i++) {
++ /* Fetch latest state.enough earlier */
+ smp_mb__before_atomic();
+- if (atomic_read(&state->enough))
++ if (atomic_read(&state.enough))
+ break;
+
+- state->infos[i].state = state;
+- atomic_inc(&state->started);
+- snprintf(name, sizeof(name), "bch_dirty_init[%d]", i);
+-
+- state->infos[i].thread =
+- kthread_run(bch_dirty_init_thread,
+- &state->infos[i],
+- name);
+- if (IS_ERR(state->infos[i].thread)) {
++ state.infos[i].state = &state;
++ state.infos[i].thread =
++ kthread_run(bch_dirty_init_thread, &state.infos[i],
++ "bch_dirtcnt[%d]", i);
++ if (IS_ERR(state.infos[i].thread)) {
+ pr_err("fails to run thread bch_dirty_init[%d]\n", i);
+ for (--i; i >= 0; i--)
+- kthread_stop(state->infos[i].thread);
++ kthread_stop(state.infos[i].thread);
+ goto out;
+ }
++ atomic_inc(&state.started);
+ }
+
+- /*
+- * Must wait for all threads to stop.
+- */
+- wait_event_interruptible(state->wait,
+- atomic_read(&state->started) == 0);
+-
+ out:
+- kfree(state);
++ /* Must wait for all threads to stop. */
++ wait_event(state.wait, atomic_read(&state.started) == 0);
++ rw_unlock(0, c->root);
+ }
+
+ void bch_cached_dev_writeback_init(struct cached_dev *dc)
+--
+2.35.3
+
diff --git a/for-next/0003-bcache-remove-incremental-dirty-sector-counting-for-.patch b/for-next/0003-bcache-remove-incremental-dirty-sector-counting-for-.patch
new file mode 100644
index 0000000..b11b7d4
--- /dev/null
+++ b/for-next/0003-bcache-remove-incremental-dirty-sector-counting-for-.patch
@@ -0,0 +1,138 @@
+From 8ffcbccd25f7f3edd157e9e2aa78e9b158bebb9b Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 21 May 2022 14:46:03 +0800
+Subject: [PATCH 3/4] bcache: remove incremental dirty sector counting for
+ bch_sectors_dirty_init()
+
+After making bch_sectors_dirty_init() multithreaded, the existing
+incremental dirty sector counting in bch_root_node_dirty_init() doesn't
+release btree occupation after iterating 500000 (INIT_KEYS_EACH_TIME)
+bkeys. Because a read lock is added on the btree root node to prevent the
+btree from being split during the dirty sectors counting, other I/O requesters
+have no chance to gain the write lock even after restarting bcache_btree().
+
+That is to say, the incremental dirty sectors counting is incompatible
+with the multithreaded bch_sectors_dirty_init(). We have to choose one and
+drop the other.
+
+In my testing, with 512-byte random writes, I generated 1.2T of dirty data
+and a btree with 400K nodes. With a single thread and incremental dirty
+sectors counting, it takes 30+ minutes to register the backing device.
+And with multithreaded dirty sectors counting, the backing device
+registration can be accomplished within 2 minutes.
+
+The 30+ minutes vs. 2 minutes difference makes me decide to keep the
+multithreaded bch_sectors_dirty_init() and drop the incremental dirty
+sectors counting. This is what this patch does.
+
+But INIT_KEYS_EACH_TIME is kept: in sectors_dirty_init_fn() the CPU
+will be released by cond_resched() after every INIT_KEYS_EACH_TIME keys
+iterated. This avoids the watchdog reporting a bogus soft lockup
+warning.
+
+Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded")
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: stable@vger.kernel.org
+---
+ drivers/md/bcache/writeback.c | 41 +++++++++++------------------------
+ 1 file changed, 13 insertions(+), 28 deletions(-)
+
+diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
+index d24c09490f8e..75b71199800d 100644
+--- a/drivers/md/bcache/writeback.c
++++ b/drivers/md/bcache/writeback.c
+@@ -805,13 +805,11 @@ static int bch_writeback_thread(void *arg)
+
+ /* Init */
+ #define INIT_KEYS_EACH_TIME 500000
+-#define INIT_KEYS_SLEEP_MS 100
+
+ struct sectors_dirty_init {
+ struct btree_op op;
+ unsigned int inode;
+ size_t count;
+- struct bkey start;
+ };
+
+ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
+@@ -827,11 +825,8 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
+ KEY_START(k), KEY_SIZE(k));
+
+ op->count++;
+- if (atomic_read(&b->c->search_inflight) &&
+- !(op->count % INIT_KEYS_EACH_TIME)) {
+- bkey_copy_key(&op->start, k);
+- return -EAGAIN;
+- }
++ if (!(op->count % INIT_KEYS_EACH_TIME))
++ cond_resched();
+
+ return MAP_CONTINUE;
+ }
+@@ -846,24 +841,16 @@ static int bch_root_node_dirty_init(struct cache_set *c,
+ bch_btree_op_init(&op.op, -1);
+ op.inode = d->id;
+ op.count = 0;
+- op.start = KEY(op.inode, 0, 0);
+-
+- do {
+- ret = bcache_btree(map_keys_recurse,
+- k,
+- c->root,
+- &op.op,
+- &op.start,
+- sectors_dirty_init_fn,
+- 0);
+- if (ret == -EAGAIN)
+- schedule_timeout_interruptible(
+- msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
+- else if (ret < 0) {
+- pr_warn("sectors dirty init failed, ret=%d!\n", ret);
+- break;
+- }
+- } while (ret == -EAGAIN);
++
++ ret = bcache_btree(map_keys_recurse,
++ k,
++ c->root,
++ &op.op,
++ &KEY(op.inode, 0, 0),
++ sectors_dirty_init_fn,
++ 0);
++ if (ret < 0)
++ pr_warn("sectors dirty init failed, ret=%d!\n", ret);
+
+ return ret;
+ }
+@@ -907,7 +894,6 @@ static int bch_dirty_init_thread(void *arg)
+ goto out;
+ }
+ skip_nr--;
+- cond_resched();
+ }
+
+ if (p) {
+@@ -917,7 +903,6 @@ static int bch_dirty_init_thread(void *arg)
+
+ p = NULL;
+ prev_idx = cur_idx;
+- cond_resched();
+ }
+
+ out:
+@@ -956,11 +941,11 @@ void bch_sectors_dirty_init(struct bcache_device *d)
+ bch_btree_op_init(&op.op, -1);
+ op.inode = d->id;
+ op.count = 0;
+- op.start = KEY(op.inode, 0, 0);
+
+ for_each_key_filter(&c->root->keys,
+ k, &iter, bch_ptr_invalid)
+ sectors_dirty_init_fn(&op.op, c->root, k);
++
+ rw_unlock(0, c->root);
+ return;
+ }
+--
+2.35.3
+
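The rescheduling discipline the patch above keeps (yield the CPU every INIT_KEYS_EACH_TIME keys instead of dropping and re-taking btree locks) boils down to the following pattern. This is a minimal sketch with made-up demo_* names, assuming the caller holds a lock that permits sleeping:

#include <linux/sched.h>

#define DEMO_BATCH	500000	/* mirrors INIT_KEYS_EACH_TIME */

/*
 * Hypothetical key walker: it never returns -EAGAIN or sleeps on a
 * timeout, it only yields the CPU periodically so the soft-lockup
 * watchdog stays quiet during a long iteration.
 */
static void demo_walk_keys(unsigned long nr_keys)
{
	unsigned long i, count = 0;

	for (i = 0; i < nr_keys; i++) {
		/* ... account one key ... */
		if (!(++count % DEMO_BATCH))
			cond_resched();
	}
}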
diff --git a/for-next/0004-bcache-avoid-journal-no-space-deadlock-by-reserving-.patch b/for-next/0004-bcache-avoid-journal-no-space-deadlock-by-reserving-.patch
new file mode 100644
index 0000000..aabe732
--- /dev/null
+++ b/for-next/0004-bcache-avoid-journal-no-space-deadlock-by-reserving-.patch
@@ -0,0 +1,148 @@
+From 27029e1e8f064bc8541308c807d3ee579d86811d Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 21 May 2022 22:55:46 +0800
+Subject: [PATCH 4/4] bcache: avoid journal no-space deadlock by reserving 1
+ journal bucket
+
+The journal no-space deadlock was reported from time to time. Such a
+deadlock can happen in the following situation.
+
+When all journal buckets are fully filled by active jsets under heavy
+write I/O load, the cache set registration (after a reboot) will load
+all active jsets and insert them into the btree again (which is
+called journal replay). If a journaled bkey is inserted into a btree
+node and results in a btree node split, a new journal request might be
+triggered. For example, if the btree grows one more level after the node
+split, the root node record in the cache device super block will be
+updated by bch_journal_meta() from bch_btree_set_root(). But since there is
+no space in the journal buckets, the journal replay has to wait for a new
+journal bucket to be reclaimed after at least one journal bucket is
+replayed. This is one example of how the journal no-space deadlock happens.
+
+The solution to avoid the deadlock is to reserve 1 journal bucket at
+run time, and only permit the reserved journal bucket to be used during
+the cache set registration procedure for things like journal replay. Then
+the journal space will never be fully filled, and there is no chance for
+the journal no-space deadlock to happen anymore.
+
+This patch adds a new member "bool do_reserve" in struct journal. It is
+initialized to 0 (false) when struct journal is allocated, and set to
+1 (true) by bch_journal_space_reserve() when all initialization is done
+in run_cache_set(). At run time, when journal_reclaim() tries to
+allocate a new journal bucket, free_journal_buckets() is called to check
+whether there are enough free journal buckets to use. If there is only
+1 free journal bucket and journal->do_reserve is 1 (true), the last
+bucket is reserved and free_journal_buckets() will return 0 to indicate
+no free journal bucket. Then journal_reclaim() will give up and try
+again next time to see whether there is a free journal bucket to allocate.
+By this method, there is always 1 journal bucket reserved at run time.
+
+During the cache set registration, journal->do_reserve is 0 (false), so
+the reserved journal bucket can be used to avoid the no-space deadlock.
+
+Reported-by: Nikhil Kshirsagar <nkshirsagar@gmail.com>
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: stable@vger.kernel.org
+---
+ drivers/md/bcache/journal.c | 31 ++++++++++++++++++++++++++-----
+ drivers/md/bcache/journal.h | 2 ++
+ drivers/md/bcache/super.c | 1 +
+ 3 files changed, 29 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index df5347ea450b..e5da469a4235 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -405,6 +405,11 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
+ return ret;
+ }
+
++void bch_journal_space_reserve(struct journal *j)
++{
++ j->do_reserve = true;
++}
++
+ /* Journalling */
+
+ static void btree_flush_write(struct cache_set *c)
+@@ -621,12 +626,30 @@ static void do_journal_discard(struct cache *ca)
+ }
+ }
+
++static unsigned int free_journal_buckets(struct cache_set *c)
++{
++ struct journal *j = &c->journal;
++ struct cache *ca = c->cache;
++ struct journal_device *ja = &c->cache->journal;
++ unsigned int n;
++
++ /* In case njournal_buckets is not power of 2 */
++ if (ja->cur_idx >= ja->discard_idx)
++ n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx;
++ else
++ n = ja->discard_idx - ja->cur_idx;
++
++ if (n > (1 + j->do_reserve))
++ return n - (1 + j->do_reserve);
++
++ return 0;
++}
++
+ static void journal_reclaim(struct cache_set *c)
+ {
+ struct bkey *k = &c->journal.key;
+ struct cache *ca = c->cache;
+ uint64_t last_seq;
+- unsigned int next;
+ struct journal_device *ja = &ca->journal;
+ atomic_t p __maybe_unused;
+
+@@ -649,12 +672,10 @@ static void journal_reclaim(struct cache_set *c)
+ if (c->journal.blocks_free)
+ goto out;
+
+- next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
+- /* No space available on this device */
+- if (next == ja->discard_idx)
++ if (!free_journal_buckets(c))
+ goto out;
+
+- ja->cur_idx = next;
++ ja->cur_idx = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
+ k->ptr[0] = MAKE_PTR(0,
+ bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
+ ca->sb.nr_this_dev);
+diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
+index f2ea34d5f431..cd316b4a1e95 100644
+--- a/drivers/md/bcache/journal.h
++++ b/drivers/md/bcache/journal.h
+@@ -105,6 +105,7 @@ struct journal {
+ spinlock_t lock;
+ spinlock_t flush_write_lock;
+ bool btree_flushing;
++ bool do_reserve;
+ /* used when waiting because the journal was full */
+ struct closure_waitlist wait;
+ struct closure io;
+@@ -182,5 +183,6 @@ int bch_journal_replay(struct cache_set *c, struct list_head *list);
+
+ void bch_journal_free(struct cache_set *c);
+ int bch_journal_alloc(struct cache_set *c);
++void bch_journal_space_reserve(struct journal *j);
+
+ #endif /* _BCACHE_JOURNAL_H */
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index bf3de149d3c9..2bb55278d22d 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -2128,6 +2128,7 @@ static int run_cache_set(struct cache_set *c)
+
+ flash_devs_run(c);
+
++ bch_journal_space_reserve(&c->journal);
+ set_bit(CACHE_SET_RUNNING, &c->flags);
+ return 0;
+ err:
+--
+2.35.3
+
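To make the ring arithmetic of free_journal_buckets() in the patch above concrete, here is a hedged stand-alone model in user-space C together with a worked example. The function name and the numbers are made up for illustration only:

#include <stdbool.h>
#include <stdio.h>

/*
 * Stand-alone model of free_journal_buckets(): cur_idx chases discard_idx
 * around a ring of njournal_buckets slots, the bucket currently in use is
 * always held back, and one extra bucket is held back once do_reserve is set.
 */
static unsigned int model_free_journal_buckets(unsigned int njournal_buckets,
						unsigned int cur_idx,
						unsigned int discard_idx,
						bool do_reserve)
{
	unsigned int n;

	/* In case njournal_buckets is not a power of 2 */
	if (cur_idx >= discard_idx)
		n = njournal_buckets + discard_idx - cur_idx;
	else
		n = discard_idx - cur_idx;

	if (n > (1 + do_reserve))
		return n - (1 + do_reserve);

	return 0;
}

int main(void)
{
	/*
	 * 8 buckets, cur_idx = 6, discard_idx = 2: n = 8 + 2 - 6 = 4.
	 * Registration (do_reserve == false): 4 - 1 = 3 usable buckets.
	 * Run time (do_reserve == true):      4 - 2 = 2 usable buckets.
	 */
	printf("%u\n", model_free_journal_buckets(8, 6, 2, false)); /* 3 */
	printf("%u\n", model_free_journal_buckets(8, 6, 2, true));  /* 2 */
	return 0;
}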
diff --git a/for-next/nvmpg-bcache-btree/draft/0001-bcache-add-initial-data-structures-for-nvm-pages.patch b/for-next/nvmpg-bcache-btree/draft/0001-bcache-add-initial-data-structures-for-nvm-pages.patch
new file mode 100644
index 0000000..fba652d
--- /dev/null
+++ b/for-next/nvmpg-bcache-btree/draft/0001-bcache-add-initial-data-structures-for-nvm-pages.patch
@@ -0,0 +1,343 @@
+From d5ca176bc66727740baa4c80ba1349ba25dc95f7 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 26 Jul 2021 00:26:28 +0800
+Subject: [PATCH 01/13] bcache: add initial data structures for nvm pages
+
+This patch initializes the prototype data structures for nvm pages
+allocator,
+
+- struct bch_nvmpg_sb
+ This is the super block allocated on each nvdimm namespace for the nvm
+pages allocator. A nvdimm pages allocator set may have multiple name-
+spaces, bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this
+namespace belongs to.
+
+- struct bch_nvmpg_header
+ This is a table for all heads of all allocation record lists. An allo-
+cation record list traces all page(s) allocated from nvdimm namespace(s)
+to a specific requester (identified by uuid). After system reboot, a
+requester can retrieve all previously allocated nvdimm pages from its
+record list by a pre-defined uuid.
+
+- struct bch_nvmpg_head
+ This is a head of an allocation record list. Each nvdimm pages
+requester (typically it's a driver) has and only has one allocation
+record list, and an allocated nvdimm page only belongs to a specific
+allocation record list. Member uuid[] will be set as the requester's
+uuid, e.g. for bcache it is the cache set uuid. Member label is not
+mandatory, it is a human-readable string for debug purpose. The nvm
+offset format pointers recs_offset[] point to the location of actual
+allocator record lists on each namespace of the nvdimm pages allocator
+set. Each per namespace record list is represented by the following
+struct bch_nvmpg_recs.
+
+- struct bch_nvmpg_recs
+ This structure represents a requester's allocation record list. Member
+uuid is same value as the uuid of its corresponding struct
+bch_nvmpg_head. Member recs[] is a table of struct bch_nvmpg_rec
+objects to trace all allocated nvdimm pages. If the table recs[] is
+full, the nvmpg format offset is a pointer to the next struct
+bch_nvmpg_recs object, where the nvm pages allocator will look for an
+available free allocation record. All the linked struct bch_nvmpg_recs
+objects compose a requester's allocation record list which is headed by
+the above struct bch_nvmpg_head.
+
+- struct bch_nvmpg_rec
+ This structure records a range of allocated nvdimm pages. Member pgoff
+is offset in unit of page size of this allocation range. Member order
+indicates size of the allocation range by (1 << order) in unit of page
+size. Because the nvdimm pages allocator set may have multiple nvdimm
+namespaces, member ns_id is used to identify which namespace the pgoff
+belongs to.
+ - Bits 0 - 51: pgoff - the page offset of the allocated pages.
+ - Bits 52 - 57: order - the allocated size is page_size * (2 ^ order).
+ - Bits 58 - 60: ns_id - identifies which namespace the pages stay on.
+ - Bits 61 - 63: reserved.
+Since each allocated range of nvm pages is a power of 2 in size, using 6
+bits to represent the order gives a maximum of (1 << ((1 << 6) - 1)) *
+PAGE_SIZE. That is a 76-bit wide range size in bytes for a 4KB page
+size, which is large enough currently.
+
+All the structure members having an _offset suffix are in a special format,
+e.g. bch_nvmpg_sb.{sb_offset, pages_offset, set_header_offset},
+bch_nvmpg_head.recs_offset, bch_nvmpg_recs.{head_offset, next_offset}.
+The offset value is 64-bit: the most significant 3 bits are used to
+identify which namespace this offset belongs to, and the remaining 61 bits
+are the actual offset inside the namespace. Following patches will have
+helper routines to do the conversion between memory pointer and offset.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Ying Huang <ying.huang@intel.com>
+---
+ drivers/md/bcache/nvmpg_format.h | 253 +++++++++++++++++++++++++++++++
+ 1 file changed, 253 insertions(+)
+ create mode 100644 drivers/md/bcache/nvmpg_format.h
+
+diff --git a/drivers/md/bcache/nvmpg_format.h b/drivers/md/bcache/nvmpg_format.h
+new file mode 100644
+index 000000000000..e9eb6371fd78
+--- /dev/null
++++ b/drivers/md/bcache/nvmpg_format.h
+@@ -0,0 +1,253 @@
++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
++
++#ifndef _NVMPG_FORMAT_H
++#define _NVMPG_FORMAT_H
++
++/*
++ * Bcache on NVDIMM data structures
++ */
++
++/*
++ * - struct bch_nvmpg_sb
++ * This is the super block allocated on each nvdimm namespace for the nvm
++ * pages allocator. A nvdimm pages allocator set may have multiple namespaces,
++ * bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this name space
++ * belongs to.
++ *
++ * - struct bch_nvmpg_header
++ * This is a table for all heads of all allocation record lists. An allo-
++ * cation record list traces all page(s) allocated from nvdimm namespace(s) to
++ * a specific requester (identified by uuid). After system reboot, a requester
++ * can retrieve all previously allocated nvdimm pages from its record list by a
++ * pre-defined uuid.
++ *
++ * - struct bch_nvmpg_head
++ * This is a head of an allocation record list. Each nvdimm pages requester
++ * (typically it's a driver) has and only has one allocation record list, and
++ * an allocated nvdimm page only bedlones to a specific allocation record list.
++ * Member uuid[] will be set as the requester's uuid, e.g. for bcache it is the
++ * cache set uuid. Member label is not mandatory, it is a human-readable string
++ * for debug purpose. The nvm offset format pointers recs_offset[] point to the
++ * location of actual allocator record lists on each name space of the nvdimm
++ * pages allocator set. Each per name space record list is represented by the
++ * following struct bch_nvmpg_recs.
++ *
++ * - struct bch_nvmpg_recs
++ * This structure represents a requester's allocation record list. Member uuid
++ * is same value as the uuid of its corresponding struct bch_nvmpg_head. Member
++ * recs[] is a table of struct bch_pgalloc_rec objects to trace all allocated
++ * nvmdimm pages. If the table recs[] is full, the nvmpg format offset is a
++ * pointer points to the next struct bch_nvmpg_recs object, nvm pages allocator
++ * will look for available free allocation record there. All the linked
++ * struct bch_nvmpg_recs objects compose a requester's alloction record list
++ * which is headed by the above struct bch_nvmpg_head.
++ *
++ * - struct bch_nvmpg_rec
++ * This structure records a range of allocated nvdimm pages. Member pgoff is
++ * offset in unit of page size of this allocation range. Member order indicates
++ * size of the allocation range by (1 << order) in unit of page size. Because
++ * the nvdimm pages allocator set may have multiple nvdimm name spaces, member
++ * ns_id is used to identify which name space the pgoff belongs to.
++ *
++ * All allocation record lists are stored on the first initialized nvdimm name-
++ * space (ns_id 0). The meta data default layout of nvm pages allocator on
++ * namespace 0 is,
++ *
++ * 0 +---------------------------------+
++ * | |
++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET
++ * | bch_nvmpg_sb |
++ * 8KB +---------------------------------+ <-- BCH_NVMPG_RECLIST_HEAD_OFFSET
++ * | bch_nvmpg_header |
++ * | |
++ * 16KB +---------------------------------+ <-- BCH_NVMPG_SYSRECS_OFFSET
++ * | bch_nvmpg_recs |
++ * | (nvm pages internal usage) |
++ * 24KB +---------------------------------+
++ * | |
++ * | |
++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START
++ * | allocable nvm pages |
++ * | for buddy allocator |
++ * end +---------------------------------+
++ *
++ *
++ *
++ * Meta data default layout on rested nvdimm namespaces,
++ *
++ * 0 +---------------------------------+
++ * | |
++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET
++ * | bch_nvmpg_sb |
++ * 8KB +---------------------------------+
++ * | |
++ * | |
++ * | |
++ * | |
++ * | |
++ * | |
++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START
++ * | allocable nvm pages |
++ * | for buddy allocator |
++ * end +---------------------------------+
++ *
++ *
++ * - The nvmpg offset format pointer
++ * All member names ending with _offset in this header are nvmpg offset
++ * format pointer. The offset format is,
++ * [highest 3 bits: ns_id]
++ * [rested 61 bits: offset in No. ns_id namespace]
++ *
++ * The above offset is byte unit, the procedure to reference a nvmpg offset
++ * format pointer is,
++ * 1) Identify the namespace related in-memory structure by ns_id from the
++ * highest 3 bits of offset value.
++ * 2) Get the DAX mapping base address from the in-memory structure.
++ * 3) Calculate the actual memory address on nvdimm by plusing the DAX base
++ * address with offset value in rested low 61 bits.
++ * All related in-memory structure and conversion routines don't belong to
++ * user space api, they are defined by nvm-pages allocator code in
++ * drivers/md/bcache/nvm-pages.{c,h}
++ *
++ */
++
++#include <linux/types.h>
++
++/* In sectors */
++#define BCH_NVMPG_SB_OFFSET 4096
++#define BCH_NVMPG_START (16 << 20)
++
++#define BCH_NVMPG_LBL_SIZE 32
++#define BCH_NVMPG_NS_MAX 8
++
++#define BCH_NVMPG_RECLIST_HEAD_OFFSET (8<<10)
++#define BCH_NVMPG_SYSRECS_OFFSET (16<<10)
++
++#define BCH_NVMPG_SB_VERSION 0
++#define BCH_NVMPG_SB_VERSION_MAX 0
++
++static const __u8 bch_nvmpg_magic[] = {
++ 0x17, 0xbd, 0x53, 0x7f, 0x1b, 0x23, 0xd6, 0x83,
++ 0x46, 0xa4, 0xf8, 0x28, 0x17, 0xda, 0xec, 0xa9 };
++static const __u8 bch_nvmpg_recs_magic[] = {
++ 0x39, 0x25, 0x3f, 0xf7, 0x27, 0x17, 0xd0, 0xb9,
++ 0x10, 0xe6, 0xd2, 0xda, 0x38, 0x68, 0x26, 0xae };
++
++/* takes 64bit width */
++struct bch_nvmpg_rec {
++ union {
++ struct {
++ __u64 pgoff:52;
++ __u64 order:6;
++ __u64 ns_id:3;
++ __u64 reserved:3;
++ };
++ __u64 _v;
++ };
++};
++
++struct bch_nvmpg_recs {
++ union {
++ struct {
++ /*
++ * A nvmpg offset format pointer to
++ * struct bch_nvmpg_head
++ */
++ __u64 head_offset;
++ /*
++ * A nvmpg offset format pointer to
++ * struct bch_nvm_pgalloc_recs which contains
++ * the next recs[] array.
++ */
++ __u64 next_offset;
++ __u8 magic[16];
++ __u8 uuid[16];
++ __u32 size;
++ __u32 used;
++ __u64 _pad[4];
++ struct bch_nvmpg_rec recs[];
++ };
++ __u8 pad[8192];
++ };
++};
++
++#define BCH_NVMPG_MAX_RECS \
++ ((sizeof(struct bch_nvmpg_recs) - \
++ offsetof(struct bch_nvmpg_recs, recs)) / \
++ sizeof(struct bch_nvmpg_rec))
++
++#define BCH_NVMPG_HD_STAT_FREE 0x0
++#define BCH_NVMPG_HD_STAT_ALLOC 0x1
++struct bch_nvmpg_head {
++ __u8 uuid[16];
++ __u8 label[BCH_NVMPG_LBL_SIZE];
++ __u32 state;
++ __u32 flags;
++ /*
++ * Array of offset values from the nvmpg offset format
++ * pointers, each of the pointer points to a per-namespace
++ * struct bch_nvmpg_recs.
++ */
++ __u64 recs_offset[BCH_NVMPG_NS_MAX];
++};
++
++/* heads[0] is always for nvm_pages internal usage */
++struct bch_nvmpg_set_header {
++ union {
++ struct {
++ __u32 size;
++ __u32 used;
++ __u64 _pad[4];
++ struct bch_nvmpg_head heads[];
++ };
++ __u8 pad[8192];
++ };
++};
++
++#define BCH_NVMPG_MAX_HEADS \
++ ((sizeof(struct bch_nvmpg_set_header) - \
++ offsetof(struct bch_nvmpg_set_header, heads)) / \
++ sizeof(struct bch_nvmpg_head))
++
++/* The on-media bit order is local CPU order */
++struct bch_nvmpg_sb {
++ __u64 csum;
++ __u64 sb_offset;
++ __u64 ns_start;
++ __u64 version;
++ __u8 magic[16];
++ __u8 uuid[16];
++ __u32 page_size;
++ __u32 total_ns;
++ __u32 this_ns;
++ union {
++ __u8 set_uuid[16];
++ __u64 set_magic;
++ };
++
++ __u64 flags;
++ __u64 seq;
++
++ __u64 feature_compat;
++ __u64 feature_incompat;
++ __u64 feature_ro_compat;
++
++ /* For allocable nvm pages from buddy systems */
++ __u64 pages_offset;
++ __u64 pages_total;
++
++ __u64 pad[8];
++
++ /*
++ * A nvmpg offset format pointer, it points
++ * to struct bch_nvmpg_set_header which is
++ * stored only on the first name space.
++ */
++ __u64 set_header_offset;
++
++ /* Just for csum_set() */
++ __u32 keys;
++ __u64 d[0];
++};
++
++#endif /* _NVMPG_FORMAT_H */
+--
+2.31.1
+
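The 3-bit/61-bit offset encoding and the capacity claim in the commit message above can be checked with a small stand-alone sketch. The DEMO_* names are made up for this illustration; the real conversion helpers arrive in a later patch of this series:

#include <stdint.h>
#include <stdio.h>

/* [63:61] ns_id, [60:0] byte offset inside that namespace */
#define DEMO_OFFSET_BITS	61
#define DEMO_OFFSET_MASK	((1ULL << DEMO_OFFSET_BITS) - 1)

static uint64_t demo_encode(uint64_t ns_id, uint64_t offset)
{
	return (ns_id << DEMO_OFFSET_BITS) | (offset & DEMO_OFFSET_MASK);
}

int main(void)
{
	/* e.g. the superblock of namespace 2 at byte offset 4096 */
	uint64_t p = demo_encode(2, 4096);

	printf("ns_id=%llu offset=%llu\n",
	       (unsigned long long)(p >> DEMO_OFFSET_BITS),
	       (unsigned long long)(p & DEMO_OFFSET_MASK));

	/*
	 * Capacity check: a bch_nvmpg_rec with the 6-bit order at its
	 * maximum (63) covers 2^63 pages; with 4KB pages that is 2^75
	 * bytes, i.e. the "76 bits width range size" mentioned above.
	 */
	return 0;
}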
diff --git a/for-next/nvmpg-bcache-btree/draft/0002-bcache-initialize-the-nvm-pages-allocator.patch b/for-next/nvmpg-bcache-btree/draft/0002-bcache-initialize-the-nvm-pages-allocator.patch
new file mode 100644
index 0000000..485a6e0
--- /dev/null
+++ b/for-next/nvmpg-bcache-btree/draft/0002-bcache-initialize-the-nvm-pages-allocator.patch
@@ -0,0 +1,543 @@
+From d0a096b054485476b6788ae2a071c036dcffc248 Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Mon, 26 Jul 2021 10:33:30 +0800
+Subject: [PATCH 02/13] bcache: initialize the nvm pages allocator
+
+This patch defines the prototype data structures in memory and
+initializes the nvm pages allocator.
+
+The nvm address space which is managed by this allocator can consist of
+many nvm namespaces, and several namespaces can compose one nvm set,
+like a cache set. For this initial implementation, only one set can be
+supported.
+
+The users of this nvm pages allocator need to call bch_register_namespace()
+to register the nvdimm device (like /dev/pmemX) into this allocator as
+an instance of struct bch_nvmpg_ns.
+
+Reported-by: Randy Dunlap <rdunlap@infradead.org>
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/Kconfig | 10 ++
+ drivers/md/bcache/Makefile | 1 +
+ drivers/md/bcache/nvmpg.c | 341 +++++++++++++++++++++++++++++++++++++
+ drivers/md/bcache/nvmpg.h | 97 +++++++++++
+ drivers/md/bcache/super.c | 3 +
+ 5 files changed, 452 insertions(+)
+ create mode 100644 drivers/md/bcache/nvmpg.c
+ create mode 100644 drivers/md/bcache/nvmpg.h
+
+diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
+index cf3e8096942a..4a7c13e882bb 100644
+--- a/drivers/md/bcache/Kconfig
++++ b/drivers/md/bcache/Kconfig
+@@ -36,3 +36,13 @@ config BCACHE_ASYNC_REGISTRATION
+ device path into this file will returns immediately and the real
+ registration work is handled in kernel work queue in asynchronous
+ way.
++
++config BCACHE_NVM_PAGES
++ bool "NVDIMM support for bcache (EXPERIMENTAL)"
++ depends on BCACHE
++ depends on 64BIT
++ depends on LIBNVDIMM
++ depends on DAX
++ help
++ Allocate/release NV-memory pages for bcache and provide allocated pages
++ for each requestor after system reboot.
+diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
+index 5b87e59676b8..276b33be5ad5 100644
+--- a/drivers/md/bcache/Makefile
++++ b/drivers/md/bcache/Makefile
+@@ -5,3 +5,4 @@ obj-$(CONFIG_BCACHE) += bcache.o
+ bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
+ io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+ util.o writeback.o features.o
++bcache-$(CONFIG_BCACHE_NVM_PAGES) += nvmpg.o
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+new file mode 100644
+index 000000000000..be006a91e8bb
+--- /dev/null
++++ b/drivers/md/bcache/nvmpg.c
+@@ -0,0 +1,341 @@
++// SPDX-License-Identifier: GPL-2.0-only
++/*
++ * Nvdimm page-buddy allocator
++ *
++ * Copyright (c) 2021, Intel Corporation.
++ * Copyright (c) 2021, Qiaowei Ren <qiaowei.ren@intel.com>.
++ * Copyright (c) 2021, Jianpeng Ma <jianpeng.ma@intel.com>.
++ */
++
++#include "bcache.h"
++#include "nvmpg.h"
++
++#include <linux/slab.h>
++#include <linux/list.h>
++#include <linux/mutex.h>
++#include <linux/dax.h>
++#include <linux/pfn_t.h>
++#include <linux/libnvdimm.h>
++#include <linux/mm_types.h>
++#include <linux/err.h>
++#include <linux/pagemap.h>
++#include <linux/bitmap.h>
++#include <linux/blkdev.h>
++
++struct bch_nvmpg_set *global_nvmpg_set;
++
++void *bch_nvmpg_offset_to_ptr(unsigned long offset)
++{
++ int ns_id = BCH_NVMPG_GET_NS_ID(offset);
++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[ns_id];
++
++ if (offset == 0)
++ return NULL;
++
++ ns_id = BCH_NVMPG_GET_NS_ID(offset);
++ ns = global_nvmpg_set->ns_tbl[ns_id];
++
++ if (ns)
++ return (void *)(ns->base_addr + BCH_NVMPG_GET_OFFSET(offset));
++
++ pr_err("Invalid ns_id %u\n", ns_id);
++ return NULL;
++}
++
++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr)
++{
++ int ns_id = ns->ns_id;
++ unsigned long offset = (unsigned long)(ptr - ns->base_addr);
++
++ return BCH_NVMPG_OFFSET(ns_id, offset);
++}
++
++static void release_ns_tbl(struct bch_nvmpg_set *set)
++{
++ int i;
++ struct bch_nvmpg_ns *ns;
++
++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
++ ns = set->ns_tbl[i];
++ if (ns) {
++ fs_put_dax(ns->dax_dev);
++ blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC);
++ set->ns_tbl[i] = NULL;
++ set->attached_ns--;
++ kfree(ns);
++ }
++ }
++
++ if (set->attached_ns)
++ pr_err("unexpected attached_ns: %u\n", set->attached_ns);
++}
++
++static void release_nvmpg_set(struct bch_nvmpg_set *set)
++{
++ release_ns_tbl(set);
++ kfree(set);
++}
++
++/* Namespace 0 contains all meta data of the nvmpg allocation set */
++static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns)
++{
++ struct bch_nvmpg_set_header *set_header;
++
++ if (ns->ns_id != 0) {
++ pr_err("unexpected ns_id %u for first nvmpg namespace.\n",
++ ns->ns_id);
++ return -EINVAL;
++ }
++
++ set_header = bch_nvmpg_offset_to_ptr(ns->sb->set_header_offset);
++
++ mutex_lock(&global_nvmpg_set->lock);
++ global_nvmpg_set->set_header = set_header;
++ global_nvmpg_set->heads_size = set_header->size;
++ global_nvmpg_set->heads_used = set_header->used;
++ mutex_unlock(&global_nvmpg_set->lock);
++
++ return 0;
++}
++
++static int attach_nvmpg_set(struct bch_nvmpg_ns *ns)
++{
++ struct bch_nvmpg_sb *sb = ns->sb;
++ int rc = 0;
++
++ mutex_lock(&global_nvmpg_set->lock);
++
++ if (global_nvmpg_set->ns_tbl[sb->this_ns]) {
++ pr_err("ns_id %u already attached.\n", ns->ns_id);
++ rc = -EEXIST;
++ goto unlock;
++ }
++
++ if (ns->ns_id != 0) {
++ pr_err("unexpected ns_id %u for first namespace.\n", ns->ns_id);
++ rc = -EINVAL;
++ goto unlock;
++ }
++
++ if (global_nvmpg_set->attached_ns > 0) {
++ pr_err("multiple namespace attaching not supported yet\n");
++ rc = -EOPNOTSUPP;
++ goto unlock;
++ }
++
++ if ((global_nvmpg_set->attached_ns + 1) > sb->total_ns) {
++ pr_err("namespace counters error: attached %u > total %u\n",
++ global_nvmpg_set->attached_ns,
++ global_nvmpg_set->total_ns);
++ rc = -EINVAL;
++ goto unlock;
++ }
++
++ memcpy(global_nvmpg_set->set_uuid, sb->set_uuid, 16);
++ global_nvmpg_set->ns_tbl[sb->this_ns] = ns;
++ global_nvmpg_set->attached_ns++;
++ global_nvmpg_set->total_ns = sb->total_ns;
++
++unlock:
++ mutex_unlock(&global_nvmpg_set->lock);
++ return rc;
++}
++
++static int read_nvdimm_meta_super(struct block_device *bdev,
++ struct bch_nvmpg_ns *ns)
++{
++ struct page *page;
++ struct bch_nvmpg_sb *sb;
++ uint64_t expected_csum = 0;
++ int r;
++
++ page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
++ BCH_NVMPG_SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
++
++ if (IS_ERR(page))
++ return -EIO;
++
++ sb = (struct bch_nvmpg_sb *)
++ (page_address(page) + offset_in_page(BCH_NVMPG_SB_OFFSET));
++
++ r = -EINVAL;
++ expected_csum = csum_set(sb);
++ if (expected_csum != sb->csum) {
++ pr_info("csum is not match with expected one\n");
++ goto put_page;
++ }
++
++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) {
++ pr_info("invalid bch_nvmpg_magic\n");
++ goto put_page;
++ }
++
++ if (sb->sb_offset !=
++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_SB_OFFSET)) {
++ pr_info("invalid superblock offset 0x%llx\n", sb->sb_offset);
++ goto put_page;
++ }
++
++ r = -EOPNOTSUPP;
++ if (sb->total_ns != 1) {
++ pr_info("multiple name space not supported yet.\n");
++ goto put_page;
++ }
++
++
++ r = 0;
++ /* Necessary for DAX mapping */
++ ns->page_size = sb->page_size;
++ ns->pages_total = sb->pages_total;
++
++put_page:
++ put_page(page);
++ return r;
++}
++
++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
++{
++ struct bch_nvmpg_ns *ns = NULL;
++ struct bch_nvmpg_sb *sb = NULL;
++ char buf[BDEVNAME_SIZE];
++ struct block_device *bdev;
++ pgoff_t pgoff;
++ int id, err;
++ char *path;
++ long dax_ret = 0;
++
++ path = kstrndup(dev_path, 512, GFP_KERNEL);
++ if (!path) {
++ pr_err("kstrndup failed\n");
++ return ERR_PTR(-ENOMEM);
++ }
++
++ bdev = blkdev_get_by_path(strim(path),
++ FMODE_READ|FMODE_WRITE|FMODE_EXEC,
++ global_nvmpg_set);
++ if (IS_ERR(bdev)) {
++ pr_err("get %s error: %ld\n", dev_path, PTR_ERR(bdev));
++ kfree(path);
++ return ERR_PTR(PTR_ERR(bdev));
++ }
++
++ err = -ENOMEM;
++ ns = kzalloc(sizeof(struct bch_nvmpg_ns), GFP_KERNEL);
++ if (!ns)
++ goto bdput;
++
++ err = -EIO;
++ if (read_nvdimm_meta_super(bdev, ns)) {
++ pr_err("%s read nvdimm meta super block failed.\n",
++ bdevname(bdev, buf));
++ goto free_ns;
++ }
++
++ err = -EOPNOTSUPP;
++ ns->dax_dev = fs_dax_get_by_bdev(bdev);
++ if (!ns->dax_dev) {
++ pr_err("can't get dax device by %s\n", bdevname(bdev, buf));
++ goto free_ns;
++ }
++
++ if (!dax_supported(ns->dax_dev, bdev, ns->page_size, 0,
++ bdev_nr_sectors(bdev))) {
++ pr_err("%s don't support DAX\n", bdevname(bdev, buf));
++ goto free_ns;
++ }
++
++ err = -EINVAL;
++ if (bdev_dax_pgoff(bdev, 0, ns->page_size, &pgoff)) {
++ pr_err("invalid offset of %s\n", bdevname(bdev, buf));
++ goto free_ns;
++ }
++
++ err = -EINVAL;
++ id = dax_read_lock();
++ dax_ret = dax_direct_access(ns->dax_dev, pgoff, ns->pages_total,
++ &ns->base_addr, &ns->start_pfn);
++ if (dax_ret <= 0) {
++ pr_err("dax_direct_access error\n");
++ dax_read_unlock(id);
++ goto free_ns;
++ }
++
++ if (dax_ret < ns->pages_total) {
++ pr_warn("mapped range %ld is less than ns->pages_total %lu\n",
++ dax_ret, ns->pages_total);
++ }
++ dax_read_unlock(id);
++
++ sb = (struct bch_nvmpg_sb *)(ns->base_addr + BCH_NVMPG_SB_OFFSET);
++
++ err = -EINVAL;
++ /* Check magic again to make sure DAX mapping is correct */
++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) {
++ pr_err("invalid bch_nvmpg_magic after DAX mapping\n");
++ goto free_ns;
++ }
++
++ if ((global_nvmpg_set->attached_ns > 0) &&
++ memcmp(sb->set_uuid, global_nvmpg_set->set_uuid, 16)) {
++ pr_err("set uuid does not match with ns_id %u\n", ns->ns_id);
++ goto free_ns;
++ }
++
++ if (sb->set_header_offset !=
++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_RECLIST_HEAD_OFFSET)) {
++ pr_err("Invalid header offset: this_ns %u, ns_id %llu, offset 0x%llx\n",
++ sb->this_ns,
++ BCH_NVMPG_GET_NS_ID(sb->set_header_offset),
++ BCH_NVMPG_GET_OFFSET(sb->set_header_offset));
++ goto free_ns;
++ }
++
++ ns->page_size = sb->page_size;
++ ns->pages_offset = sb->pages_offset;
++ ns->pages_total = sb->pages_total;
++ ns->sb = sb;
++ ns->free = 0;
++ ns->bdev = bdev;
++ ns->set = global_nvmpg_set;
++
++ err = attach_nvmpg_set(ns);
++ if (err < 0)
++ goto free_ns;
++
++ mutex_init(&ns->lock);
++
++ err = init_nvmpg_set_header(ns);
++ if (err < 0)
++ goto free_ns;
++
++ kfree(path);
++ return ns;
++
++free_ns:
++ fs_put_dax(ns->dax_dev);
++ kfree(ns);
++bdput:
++ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC);
++ kfree(path);
++ return ERR_PTR(err);
++}
++EXPORT_SYMBOL_GPL(bch_register_namespace);
++
++int __init bch_nvmpg_init(void)
++{
++ global_nvmpg_set = kzalloc(sizeof(*global_nvmpg_set), GFP_KERNEL);
++ if (!global_nvmpg_set)
++ return -ENOMEM;
++
++ global_nvmpg_set->total_ns = 0;
++ mutex_init(&global_nvmpg_set->lock);
++
++ pr_info("bcache nvm init\n");
++ return 0;
++}
++
++void bch_nvmpg_exit(void)
++{
++ release_nvmpg_set(global_nvmpg_set);
++ pr_info("bcache nvm exit\n");
++}
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+new file mode 100644
+index 000000000000..698c890b2d15
+--- /dev/null
++++ b/drivers/md/bcache/nvmpg.h
+@@ -0,0 +1,97 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++
++#ifndef _BCACHE_NVM_PAGES_H
++#define _BCACHE_NVM_PAGES_H
++
++#include <linux/libnvdimm.h>
++
++#include "nvmpg_format.h"
++
++/*
++ * Bcache NVDIMM in memory data structures
++ */
++
++/*
++ * The following three structures in memory records which page(s) allocated
++ * to which owner. After reboot from power failure, they will be initialized
++ * based on nvm pages superblock in NVDIMM device.
++ */
++struct bch_nvmpg_ns {
++ struct bch_nvmpg_sb *sb;
++ void *base_addr;
++
++ unsigned char uuid[16];
++ int ns_id;
++ unsigned int page_size;
++ unsigned long free;
++ unsigned long pages_offset;
++ unsigned long pages_total;
++ pfn_t start_pfn;
++
++ struct dax_device *dax_dev;
++ struct block_device *bdev;
++ struct bch_nvmpg_set *set;
++
++ struct mutex lock;
++};
++
++/*
++ * A set of namespaces. Currently only one set can be supported.
++ */
++struct bch_nvmpg_set {
++ unsigned char set_uuid[16];
++
++ int heads_size;
++ int heads_used;
++ struct bch_nvmpg_set_header *set_header;
++
++ struct bch_nvmpg_ns *ns_tbl[BCH_NVMPG_NS_MAX];
++ int total_ns;
++ int attached_ns;
++
++ struct mutex lock;
++};
++
++#define BCH_NVMPG_NS_ID_BITS 3
++#define BCH_NVMPG_OFFSET_BITS 61
++#define BCH_NVMPG_NS_ID_MASK ((1UL<<BCH_NVMPG_NS_ID_BITS) - 1)
++#define BCH_NVMPG_OFFSET_MASK ((1UL<<BCH_NVMPG_OFFSET_BITS) - 1)
++
++#define BCH_NVMPG_GET_NS_ID(offset) \
++ (((offset) >> BCH_NVMPG_OFFSET_BITS) & BCH_NVMPG_NS_ID_MASK)
++
++#define BCH_NVMPG_GET_OFFSET(offset) ((offset) & BCH_NVMPG_OFFSET_MASK)
++
++#define BCH_NVMPG_OFFSET(ns_id, offset) \
++ ((((ns_id) & BCH_NVMPG_NS_ID_MASK) << BCH_NVMPG_OFFSET_BITS) | \
++ ((offset) & BCH_NVMPG_OFFSET_MASK))
++
++/* Indicate which field in bch_nvmpg_sb to be updated */
++#define BCH_NVMPG_TOTAL_NS 0 /* total_ns */
++
++void *bch_nvmpg_offset_to_ptr(unsigned long offset);
++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr);
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++
++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path);
++int bch_nvmpg_init(void);
++void bch_nvmpg_exit(void);
++
++#else
++
++static inline struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
++{
++ return NULL;
++}
++
++static inline int bch_nvmpg_init(void)
++{
++ return 0;
++}
++
++static inline void bch_nvmpg_exit(void) { }
++
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
++#endif /* _BCACHE_NVM_PAGES_H */
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index dc35f6e1d8d3..841d08e50191 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -14,6 +14,7 @@
+ #include "request.h"
+ #include "writeback.h"
+ #include "features.h"
++#include "nvmpg.h"
+
+ #include <linux/blkdev.h>
+ #include <linux/pagemap.h>
+@@ -2811,6 +2812,7 @@ static void bcache_exit(void)
+ {
+ bch_debug_exit();
+ bch_request_exit();
++ bch_nvmpg_exit();
+ if (bcache_kobj)
+ kobject_put(bcache_kobj);
+ if (bcache_wq)
+@@ -2909,6 +2911,7 @@ static int __init bcache_init(void)
+
+ bch_debug_init();
+ closure_debug_init();
++ bch_nvmpg_init();
+
+ bcache_is_reboot = false;
+
+--
+2.31.1
+
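A hedged usage sketch of the API introduced by the patch above; the caller function is hypothetical and error handling is trimmed to the essentials:

#include <linux/err.h>
#include "nvmpg.h"

/*
 * Hypothetical caller, e.g. somewhere in a registration path: attach a
 * pmem namespace and translate an on-media nvmpg offset into a pointer.
 */
static int demo_attach(const char *dev_path, unsigned long offset)
{
	struct bch_nvmpg_ns *ns;
	void *p;

	ns = bch_register_namespace(dev_path);	/* e.g. "/dev/pmem0" */
	if (IS_ERR_OR_NULL(ns))
		return ns ? PTR_ERR(ns) : -EOPNOTSUPP;

	/* nvmpg offset -> virtual address inside the DAX mapping ... */
	p = bch_nvmpg_offset_to_ptr(offset);

	/* ... and back again, relative to the owning namespace */
	if (p && bch_nvmpg_ptr_to_offset(ns, p) != offset)
		return -EINVAL;

	return 0;
}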
diff --git a/for-next/nvmpg-bcache-btree/draft/0003-bcache-initialization-of-the-buddy.patch b/for-next/nvmpg-bcache-btree/draft/0003-bcache-initialization-of-the-buddy.patch
new file mode 100644
index 0000000..395f285
--- /dev/null
+++ b/for-next/nvmpg-bcache-btree/draft/0003-bcache-initialization-of-the-buddy.patch
@@ -0,0 +1,359 @@
+From c9977c3fd9e238ac5a8a684de5a8dc5c8a4462e2 Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Thu, 21 Oct 2021 19:45:57 +0800
+Subject: [PATCH 03/13] bcache: initialization of the buddy
+
+This nvm pages allocator will implement a simple buddy allocator to
+manage the nvm address space. This patch initializes this buddy allocator
+for a new namespace.
+
+The unit of alloc/free of the buddy allocator is a page. DAX devices have
+their own struct page (in DRAM or PMEM).
+
+ struct { /* ZONE_DEVICE pages */
+ /** @pgmap: Points to the hosting device page map. */
+ struct dev_pagemap *pgmap;
+ void *zone_device_data;
+ /*
+ * ZONE_DEVICE private pages are counted as being
+ * mapped so the next 3 words hold the mapping, index,
+ * and private fields from the source anonymous or
+ * page cache page while the page is migrated to device
+ * private memory.
+ * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
+ * use the mapping, index, and private fields when
+ * pmem backed DAX files are mapped.
+ */
+ };
+
+ZONE_DEVICE pages only use pgmap; the other 4 words [16/32 bytes] are
+unused. So the second/third words are used as a 'struct list_head' to link
+the page into the buddy free lists. The fourth word (normally struct
+page::index) stores pgoff, the page offset within the DAX device. The
+fifth word (normally struct page::private) stores the buddy order.
+page_type will be used to store the buddy flags.
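+
+A minimal sketch (assembled from the code added by this patch, no new API)
+of how a free range is published into the buddy free lists with the reused
+struct page fields described above:
+
+    page = bch_nvmpg_va_to_pg(bch_nvmpg_pgoff_to_ptr(ns, pgoff));
+    page->index = pgoff;              /* page offset in the DAX device */
+    set_page_private(page, order);    /* buddy order */
+    __SetPageBuddy(page);             /* buddy flag via page_type */
+    list_add((struct list_head *)&page->zone_device_data,
+             &ns->free_area[order]);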
+
+Reported-by: kernel test robot <lkp@intel.com>
+Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/nvmpg.c | 212 +++++++++++++++++++++++++++++++++++++-
+ drivers/md/bcache/nvmpg.h | 12 +++
+ 2 files changed, 221 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index be006a91e8bb..b51073588f65 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -50,6 +50,36 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr)
+ return BCH_NVMPG_OFFSET(ns_id, offset);
+ }
+
++static struct page *bch_nvmpg_va_to_pg(void *addr)
++{
++ return virt_to_page(addr);
++}
++
++static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff)
++{
++ return ns->base_addr + (pgoff << PAGE_SHIFT);
++}
++
++static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r)
++{
++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id];
++ pgoff_t pgoff = r->pgoff;
++
++ return bch_nvmpg_pgoff_to_ptr(ns, pgoff);
++}
++
++static inline void reserve_nvmpg_pages(struct bch_nvmpg_ns *ns,
++ pgoff_t pgoff, u64 nr)
++{
++ while (nr > 0) {
++ unsigned int num = nr > UINT_MAX ? UINT_MAX : nr;
++
++ bitmap_set(ns->pages_bitmap, pgoff, num);
++ nr -= num;
++ pgoff += num;
++ }
++}
++
+ static void release_ns_tbl(struct bch_nvmpg_set *set)
+ {
+ int i;
+@@ -58,6 +88,10 @@ static void release_ns_tbl(struct bch_nvmpg_set *set)
+ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
+ ns = set->ns_tbl[i];
+ if (ns) {
++ kvfree(ns->pages_bitmap);
++ if (ns->recs_bitmap)
++ bitmap_free(ns->recs_bitmap);
++
+ fs_put_dax(ns->dax_dev);
+ blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC);
+ set->ns_tbl[i] = NULL;
+@@ -76,10 +110,73 @@ static void release_nvmpg_set(struct bch_nvmpg_set *set)
+ kfree(set);
+ }
+
++static int validate_recs(int ns_id,
++ struct bch_nvmpg_head *head,
++ struct bch_nvmpg_recs *recs)
++{
++ if (memcmp(recs->magic, bch_nvmpg_recs_magic, 16)) {
++ pr_err("Invalid bch_nvmpg_recs magic\n");
++ return -EINVAL;
++ }
++
++ if (memcmp(recs->uuid, head->uuid, 16)) {
++ pr_err("Invalid bch_nvmpg_recs uuid\n");
++ return -EINVAL;
++ }
++
++ if (recs->head_offset !=
++ bch_nvmpg_ptr_to_offset(global_nvmpg_set->ns_tbl[ns_id], head)) {
++ pr_err("Invalid recs head_offset\n");
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
++static int reserve_nvmpg_recs(struct bch_nvmpg_recs *recs)
++{
++ int i, used = 0;
++
++ for (i = 0; i < recs->size; i++) {
++ struct bch_nvmpg_rec *r = &recs->recs[i];
++ struct bch_nvmpg_ns *ns;
++ struct page *page;
++ void *addr;
++
++ if (r->pgoff == 0)
++ continue;
++
++ ns = global_nvmpg_set->ns_tbl[r->ns_id];
++ addr = bch_nvmpg_rec_to_ptr(r);
++ if (addr < ns->base_addr) {
++ pr_err("Invalid recorded address\n");
++ return -EINVAL;
++ }
++
++ /* init struct page: index/private */
++ page = bch_nvmpg_va_to_pg(addr);
++ set_page_private(page, r->order);
++ page->index = r->pgoff;
++
++ reserve_nvmpg_pages(ns, r->pgoff, 1L << r->order);
++ used++;
++ }
++
++ if (used != recs->used) {
++ pr_err("used %d doesn't match recs->used %d\n",
++ used, recs->used);
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
+ /* Namespace 0 contains all meta data of the nvmpg allocation set */
+ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns)
+ {
+ struct bch_nvmpg_set_header *set_header;
++ struct bch_nvmpg_recs *sys_recs;
++ int i, j, used = 0, rc = 0;
+
+ if (ns->ns_id != 0) {
+ pr_err("unexpected ns_id %u for first nvmpg namespace.\n",
+@@ -93,9 +190,83 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns)
+ global_nvmpg_set->set_header = set_header;
+ global_nvmpg_set->heads_size = set_header->size;
+ global_nvmpg_set->heads_used = set_header->used;
++
++ /* Reserve the used space from buddy allocator */
++ reserve_nvmpg_pages(ns, 0, div_u64(ns->pages_offset, ns->page_size));
++
++ sys_recs = ns->base_addr + BCH_NVMPG_SYSRECS_OFFSET;
++ for (i = 0; i < set_header->size; i++) {
++ struct bch_nvmpg_head *head;
++
++ head = &set_header->heads[i];
++ if (head->state == BCH_NVMPG_HD_STAT_FREE)
++ continue;
++
++ used++;
++ if (used > global_nvmpg_set->heads_size) {
++ pr_err("used heads %d > heads size %d.\n",
++ used, global_nvmpg_set->heads_size);
++ goto unlock;
++ }
++
++ for (j = 0; j < BCH_NVMPG_NS_MAX; j++) {
++ struct bch_nvmpg_recs *recs;
++
++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[j]);
++
++ /* Iterate the recs list */
++ while (recs) {
++ rc = validate_recs(j, head, recs);
++ if (rc < 0)
++ goto unlock;
++
++ rc = reserve_nvmpg_recs(recs);
++ if (rc < 0)
++ goto unlock;
++
++ bitmap_set(ns->recs_bitmap, recs - sys_recs, 1);
++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
++ }
++ }
++ }
++unlock:
+ mutex_unlock(&global_nvmpg_set->lock);
++ return rc;
++}
+
+- return 0;
++static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns)
++{
++ unsigned int start, end, pages;
++ int i;
++ struct page *page;
++ pgoff_t pgoff_start;
++
++ bitmap_for_each_clear_region(ns->pages_bitmap,
++ start, end, 0, ns->pages_total) {
++ pgoff_start = start;
++ pages = end - start;
++
++ while (pages) {
++ void *addr;
++
++ for (i = BCH_MAX_ORDER - 1; i >= 0; i--) {
++ if ((pgoff_start % (1L << i) == 0) &&
++ (pages >= (1L << i)))
++ break;
++ }
++
++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start);
++ page = bch_nvmpg_va_to_pg(addr);
++ set_page_private(page, i);
++ page->index = pgoff_start;
++ __SetPageBuddy(page);
++ list_add((struct list_head *)&page->zone_device_data,
++ &ns->free_area[i]);
++
++ pgoff_start += 1L << i;
++ pages -= 1L << i;
++ }
++ }
+ }
+
+ static int attach_nvmpg_set(struct bch_nvmpg_ns *ns)
+@@ -200,7 +371,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
+ char buf[BDEVNAME_SIZE];
+ struct block_device *bdev;
+ pgoff_t pgoff;
+- int id, err;
++ int id, i, err;
+ char *path;
+ long dax_ret = 0;
+
+@@ -304,13 +475,48 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
+
+ mutex_init(&ns->lock);
+
++ /*
++ * The parameters of bitmap_set/clear are unsigned int. Given that the
++ * current nvm size is far from exceeding this limit, only add a
++ * WARN_ON() message here.
++ */
++ WARN_ON(BITS_TO_LONGS(ns->pages_total) > UINT_MAX);
++ ns->pages_bitmap = kvcalloc(BITS_TO_LONGS(ns->pages_total),
++ sizeof(unsigned long), GFP_KERNEL);
++ if (!ns->pages_bitmap) {
++ err = -ENOMEM;
++ goto clear_ns_nr;
++ }
++
++ if (ns->sb->this_ns == 0) {
++ ns->recs_bitmap =
++ bitmap_zalloc(BCH_MAX_PGALLOC_RECS, GFP_KERNEL);
++ if (ns->recs_bitmap == NULL) {
++ err = -ENOMEM;
++ goto free_pages_bitmap;
++ }
++ }
++
++ for (i = 0; i < BCH_MAX_ORDER; i++)
++ INIT_LIST_HEAD(&ns->free_area[i]);
++
+ err = init_nvmpg_set_header(ns);
+ if (err < 0)
+- goto free_ns;
++ goto free_recs_bitmap;
++
++ if (ns->sb->this_ns == 0)
++ /* init buddy allocator */
++ bch_nvmpg_init_free_space(ns);
+
+ kfree(path);
+ return ns;
+
++free_recs_bitmap:
++ bitmap_free(ns->recs_bitmap);
++free_pages_bitmap:
++ kvfree(ns->pages_bitmap);
++clear_ns_nr:
++ global_nvmpg_set->ns_tbl[sb->this_ns] = NULL;
+ free_ns:
+ fs_put_dax(ns->dax_dev);
+ kfree(ns);
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index 698c890b2d15..55778d4db7da 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -11,6 +11,8 @@
+ * Bcache NVDIMM in memory data structures
+ */
+
++#define BCH_MAX_ORDER 20
++
+ /*
+ * The following three structures in memory records which page(s) allocated
+ * to which owner. After reboot from power failure, they will be initialized
+@@ -28,6 +30,11 @@ struct bch_nvmpg_ns {
+ unsigned long pages_total;
+ pfn_t start_pfn;
+
++ unsigned long *pages_bitmap;
++ struct list_head free_area[BCH_MAX_ORDER];
++
++ unsigned long *recs_bitmap;
++
+ struct dax_device *dax_dev;
+ struct block_device *bdev;
+ struct bch_nvmpg_set *set;
+@@ -69,6 +76,11 @@ struct bch_nvmpg_set {
+ /* Indicate which field in bch_nvmpg_sb to be updated */
+ #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */
+
++#define BCH_MAX_PGALLOC_RECS \
++ (min_t(unsigned int, 64, \
++ (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \
++ sizeof(struct bch_nvmpg_recs)))
++
+ void *bch_nvmpg_offset_to_ptr(unsigned long offset);
+ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr);
+
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-btree/draft/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch b/for-next/nvmpg-bcache-btree/draft/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch
new file mode 100644
index 0000000..9667099
--- /dev/null
+++ b/for-next/nvmpg-bcache-btree/draft/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch
@@ -0,0 +1,309 @@
+From 8d0370253021430d3e59b084ce242a32410a51c0 Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Wed, 4 Aug 2021 22:41:20 +0800
+Subject: [PATCH 04/13] bcache: bch_nvmpg_alloc_pages() of the buddy
+
+This patch implements bch_nvmpg_alloc_pages() of the nvm pages buddy
+allocator. Functionally this routine is similar to the current page buddy
+allocation, but with these differences:
+a: it takes an owner_uuid parameter which records the owner info, and it
+makes that info persistent.
+b: it does not need flags like GFP_*; all allocations are equal.
+c: it does not trigger other operations such as swap/recycle.
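+
+A usage sketch (hypothetical caller; the owner uuid here is the cache set
+uuid, as the journaling patches later in this series use):
+
+    /* allocate 1 << 2 = 4 contiguous NVDIMM pages for this owner */
+    unsigned long nvmpg_offset = bch_nvmpg_alloc_pages(2, ca->sb.set_uuid);
+
+    if (!nvmpg_offset)
+        pr_err("nvm pages allocation failed\n");
+    else
+        memset(bch_nvmpg_offset_to_ptr(nvmpg_offset), 0, 4 * PAGE_SIZE);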
+
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/nvmpg.c | 222 ++++++++++++++++++++++++++++++++++++++
+ drivers/md/bcache/nvmpg.h | 9 ++
+ 2 files changed, 231 insertions(+)
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index b51073588f65..8c0e827a98cd 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -42,6 +42,11 @@ void *bch_nvmpg_offset_to_ptr(unsigned long offset)
+ return NULL;
+ }
+
++static unsigned long bch_nvmpg_offset_to_pgoff(unsigned long nvmpg_offset)
++{
++ return BCH_NVMPG_GET_OFFSET(nvmpg_offset) >> PAGE_SHIFT;
++}
++
+ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr)
+ {
+ int ns_id = ns->ns_id;
+@@ -60,6 +65,15 @@ static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff)
+ return ns->base_addr + (pgoff << PAGE_SHIFT);
+ }
+
++static unsigned long bch_nvmpg_pgoff_to_offset(struct bch_nvmpg_ns *ns,
++ pgoff_t pgoff)
++{
++ int ns_id = ns->ns_id;
++ unsigned long offset = pgoff << PAGE_SHIFT;
++
++ return BCH_NVMPG_OFFSET(ns_id, offset);
++}
++
+ static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r)
+ {
+ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id];
+@@ -269,6 +283,214 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns)
+ }
+ }
+
++
++/* If not found, it will create if create == true */
++static struct bch_nvmpg_head *find_nvmpg_head(const char *uuid, bool create)
++{
++ struct bch_nvmpg_set_header *set_header = global_nvmpg_set->set_header;
++ struct bch_nvmpg_head *head = NULL;
++ int i;
++
++ if (set_header == NULL)
++ goto out;
++
++ for (i = 0; i < set_header->size; i++) {
++ struct bch_nvmpg_head *h = &set_header->heads[i];
++
++ if (h->state != BCH_NVMPG_HD_STAT_ALLOC)
++ continue;
++
++ if (!memcmp(uuid, h->uuid, 16)) {
++ head = h;
++ break;
++ }
++ }
++
++ if (!head && create) {
++ u32 used = set_header->used;
++
++ if (set_header->size > used) {
++ head = &set_header->heads[used];
++ memset(head, 0, sizeof(struct bch_nvmpg_head));
++ head->state = BCH_NVMPG_HD_STAT_ALLOC;
++ memcpy(head->uuid, uuid, 16);
++ global_nvmpg_set->heads_used++;
++ set_header->used++;
++ } else
++ pr_info("No free bch_nvmpg_head\n");
++ }
++
++out:
++ return head;
++}
++
++static struct bch_nvmpg_recs *find_empty_nvmpg_recs(void)
++{
++ unsigned int start;
++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[0];
++ struct bch_nvmpg_recs *recs;
++
++ start = bitmap_find_next_zero_area(ns->recs_bitmap,
++ BCH_MAX_PGALLOC_RECS, 0, 1, 0);
++ if (start > BCH_MAX_PGALLOC_RECS) {
++ pr_info("No free struct bch_nvmpg_recs\n");
++ return NULL;
++ }
++
++ bitmap_set(ns->recs_bitmap, start, 1);
++ recs = (struct bch_nvmpg_recs *)
++ bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET)
++ + start;
++
++ memset(recs, 0, sizeof(struct bch_nvmpg_recs));
++ return recs;
++}
++
++
++static struct bch_nvmpg_recs *find_nvmpg_recs(struct bch_nvmpg_ns *ns,
++ struct bch_nvmpg_head *head,
++ bool create)
++{
++ int ns_id = ns->sb->this_ns;
++ struct bch_nvmpg_recs *prev_recs = NULL, *recs = NULL;
++
++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[ns_id]);
++
++ /* If create=false, just return the current recs list head (may be NULL) */
++ if (!create)
++ return recs;
++
++ /*
++ * If create=true, it means we need an empty struct bch_nvmpg_rec slot.
++ * So find a struct bch_nvmpg_recs which still has free slots, or
++ * allocate a new struct bch_nvmpg_recs, and return it.
++ */
++ while (recs && (recs->used == recs->size)) {
++ prev_recs = recs;
++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
++ }
++
++ /* Found empty struct bch_nvmpg_recs */
++ if (recs)
++ return recs;
++
++ /* Need alloc new struct bch_nvmpg_recs */
++ recs = find_empty_nvmpg_recs();
++ if (recs) {
++ unsigned long offset;
++
++ recs->next_offset = 0;
++ recs->head_offset = bch_nvmpg_ptr_to_offset(ns, head);
++ memcpy(recs->magic, bch_nvmpg_recs_magic, 16);
++ memcpy(recs->uuid, head->uuid, 16);
++ recs->size = BCH_NVMPG_MAX_RECS;
++ recs->used = 0;
++
++ offset = bch_nvmpg_ptr_to_offset(ns, recs);
++ if (prev_recs)
++ prev_recs->next_offset = offset;
++ else
++ head->recs_offset[ns_id] = offset;
++ }
++
++ return recs;
++}
++
++static void add_nvmpg_rec(struct bch_nvmpg_ns *ns,
++ struct bch_nvmpg_recs *recs,
++ unsigned long nvmpg_offset,
++ int order)
++{
++ int i, ns_id;
++ unsigned long pgoff;
++
++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset);
++ ns_id = ns->sb->this_ns;
++
++ for (i = 0; i < recs->size; i++) {
++ if (recs->recs[i].pgoff == 0) {
++ recs->recs[i].pgoff = pgoff;
++ recs->recs[i].order = order;
++ recs->recs[i].ns_id = ns_id;
++ recs->used++;
++ break;
++ }
++ }
++ BUG_ON(i == recs->size);
++}
++
++
++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
++{
++ unsigned long nvmpg_offset = 0;
++ struct bch_nvmpg_head *head;
++ int n, o;
++
++ mutex_lock(&global_nvmpg_set->lock);
++ head = find_nvmpg_head(uuid, true);
++
++ if (!head) {
++ pr_err("Cannot find bch_nvmpg_recs by uuid.\n");
++ goto unlock;
++ }
++
++ for (n = 0; n < global_nvmpg_set->total_ns; n++) {
++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[n];
++
++ if (!ns || (ns->free < (1L << order)))
++ continue;
++
++ for (o = order; o < BCH_MAX_ORDER; o++) {
++ struct list_head *list;
++ struct page *page, *buddy_page;
++
++ if (list_empty(&ns->free_area[o]))
++ continue;
++
++ list = ns->free_area[o].next;
++ page = container_of((void *)list, struct page,
++ zone_device_data);
++
++ list_del(list);
++
++ while (o != order) {
++ void *addr;
++ pgoff_t pgoff;
++
++ pgoff = page->index + (1L << (o - 1));
++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff);
++ buddy_page = bch_nvmpg_va_to_pg(addr);
++ set_page_private(buddy_page, o - 1);
++ buddy_page->index = pgoff;
++ __SetPageBuddy(buddy_page);
++ list_add((struct list_head *)&buddy_page->zone_device_data,
++ &ns->free_area[o - 1]);
++ o--;
++ }
++
++ set_page_private(page, order);
++ __ClearPageBuddy(page);
++ ns->free -= 1L << order;
++ nvmpg_offset = bch_nvmpg_pgoff_to_offset(ns, page->index);
++ break;
++ }
++
++ if (o < BCH_MAX_ORDER) {
++ struct bch_nvmpg_recs *recs;
++
++ recs = find_nvmpg_recs(ns, head, true);
++ /* ToDo: handle recs == NULL */
++ add_nvmpg_rec(ns, recs, nvmpg_offset, order);
++ break;
++ }
++ }
++
++unlock:
++ mutex_unlock(&global_nvmpg_set->lock);
++ return nvmpg_offset;
++}
++EXPORT_SYMBOL_GPL(bch_nvmpg_alloc_pages);
++
+ static int attach_nvmpg_set(struct bch_nvmpg_ns *ns)
+ {
+ struct bch_nvmpg_sb *sb = ns->sb;
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index 55778d4db7da..d03f3241b45a 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -76,6 +76,9 @@ struct bch_nvmpg_set {
+ /* Indicate which field in bch_nvmpg_sb to be updated */
+ #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */
+
++#define BCH_PGOFF_TO_KVADDR(pgoff) \
++ ((void *)((unsigned long)(pgoff) << PAGE_SHIFT))
++
+ #define BCH_MAX_PGALLOC_RECS \
+ (min_t(unsigned int, 64, \
+ (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \
+@@ -89,6 +92,7 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr);
+ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path);
+ int bch_nvmpg_init(void);
+ void bch_nvmpg_exit(void);
++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid);
+
+ #else
+
+@@ -104,6 +108,11 @@ static inline int bch_nvmpg_init(void)
+
+ static inline void bch_nvmpg_exit(void) { }
+
++static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
++{
++ return 0;
++}
++
+ #endif /* CONFIG_BCACHE_NVM_PAGES */
+
+ #endif /* _BCACHE_NVM_PAGES_H */
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-btree/draft/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch b/for-next/nvmpg-bcache-btree/draft/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch
new file mode 100644
index 0000000..0f8454f
--- /dev/null
+++ b/for-next/nvmpg-bcache-btree/draft/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch
@@ -0,0 +1,252 @@
+From f0165caac63639c6bbc9bfa2182500ecebdb6bf9 Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Thu, 21 Oct 2021 19:06:35 +0800
+Subject: [PATCH 05/13] bcache: bch_nvmpg_free_pages() of the buddy allocator
+
+This patch implements the bch_nvmpg_free_pages() of the buddy allocator.
+
+The difference between this and the page buddy free path:
+it needs the owner_uuid to free the owner's allocated pages, and the
+change must be persistent after the free.
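+
+A usage sketch (hypothetical caller, pairing with the allocation example
+from the previous patch):
+
+    /* release the 1 << 2 pages previously allocated for this owner */
+    bch_nvmpg_free_pages(nvmpg_offset, 2, ca->sb.set_uuid);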
+
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/nvmpg.c | 165 ++++++++++++++++++++++++++++++++++++--
+ drivers/md/bcache/nvmpg.h | 3 +
+ 2 files changed, 161 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index 8c0e827a98cd..7b86f08c219a 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -248,6 +248,57 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns)
+ return rc;
+ }
+
++static void __free_space(struct bch_nvmpg_ns *ns, unsigned long nvmpg_offset,
++ int order)
++{
++ unsigned long add_pages = (1L << order);
++ pgoff_t pgoff;
++ struct page *page;
++ void *va;
++
++ if (nvmpg_offset == 0) {
++ pr_err("free pages on offset 0\n");
++ return;
++ }
++
++ page = bch_nvmpg_va_to_pg(bch_nvmpg_offset_to_ptr(nvmpg_offset));
++ WARN_ON((!page) || (page->private != order));
++ pgoff = page->index;
++
++ while (order < BCH_MAX_ORDER - 1) {
++ struct page *buddy_page;
++
++ pgoff_t buddy_pgoff = pgoff ^ (1L << order);
++ pgoff_t parent_pgoff = pgoff & ~(1L << order);
++
++ if ((parent_pgoff + (1L << (order + 1)) > ns->pages_total))
++ break;
++
++ va = bch_nvmpg_pgoff_to_ptr(ns, buddy_pgoff);
++ buddy_page = bch_nvmpg_va_to_pg(va);
++ WARN_ON(!buddy_page);
++
++ if (PageBuddy(buddy_page) && (buddy_page->private == order)) {
++ list_del((struct list_head *)&buddy_page->zone_device_data);
++ __ClearPageBuddy(buddy_page);
++ pgoff = parent_pgoff;
++ order++;
++ continue;
++ }
++ break;
++ }
++
++ va = bch_nvmpg_pgoff_to_ptr(ns, pgoff);
++ page = bch_nvmpg_va_to_pg(va);
++ WARN_ON(!page);
++ list_add((struct list_head *)&page->zone_device_data,
++ &ns->free_area[order]);
++ page->index = pgoff;
++ set_page_private(page, order);
++ __SetPageBuddy(page);
++ ns->free += add_pages;
++}
++
+ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns)
+ {
+ unsigned int start, end, pages;
+@@ -261,21 +312,19 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns)
+ pages = end - start;
+
+ while (pages) {
+- void *addr;
+-
+ for (i = BCH_MAX_ORDER - 1; i >= 0; i--) {
+ if ((pgoff_start % (1L << i) == 0) &&
+ (pages >= (1L << i)))
+ break;
+ }
+
+- addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start);
+- page = bch_nvmpg_va_to_pg(addr);
++ page = bch_nvmpg_va_to_pg(
++ bch_nvmpg_pgoff_to_ptr(ns, pgoff_start));
+ set_page_private(page, i);
+ page->index = pgoff_start;
+- __SetPageBuddy(page);
+- list_add((struct list_head *)&page->zone_device_data,
+- &ns->free_area[i]);
++
++ /* In order to update ns->free */
++ __free_space(ns, pgoff_start, i);
+
+ pgoff_start += 1L << i;
+ pages -= 1L << i;
+@@ -491,6 +540,107 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
+ }
+ EXPORT_SYMBOL_GPL(bch_nvmpg_alloc_pages);
+
++static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns)
++{
++ return ns->base_addr + (ns->pages_total << PAGE_SHIFT);
++}
++
++static inline bool in_nvmpg_ns_range(struct bch_nvmpg_ns *ns,
++ void *start_addr, void *end_addr)
++{
++ return (start_addr >= ns->base_addr) && (end_addr < nvm_end_addr(ns));
++}
++
++static int remove_nvmpg_rec(struct bch_nvmpg_recs *recs, int ns_id,
++ unsigned long nvmpg_offset, int order)
++{
++ struct bch_nvmpg_head *head;
++ struct bch_nvmpg_recs *prev_recs, *sys_recs;
++ struct bch_nvmpg_ns *ns;
++ unsigned long pgoff;
++ int i;
++
++ ns = global_nvmpg_set->ns_tbl[0];
++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset);
++
++ head = bch_nvmpg_offset_to_ptr(recs->head_offset);
++ prev_recs = recs;
++ sys_recs = bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET);
++ while (recs) {
++ for (i = 0; i < recs->size; i++) {
++ struct bch_nvmpg_rec *rec = &(recs->recs[i]);
++
++ if ((rec->pgoff == pgoff) && (rec->ns_id == ns_id)) {
++ WARN_ON(rec->order != order);
++ rec->_v = 0;
++ recs->used--;
++
++ if (recs->used == 0) {
++ int recs_pos = recs - sys_recs;
++
++ if (recs == prev_recs)
++ head->recs_offset[ns_id] =
++ recs->next_offset;
++ else
++ prev_recs->next_offset =
++ recs->next_offset;
++
++ recs->next_offset = 0;
++ recs->head_offset = 0;
++
++ bitmap_clear(ns->recs_bitmap, recs_pos, 1);
++ }
++ goto out;
++ }
++ }
++ prev_recs = recs;
++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
++ }
++out:
++ return (recs ? 0 : -ENOENT);
++}
++
++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order,
++ const char *uuid)
++{
++ struct bch_nvmpg_ns *ns;
++ struct bch_nvmpg_head *head;
++ struct bch_nvmpg_recs *recs;
++ int r;
++
++ mutex_lock(&global_nvmpg_set->lock);
++
++ ns = global_nvmpg_set->ns_tbl[BCH_NVMPG_GET_NS_ID(nvmpg_offset)];
++ if (!ns) {
++ pr_err("can't find namespace by given nvmpg offset\n");
++ goto unlock;
++ }
++
++ head = find_nvmpg_head(uuid, false);
++ if (!head) {
++ pr_err("can't find bch_nvmpg_head by uuid\n");
++ goto unlock;
++ }
++
++ recs = find_nvmpg_recs(ns, head, false);
++ if (!recs) {
++ pr_err("can't find bch_nvmpg_recs by uuid\n");
++ goto unlock;
++ }
++
++ r = remove_nvmpg_rec(recs, ns->sb->this_ns, nvmpg_offset, order);
++ if (r < 0) {
++ pr_err("can't find bch_nvmpg_rec\n");
++ goto unlock;
++ }
++
++ __free_space(ns, nvmpg_offset, order);
++
++unlock:
++ mutex_unlock(&global_nvmpg_set->lock);
++}
++EXPORT_SYMBOL_GPL(bch_nvmpg_free_pages);
++
+ static int attach_nvmpg_set(struct bch_nvmpg_ns *ns)
+ {
+ struct bch_nvmpg_sb *sb = ns->sb;
+@@ -687,6 +837,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
+ ns->pages_offset = sb->pages_offset;
+ ns->pages_total = sb->pages_total;
+ ns->sb = sb;
++ /* increase by __free_space() */
+ ns->free = 0;
+ ns->bdev = bdev;
+ ns->set = global_nvmpg_set;
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index d03f3241b45a..e089936e7f13 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -93,6 +93,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path);
+ int bch_nvmpg_init(void);
+ void bch_nvmpg_exit(void);
+ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid);
++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid);
+
+ #else
+
+@@ -113,6 +114,8 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
+ return 0;
+ }
+
++static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { }
++
+ #endif /* CONFIG_BCACHE_NVM_PAGES */
+
+ #endif /* _BCACHE_NVM_PAGES_H */
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-btree/draft/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch b/for-next/nvmpg-bcache-btree/draft/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch
new file mode 100644
index 0000000..9195841
--- /dev/null
+++ b/for-next/nvmpg-bcache-btree/draft/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch
@@ -0,0 +1,67 @@
+From 10a097e1408174b0fe3f029c37d7d512662a4582 Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Thu, 21 Oct 2021 21:06:03 +0800
+Subject: [PATCH 06/13] bcache: get recs list head for allocated pages by
+ specific uuid
+
+This patch implements bch_get_nvmpg_head() of the buddy allocator,
+which is used to get the recs list head of the pages allocated for a
+specific uuid. Then the requester (owner) can find all previously
+allocated nvdimm pages by iterating the recs list.
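+
+An iteration sketch (hypothetical owner code; owner_uuid and ns_id are
+placeholders), following the same walk the journal code uses later in this
+series:
+
+    struct bch_nvmpg_head *head = bch_get_nvmpg_head(owner_uuid);
+    struct bch_nvmpg_recs *recs;
+
+    recs = head ? bch_nvmpg_offset_to_ptr(head->recs_offset[ns_id]) : NULL;
+    while (recs) {
+        /* recs->recs[0 .. recs->size - 1] describe allocated page ranges */
+        recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
+    }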
+
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/nvmpg.c | 6 ++++++
+ drivers/md/bcache/nvmpg.h | 6 ++++++
+ 2 files changed, 12 insertions(+)
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index 7b86f08c219a..e4642e591f23 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -540,6 +540,12 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
+ }
+ EXPORT_SYMBOL_GPL(bch_nvmpg_alloc_pages);
+
++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid)
++{
++ return find_nvmpg_head(uuid, false);
++}
++EXPORT_SYMBOL_GPL(bch_get_nvmpg_head);
++
+ static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns)
+ {
+ return ns->base_addr + (ns->pages_total << PAGE_SHIFT);
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index e089936e7f13..2361cabf18be 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -94,6 +94,7 @@ int bch_nvmpg_init(void);
+ void bch_nvmpg_exit(void);
+ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid);
+ void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid);
++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid);
+
+ #else
+
+@@ -116,6 +117,11 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
+
+ static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { }
+
++static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid)
++{
++ return NULL;
++}
++
+ #endif /* CONFIG_BCACHE_NVM_PAGES */
+
+ #endif /* _BCACHE_NVM_PAGES_H */
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-btree/draft/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch b/for-next/nvmpg-bcache-btree/draft/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch
new file mode 100644
index 0000000..f240531
--- /dev/null
+++ b/for-next/nvmpg-bcache-btree/draft/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch
@@ -0,0 +1,48 @@
+From 1faf072bef28470d4d90e6ec5c42981b4b881ec0 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Fri, 25 Jun 2021 00:17:02 +0800
+Subject: [PATCH 07/13] bcache: use bucket index to set GC_MARK_METADATA for
+ journal buckets in bch_btree_gc_finish()
+
+Currently the meta data bucket locations on the cache device are still
+reserved after the meta data is stored on NVDIMM pages, temporarily for
+meta data layout consistency. So these buckets are still marked as meta
+data by SET_GC_MARK() in bch_btree_gc_finish().
+
+When BCH_FEATURE_INCOMPAT_NVDIMM_META is set, sb.d[] stores the linear
+addresses of NVDIMM pages and no longer bucket indexes. Therefore we
+should avoid looking up bucket indexes from sb.d[], and directly use the
+bucket indexes from ca->sb.first_bucket to (ca->sb.first_bucket +
+ca->sb.njournal_buckets) for setting the gc mark of the journal buckets.
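+
+The resulting marking loop added by this patch simply walks that fixed
+journal bucket range:
+
+    for (i = ca->sb.first_bucket;
+         i < ca->sb.first_bucket + ca->sb.njournal_buckets; i++)
+        SET_GC_MARK(ca->buckets + i, GC_MARK_METADATA);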
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/btree.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
+index 93b67b8d31c3..f7f844c321c3 100644
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -1761,8 +1761,10 @@ static void bch_btree_gc_finish(struct cache_set *c)
+ ca = c->cache;
+ ca->invalidate_needs_gc = 0;
+
+- for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
+- SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
++ /* Range [first_bucket, first_bucket + keys) is for journal buckets */
++ for (i = ca->sb.first_bucket;
++ i < ca->sb.first_bucket + ca->sb.njournal_buckets; i++)
++ SET_GC_MARK(ca->buckets + i, GC_MARK_METADATA);
+
+ for (k = ca->prio_buckets;
+ k < ca->prio_buckets + prio_buckets(ca) * 2; k++)
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-btree/draft/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch b/for-next/nvmpg-bcache-btree/draft/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch
new file mode 100644
index 0000000..794e12a
--- /dev/null
+++ b/for-next/nvmpg-bcache-btree/draft/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch
@@ -0,0 +1,60 @@
+From 497259154b1f79bfdaf967b21109521b301af534 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Fri, 25 Jun 2021 00:18:31 +0800
+Subject: [PATCH 08/13] bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into
+ incompat feature set
+
+This patch adds BCH_FEATURE_INCOMPAT_NVDIMM_META (value 0x0004) into the
+incompat feature set. When this bit is set by bcache-tools, it indicates
+that bcache meta data should be stored on a specific NVDIMM meta device.
+
+The bcache meta data mainly includes the journal and btree nodes. When
+this bit is set in the incompat feature set, bcache will ask the nvm-pages
+allocator for NVDIMM space to store the meta data.
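+
+The BCH_FEATURE_INCOMPAT_FUNCS() helper generates the runtime check used
+by later patches in this series to pick the meta data path, roughly:
+
+    if (!bch_has_feature_nvdimm_meta(&ca->sb)) {
+        for (i = 0; i < ca->sb.keys; i++)
+            ca->sb.d[i] = ca->sb.first_bucket + i;  /* legacy layout */
+    } else {
+        ret = __bch_journal_nvdimm_init(ca);        /* NVDIMM meta path */
+    }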
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/features.h | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h
+index 09161b89c63e..fab92678be76 100644
+--- a/drivers/md/bcache/features.h
++++ b/drivers/md/bcache/features.h
+@@ -18,11 +18,19 @@
+ #define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET 0x0001
+ /* real bucket size is (1 << bucket_size) */
+ #define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE 0x0002
++/* store bcache meta data on nvdimm */
++#define BCH_FEATURE_INCOMPAT_NVDIMM_META 0x0004
+
+ #define BCH_FEATURE_COMPAT_SUPP 0
+ #define BCH_FEATURE_RO_COMPAT_SUPP 0
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++#define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \
++ BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE| \
++ BCH_FEATURE_INCOMPAT_NVDIMM_META)
++#else
+ #define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \
+ BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE)
++#endif
+
+ #define BCH_HAS_COMPAT_FEATURE(sb, mask) \
+ ((sb)->feature_compat & (mask))
+@@ -90,6 +98,7 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \
+
+ BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET);
+ BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE);
++BCH_FEATURE_INCOMPAT_FUNCS(nvdimm_meta, NVDIMM_META);
+
+ static inline bool bch_has_unknown_compat_features(struct cache_sb *sb)
+ {
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-btree/draft/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch b/for-next/nvmpg-bcache-btree/draft/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch
new file mode 100644
index 0000000..c8020e4
--- /dev/null
+++ b/for-next/nvmpg-bcache-btree/draft/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch
@@ -0,0 +1,255 @@
+From a0220c3b0138d021975ef1d5e29e07217626ff9e Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 21 Oct 2021 21:39:18 +0800
+Subject: [PATCH 09/13] bcache: initialize bcache journal for NVDIMM meta
+ device
+
+The nvm-pages allocator may store and index the NVDIMM pages allocated
+for bcache journal. This patch adds the initialization to store bcache
+journal space on NVDIMM pages if BCH_FEATURE_INCOMPAT_NVDIMM_META bit is
+set by bcache-tools.
+
+If BCH_FEATURE_INCOMPAT_NVDIMM_META is set, get_journal_nvmpg_space()
+will return the nvmpg_offset of NVDIMM pages for the bcache journal,
+- If there is previously allocated space, find it from the nvm-pages owner
+ list and return it to bch_journal_init().
+- If there is no previously allocated space, request a new NVDIMM range
+ from the nvm-pages allocator, and return it to bch_journal_init().
+
+And in bch_journal_init(), the keys in sb.d[] store the corresponding
+nvmpg offsets from NVDIMM, where the array index 'i' in sb.d[i] is the
+bucket index used to iterate all journal buckets.
+
+Later, when the bcache journaling code stores a journaling jset, the
+target NVDIMM nvmpg offset stored (and updated) in sb.d[i] can be used to
+calculate the linear address for the memory copy from DRAM pages into
+NVDIMM pages.
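+
+A sketch of the sb.d[] initialization described above, taken from the code
+added by this patch:
+
+    jnl_base = get_journal_nvmpg_space(ca);
+    for (i = 0; i < ca->sb.keys; i++)
+        ca->sb.d[i] = jnl_base + (bucket_bytes(ca) * i);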
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/journal.c | 113 ++++++++++++++++++++++++++++++++++++
+ drivers/md/bcache/journal.h | 2 +-
+ drivers/md/bcache/nvmpg.c | 9 +++
+ drivers/md/bcache/nvmpg.h | 1 +
+ drivers/md/bcache/super.c | 18 +++---
+ 5 files changed, 132 insertions(+), 11 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index 61bd79babf7a..d887557c718e 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -9,6 +9,8 @@
+ #include "btree.h"
+ #include "debug.h"
+ #include "extents.h"
++#include "nvmpg.h"
++#include "features.h"
+
+ #include <trace/events/bcache.h>
+
+@@ -982,3 +984,114 @@ int bch_journal_alloc(struct cache_set *c)
+
+ return 0;
+ }
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++
++static unsigned long find_journal_nvmpg_base(struct bch_nvmpg_head *nvmpg_head,
++ struct cache *ca)
++{
++ unsigned long jnl_offset, jnl_pgoff, jnl_ns_id;
++ unsigned long ret_offset = 0;
++ int i;
++
++ jnl_offset = (unsigned long)ca->sb.d[0];
++ jnl_ns_id = BCH_NVMPG_GET_NS_ID(jnl_offset);
++ jnl_pgoff = BCH_NVMPG_GET_OFFSET(jnl_offset) >> PAGE_SHIFT;
++
++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
++ struct bch_nvmpg_recs *recs;
++ struct bch_nvmpg_rec *rec;
++ unsigned long recs_offset = 0;
++ int j;
++
++ recs_offset = nvmpg_head->recs_offset[i];
++ recs = bch_nvmpg_offset_to_ptr(recs_offset);
++ while (recs) {
++ for (j = 0; j < recs->size; j++) {
++ rec = &recs->recs[j];
++ if ((rec->pgoff != jnl_pgoff) ||
++ (rec->ns_id != jnl_ns_id))
++ continue;
++
++ ret_offset = jnl_offset;
++ goto out;
++ }
++ recs_offset = recs->next_offset;
++ recs = bch_nvmpg_offset_to_ptr(recs_offset);
++ }
++ }
++
++out:
++ return ret_offset;
++}
++
++static unsigned long get_journal_nvmpg_space(struct cache *ca)
++{
++ struct bch_nvmpg_head *head = NULL;
++ unsigned long nvmpg_offset;
++ int order;
++
++ head = bch_get_nvmpg_head(ca->sb.set_uuid);
++ if (head) {
++ nvmpg_offset = find_journal_nvmpg_base(head, ca);
++ if (nvmpg_offset)
++ goto found;
++ }
++
++ order = ilog2((ca->sb.bucket_size *
++ ca->sb.njournal_buckets) / PAGE_SECTORS);
++ nvmpg_offset = bch_nvmpg_alloc_pages(order, ca->sb.set_uuid);
++ if (nvmpg_offset)
++ memset(bch_nvmpg_offset_to_ptr(nvmpg_offset),
++ 0, (1 << order) * PAGE_SIZE);
++found:
++ return nvmpg_offset;
++}
++
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
++static int __bch_journal_nvdimm_init(struct cache *ca)
++{
++ int ret = -1;
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ int i;
++ unsigned long jnl_base = 0;
++
++ jnl_base = get_journal_nvmpg_space(ca);
++ if (!jnl_base) {
++ pr_err("Failed to get journal space from nvdimm\n");
++ goto out;
++ }
++
++ /* Initialized and reloaded from the on-disk super block already */
++ if (ca->sb.d[0] != 0)
++ goto out;
++
++ for (i = 0; i < ca->sb.keys; i++)
++ ca->sb.d[i] = jnl_base + (bucket_bytes(ca) * i);
++
++ ret = 0;
++out:
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
++ return ret;
++}
++
++
++int bch_journal_init(struct cache_set *c)
++{
++ int i, ret = 0;
++ struct cache *ca = c->cache;
++
++ ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
++ 2, SB_JOURNAL_BUCKETS);
++
++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) {
++ for (i = 0; i < ca->sb.keys; i++)
++ ca->sb.d[i] = ca->sb.first_bucket + i;
++ } else
++ ret = __bch_journal_nvdimm_init(ca);
++
++ return ret;
++}
+diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
+index f2ea34d5f431..e3a7fa5a8fda 100644
+--- a/drivers/md/bcache/journal.h
++++ b/drivers/md/bcache/journal.h
+@@ -179,7 +179,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list);
+ void bch_journal_meta(struct cache_set *c, struct closure *cl);
+ int bch_journal_read(struct cache_set *c, struct list_head *list);
+ int bch_journal_replay(struct cache_set *c, struct list_head *list);
+-
++int bch_journal_init(struct cache_set *c);
+ void bch_journal_free(struct cache_set *c);
+ int bch_journal_alloc(struct cache_set *c);
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index e4642e591f23..142ad41e9c15 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -24,6 +24,15 @@
+
+ struct bch_nvmpg_set *global_nvmpg_set;
+
++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id)
++{
++ if ((ns_id >= 0) && (ns_id < BCH_NVMPG_NS_MAX))
++ return global_nvmpg_set->ns_tbl[ns_id];
++
++ pr_emerg("Invalid ns_id: %d\n", ns_id);
++ return NULL;
++}
++
+ void *bch_nvmpg_offset_to_ptr(unsigned long offset)
+ {
+ int ns_id = BCH_NVMPG_GET_NS_ID(offset);
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index 2361cabf18be..f7b7177cced3 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -95,6 +95,7 @@ void bch_nvmpg_exit(void);
+ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid);
+ void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid);
+ struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid);
++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id);
+
+ #else
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 841d08e50191..990d5d6fe199 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -147,9 +147,11 @@ static const char *read_super_common(struct cache_sb *sb, struct block_device *
+ goto err;
+
+ err = "Journal buckets not sequential";
+- for (i = 0; i < sb->keys; i++)
+- if (sb->d[i] != sb->first_bucket + i)
+- goto err;
++ if (!bch_has_feature_nvdimm_meta(sb)) {
++ for (i = 0; i < sb->keys; i++)
++ if (sb->d[i] != sb->first_bucket + i)
++ goto err;
++ }
+
+ err = "Too many journal buckets";
+ if (sb->first_bucket + sb->keys > sb->nbuckets)
+@@ -2061,14 +2063,10 @@ static int run_cache_set(struct cache_set *c)
+ if (bch_journal_replay(c, &journal))
+ goto err;
+ } else {
+- unsigned int j;
+-
+ pr_notice("invalidating existing data\n");
+- ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
+- 2, SB_JOURNAL_BUCKETS);
+-
+- for (j = 0; j < ca->sb.keys; j++)
+- ca->sb.d[j] = ca->sb.first_bucket + j;
++ err = "error initializing journal";
++ if (bch_journal_init(c))
++ goto err;
+
+ bch_initial_gc_finish(c);
+
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-btree/draft/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch b/for-next/nvmpg-bcache-btree/draft/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch
new file mode 100644
index 0000000..6e105c6
--- /dev/null
+++ b/for-next/nvmpg-bcache-btree/draft/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch
@@ -0,0 +1,231 @@
+From a86e90383059c6d2a6972931127180b1fa174fbb Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 24 Jul 2021 00:45:23 +0800
+Subject: [PATCH 10/13] bcache: support storing bcache journal into NVDIMM meta
+ device
+
+This patch implements two methods to store the bcache journal:
+1) __journal_write_unlocked() for a block interface device
+ The legacy method which composes a bio and issues the jset bio to the
+ cache device (e.g. SSD). c->journal.key.ptr[0] indicates the LBA on the
+ cache device where the journal jset is stored.
+2) __journal_nvdimm_write_unlocked() for a memory interface NVDIMM
+ Uses the memory interface to access NVDIMM pages and stores the jset by
+ memcpy_flushcache(). c->journal.key.ptr[0] indicates the linear address
+ of the NVDIMM pages where the journal jset is stored.
+
+For a legacy configuration without an NVDIMM meta device, journal I/O is
+handled by __journal_write_unlocked() with the existing code logic. If the
+NVDIMM meta device is used (set up by bcache-tools), the journal I/O will
+be handled by __journal_nvdimm_write_unlocked() and go into the NVDIMM
+pages.
+
+And when the NVDIMM meta device is used, sb.d[] stores NVDIMM page
+locations rather than bucket indexes, so in journal_reclaim() the
+journaling location in c->journal.key.ptr[0] should also be updated with
+the linear address derived from sb.d[] (no longer an LBA combined from the
+sector offset and bucket index).
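+
+The dispatch added by this patch, in short (config guards omitted):
+
+    if (!bch_has_feature_nvdimm_meta(&ca->sb))
+        __journal_write_unlocked(c);         /* bio to the block device */
+    else
+        __journal_nvdimm_write_unlocked(c);  /* memcpy_flushcache() to NVDIMM */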
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/journal.c | 120 +++++++++++++++++++++++++-----------
+ drivers/md/bcache/super.c | 3 +-
+ 2 files changed, 85 insertions(+), 38 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index d887557c718e..7d5c5ed18890 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -596,6 +596,8 @@ static void do_journal_discard(struct cache *ca)
+ return;
+ }
+
++ BUG_ON(bch_has_feature_nvdimm_meta(&ca->sb));
++
+ switch (atomic_read(&ja->discard_in_flight)) {
+ case DISCARD_IN_FLIGHT:
+ return;
+@@ -661,9 +663,16 @@ static void journal_reclaim(struct cache_set *c)
+ goto out;
+
+ ja->cur_idx = next;
+- k->ptr[0] = MAKE_PTR(0,
+- bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
+- ca->sb.nr_this_dev);
++ if (!bch_has_feature_nvdimm_meta(&ca->sb))
++ k->ptr[0] = MAKE_PTR(0,
++ bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
++ ca->sb.nr_this_dev);
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ else
++ k->ptr[0] = (unsigned long)bch_nvmpg_offset_to_ptr(
++ ca->sb.d[ja->cur_idx]);
++#endif
++
+ atomic_long_inc(&c->reclaimed_journal_buckets);
+
+ bkey_init(k);
+@@ -729,46 +738,21 @@ static void journal_write_unlock(struct closure *cl)
+ spin_unlock(&c->journal.lock);
+ }
+
+-static void journal_write_unlocked(struct closure *cl)
++
++static void __journal_write_unlocked(struct cache_set *c)
+ __releases(c->journal.lock)
+ {
+- struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+- struct cache *ca = c->cache;
+- struct journal_write *w = c->journal.cur;
+ struct bkey *k = &c->journal.key;
+- unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) *
+- ca->sb.block_size;
+-
++ struct journal_write *w = c->journal.cur;
++ struct closure *cl = &c->journal.io;
++ struct cache *ca = c->cache;
+ struct bio *bio;
+ struct bio_list list;
++ unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) *
++ ca->sb.block_size;
+
+ bio_list_init(&list);
+
+- if (!w->need_write) {
+- closure_return_with_destructor(cl, journal_write_unlock);
+- return;
+- } else if (journal_full(&c->journal)) {
+- journal_reclaim(c);
+- spin_unlock(&c->journal.lock);
+-
+- btree_flush_write(c);
+- continue_at(cl, journal_write, bch_journal_wq);
+- return;
+- }
+-
+- c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca));
+-
+- w->data->btree_level = c->root->level;
+-
+- bkey_copy(&w->data->btree_root, &c->root->key);
+- bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
+-
+- w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
+- w->data->magic = jset_magic(&ca->sb);
+- w->data->version = BCACHE_JSET_VERSION;
+- w->data->last_seq = last_seq(&c->journal);
+- w->data->csum = csum_set(w->data);
+-
+ for (i = 0; i < KEY_PTRS(k); i++) {
+ ca = c->cache;
+ bio = &ca->journal.bio;
+@@ -793,7 +777,6 @@ static void journal_write_unlocked(struct closure *cl)
+
+ ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
+ }
+-
+ /* If KEY_PTRS(k) == 0, this jset gets lost in air */
+ BUG_ON(i == 0);
+
+@@ -805,6 +788,71 @@ static void journal_write_unlocked(struct closure *cl)
+
+ while ((bio = bio_list_pop(&list)))
+ closure_bio_submit(c, bio, cl);
++}
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++
++static void __journal_nvdimm_write_unlocked(struct cache_set *c)
++ __releases(c->journal.lock)
++{
++ struct journal_write *w = c->journal.cur;
++ struct cache *ca = c->cache;
++ unsigned int sectors;
++
++ sectors = set_blocks(w->data, block_bytes(ca)) * ca->sb.block_size;
++ atomic_long_add(sectors, &ca->meta_sectors_written);
++
++ memcpy_flushcache((void *)c->journal.key.ptr[0], w->data, sectors << 9);
++
++ c->journal.key.ptr[0] += sectors << 9;
++ ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
++
++ atomic_dec_bug(&fifo_back(&c->journal.pin));
++ bch_journal_next(&c->journal);
++ journal_reclaim(c);
++
++ spin_unlock(&c->journal.lock);
++}
++
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
++static void journal_write_unlocked(struct closure *cl)
++{
++ struct cache_set *c = container_of(cl, struct cache_set, journal.io);
++ struct cache *ca = c->cache;
++ struct journal_write *w = c->journal.cur;
++
++ if (!w->need_write) {
++ closure_return_with_destructor(cl, journal_write_unlock);
++ return;
++ } else if (journal_full(&c->journal)) {
++ journal_reclaim(c);
++ spin_unlock(&c->journal.lock);
++
++ btree_flush_write(c);
++ continue_at(cl, journal_write, bch_journal_wq);
++ return;
++ }
++
++ c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca));
++
++ w->data->btree_level = c->root->level;
++
++ bkey_copy(&w->data->btree_root, &c->root->key);
++ bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
++
++ w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
++ w->data->magic = jset_magic(&ca->sb);
++ w->data->version = BCACHE_JSET_VERSION;
++ w->data->last_seq = last_seq(&c->journal);
++ w->data->csum = csum_set(w->data);
++
++ if (!bch_has_feature_nvdimm_meta(&ca->sb))
++ __journal_write_unlocked(c);
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ else
++ __journal_nvdimm_write_unlocked(c);
++#endif
+
+ continue_at(cl, journal_write_done, NULL);
+ }
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 990d5d6fe199..42fd99406c60 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -1672,7 +1672,7 @@ void bch_cache_set_release(struct kobject *kobj)
+ static void cache_set_free(struct closure *cl)
+ {
+ struct cache_set *c = container_of(cl, struct cache_set, cl);
+- struct cache *ca;
++ struct cache *ca = c->cache;
+
+ debugfs_remove(c->debug);
+
+@@ -1684,7 +1684,6 @@ static void cache_set_free(struct closure *cl)
+ bch_bset_sort_state_free(&c->sort);
+ free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb)));
+
+- ca = c->cache;
+ if (ca) {
+ ca->set = NULL;
+ c->cache = NULL;
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-btree/draft/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch b/for-next/nvmpg-bcache-btree/draft/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch
new file mode 100644
index 0000000..49ed5be
--- /dev/null
+++ b/for-next/nvmpg-bcache-btree/draft/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch
@@ -0,0 +1,181 @@
+From 29b95828f4804806bac44a795cba09ddc0cc0da0 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 24 Jul 2021 00:54:12 +0800
+Subject: [PATCH 11/13] bcache: read jset from NVDIMM pages for journal replay
+
+This patch implements two methods to read the jset from media for journal
+replay,
+- __jnl_rd_bkt() for a block device
+ This is the legacy method to read the jset via the block device interface.
+- __jnl_rd_nvm_bkt() for NVDIMM
+ This is the method to read the jset via the NVDIMM memory interface, i.e.
+ memcpy() from NVDIMM pages to DRAM pages.
+
+If BCH_FEATURE_INCOMPAT_NVDIMM_META is set in the incompat feature set,
+then while the cache set is running, journal_read_bucket() will read the
+journal content from NVDIMM by __jnl_rd_nvm_bkt(). The locations of the
+NVDIMM pages holding the jset are stored in sb.d[SB_JOURNAL_BUCKETS],
+which were initialized and maintained in previous runs of the cache set.
+
+One thing to notice is that when bch_journal_read() is called, the NVDIMM
+page locations for the journal are not loaded and initialized yet, so it
+is necessary to call __bch_journal_nvdimm_init() before reading the jset
+from NVDIMM pages.
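+
+The NVDIMM read path added here is a plain memory copy; a sketch taken
+from __jnl_rd_nvm_bkt() below:
+
+    jset_addr = bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9);
+    memcpy(ca->set->journal.w[0].data, jset_addr, len << 9);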
+
+The code comment added in journal_read_bucket() addresses reports from the
+kernel test robot and Dan Carpenter; it explains why it is safe to only
+check the !bch_has_feature_nvdimm_meta() condition in the if() statement
+when CONFIG_BCACHE_NVM_PAGES is not configured, and avoids confusion from
+the bogus warning message of the static checking tool.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reported-by: kernel test robot <lkp@intel.com>
+Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/journal.c | 88 ++++++++++++++++++++++++++++++-------
+ 1 file changed, 71 insertions(+), 17 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index 7d5c5ed18890..902992be9191 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -34,18 +34,60 @@ static void journal_read_endio(struct bio *bio)
+ closure_put(cl);
+ }
+
++static struct jset *__jnl_rd_bkt(struct cache *ca, unsigned int bkt_idx,
++ unsigned int len, unsigned int offset,
++ struct closure *cl)
++{
++ sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bkt_idx]);
++ struct bio *bio = &ca->journal.bio;
++ struct jset *data = ca->set->journal.w[0].data;
++
++ bio_reset(bio);
++ bio->bi_iter.bi_sector = bucket + offset;
++ bio_set_dev(bio, ca->bdev);
++ bio->bi_iter.bi_size = len << 9;
++
++ bio->bi_end_io = journal_read_endio;
++ bio->bi_private = cl;
++ bio_set_op_attrs(bio, REQ_OP_READ, 0);
++ bch_bio_map(bio, data);
++
++ closure_bio_submit(ca->set, bio, cl);
++ closure_sync(cl);
++
++ /* Indeed journal.w[0].data */
++ return data;
++}
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++
++static struct jset *__jnl_rd_nvm_bkt(struct cache *ca, unsigned int bkt_idx,
++ unsigned int len, unsigned int offset)
++{
++ void *jset_addr;
++ struct jset *data;
++
++ jset_addr = bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9);
++ data = ca->set->journal.w[0].data;
++
++ memcpy(data, jset_addr, len << 9);
++
++ /* Indeed journal.w[0].data */
++ return data;
++}
++
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
+ static int journal_read_bucket(struct cache *ca, struct list_head *list,
+ unsigned int bucket_index)
+ {
+ struct journal_device *ja = &ca->journal;
+- struct bio *bio = &ja->bio;
+
+ struct journal_replay *i;
+- struct jset *j, *data = ca->set->journal.w[0].data;
++ struct jset *j;
+ struct closure cl;
+ unsigned int len, left, offset = 0;
+ int ret = 0;
+- sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
+
+ closure_init_stack(&cl);
+
+@@ -55,26 +97,27 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
+ reread: left = ca->sb.bucket_size - offset;
+ len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS);
+
+- bio_reset(bio);
+- bio->bi_iter.bi_sector = bucket + offset;
+- bio_set_dev(bio, ca->bdev);
+- bio->bi_iter.bi_size = len << 9;
+-
+- bio->bi_end_io = journal_read_endio;
+- bio->bi_private = &cl;
+- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+- bch_bio_map(bio, data);
+-
+- closure_bio_submit(ca->set, bio, &cl);
+- closure_sync(&cl);
++ if (!bch_has_feature_nvdimm_meta(&ca->sb))
++ j = __jnl_rd_bkt(ca, bucket_index, len, offset, &cl);
++ /*
++ * If CONFIG_BCACHE_NVM_PAGES is not defined, the feature bit
++ * BCH_FEATURE_INCOMPAT_NVDIMM_META won't be in the incompat
++ * feature support set, and a cache device formatted with feature bit
++ * BCH_FEATURE_INCOMPAT_NVDIMM_META will fail much earlier in
++ * read_super() by bch_has_unknown_incompat_features().
++ * Therefore when CONFIG_BCACHE_NVM_PAGES is not defined, it is
++ * safe to ignore the bch_has_feature_nvdimm_meta() condition.
++ */
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ else
++ j = __jnl_rd_nvm_bkt(ca, bucket_index, len, offset);
++#endif
+
+ /* This function could be simpler now since we no longer write
+ * journal entries that overlap bucket boundaries; this means
+ * the start of a bucket will always have a valid journal entry
+ * if it has any journal entries at all.
+ */
+-
+- j = data;
+ while (len) {
+ struct list_head *where;
+ size_t blocks, bytes = set_bytes(j);
+@@ -170,6 +213,8 @@ reread: left = ca->sb.bucket_size - offset;
+ return ret;
+ }
+
++static int __bch_journal_nvdimm_init(struct cache *ca);
++
+ int bch_journal_read(struct cache_set *c, struct list_head *list)
+ {
+ #define read_bucket(b) \
+@@ -188,6 +233,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
+ unsigned int i, l, r, m;
+ uint64_t seq;
+
++ /*
++ * The linear addresses of NVDIMM pages for journaling are not
++ * initialized yet; do it before reading the jset from NVDIMM pages.
++ */
++ if (bch_has_feature_nvdimm_meta(&ca->sb)) {
++ if (__bch_journal_nvdimm_init(ca) < 0)
++ return -ENXIO;
++ }
++
+ bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
+ pr_debug("%u journal buckets\n", ca->sb.njournal_buckets);
+
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-btree/draft/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch b/for-next/nvmpg-bcache-btree/draft/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch
new file mode 100644
index 0000000..e35c696
--- /dev/null
+++ b/for-next/nvmpg-bcache-btree/draft/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch
@@ -0,0 +1,84 @@
+From 286f425617ba71c2ff30930d010e0808dc41d953 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 24 Jul 2021 00:55:25 +0800
+Subject: [PATCH 12/13] bcache: add sysfs interface register_nvdimm_meta to
+ register NVDIMM meta device
+
+This patch adds a sysfs interface register_nvdimm_meta to register an
+NVDIMM meta device. The sysfs interface file only shows up when
+CONFIG_BCACHE_NVM_PAGES=y. Then an NVDIMM namespace formatted by
+bcache-tools can be registered with bcache by e.g.,
+ echo /dev/pmem0 > /sys/fs/bcache/register_nvdimm_meta
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/super.c | 29 +++++++++++++++++++++++++++++
+ 1 file changed, 29 insertions(+)
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 42fd99406c60..723791250070 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -2398,10 +2398,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
+ struct kobj_attribute *attr,
+ const char *buffer, size_t size);
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++static ssize_t register_nvdimm_meta(struct kobject *k,
++ struct kobj_attribute *attr,
++ const char *buffer, size_t size);
++#endif
+
+ kobj_attribute_write(register, register_bcache);
+ kobj_attribute_write(register_quiet, register_bcache);
+ kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup);
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++kobj_attribute_write(register_nvdimm_meta, register_nvdimm_meta);
++#endif
+
+ static bool bch_is_open_backing(dev_t dev)
+ {
+@@ -2515,6 +2523,24 @@ static void register_device_async(struct async_reg_args *args)
+ queue_delayed_work(system_wq, &args->reg_work, 10);
+ }
+
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++static ssize_t register_nvdimm_meta(struct kobject *k, struct kobj_attribute *attr,
++ const char *buffer, size_t size)
++{
++ ssize_t ret = size;
++
++ struct bch_nvmpg_ns *ns = bch_register_namespace(buffer);
++
++ if (IS_ERR(ns)) {
++ pr_err("register nvdimm namespace %s for meta device failed.\n",
++ buffer);
++ ret = -EINVAL;
++ }
++
++ return ret;
++}
++#endif
++
+ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+ const char *buffer, size_t size)
+ {
+@@ -2857,6 +2883,9 @@ static int __init bcache_init(void)
+ static const struct attribute *files[] = {
+ &ksysfs_register.attr,
+ &ksysfs_register_quiet.attr,
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ &ksysfs_register_nvdimm_meta.attr,
++#endif
+ &ksysfs_pendings_cleanup.attr,
+ NULL
+ };
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-btree/draft/0013-store-btree-node-in-nvdimm.patch b/for-next/nvmpg-bcache-btree/draft/0013-store-btree-node-in-nvdimm.patch
new file mode 100644
index 0000000..18fdf37
--- /dev/null
+++ b/for-next/nvmpg-bcache-btree/draft/0013-store-btree-node-in-nvdimm.patch
@@ -0,0 +1,489 @@
+From b0344cea65a7c816dbad1d4684a96dca929d8344 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 21 Oct 2021 22:54:20 +0800
+Subject: [PATCH 13/13] store btree node in nvdimm
+
+---
+ drivers/md/bcache/alloc.c | 67 +++++++++++++++---
+ drivers/md/bcache/bcache.h | 3 +-
+ drivers/md/bcache/bcache_ondisk.h | 2 +-
+ drivers/md/bcache/btree.c | 114 ++++++++++++++++++++++++++++--
+ drivers/md/bcache/nvmpg.c | 50 +++++++++++++
+ drivers/md/bcache/nvmpg.h | 52 ++++++++++++++
+ drivers/md/bcache/super.c | 3 +-
+ 7 files changed, 273 insertions(+), 18 deletions(-)
+
+diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
+index 097577ae3c47..9bdd6ee9e886 100644
+--- a/drivers/md/bcache/alloc.c
++++ b/drivers/md/bcache/alloc.c
+@@ -63,6 +63,7 @@
+
+ #include "bcache.h"
+ #include "btree.h"
++#include "nvmpg.h"
+
+ #include <linux/blkdev.h>
+ #include <linux/kthread.h>
+@@ -477,12 +478,28 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b)
+ }
+ }
+
++void __bch_nvmpg_bucket_free(struct cache_set *c, struct bkey *k)
++{
++ int order;
++ unsigned long nvmpg_offset;
++
++ order = ilog2(c->cache->sb.bucket_size / PAGE_SECTORS);
++ nvmpg_offset = bkey_offset_to_nvmpg_offset(PTR_OFFSET(k, 0));
++ bch_nvmpg_free_pages(nvmpg_offset, order, c->set_uuid);
++}
++
+ void bch_bucket_free(struct cache_set *c, struct bkey *k)
+ {
+ unsigned int i;
+
++ if (KEY_NVMPG(k)) {
++ __bch_nvmpg_bucket_free(c, k);
++ return;
++ }
++
+ for (i = 0; i < KEY_PTRS(k); i++)
+ __bch_bucket_free(c->cache, PTR_BUCKET(c, k, i));
++ return;
+ }
+
+ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
+@@ -517,15 +534,31 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
+ return -1;
+ }
+
+-int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
+- struct bkey *k, bool wait)
++int __bch_nvmpg_bucket_alloc(struct cache_set *c, struct bkey *k)
+ {
+- int ret;
++	struct cache *ca = c->cache;
++ unsigned long nvmpg_offset, bkey_offset;
++ int order;
+
+- mutex_lock(&c->bucket_lock);
+- ret = __bch_bucket_alloc_set(c, reserve, k, wait);
+- mutex_unlock(&c->bucket_lock);
+- return ret;
++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
++ return -1;
++
++ lockdep_assert_held(&c->bucket_lock);
++
++ order = ilog2(ca->sb.bucket_size / PAGE_SECTORS);
++ nvmpg_offset = bch_nvmpg_alloc_pages(order, c->set_uuid);
++ if (!nvmpg_offset)
++ goto err;
++
++ bkey_offset = nvmpg_offset_to_bkey_offset(nvmpg_offset);
++
++ bkey_init(k);
++ k->ptr[0] = MAKE_PTR(0, bkey_offset, ca->sb.nr_this_dev);
++
++	SET_KEY_PTRS(k, 1);
++	/* Mark this bkey as pointing into NVDIMM pages */
++	SET_KEY_NVMPG(k, 1);
++ return 0;
++err:
++ return -1;
+ }
+
+ /* Sector allocator */
+@@ -537,6 +570,23 @@ struct open_bucket {
+ BKEY_PADDED(key);
+ };
+
++int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
++ struct bkey *k, bool wait, int bucket_type)
++{
++ int ret;
++
++ if (bucket_type == BCH_DATA_BUCKET) {
++ mutex_lock(&c->bucket_lock);
++ ret = __bch_bucket_alloc_set(c, reserve, k, wait);
++ mutex_unlock(&c->bucket_lock);
++ } else {
++ ret = __bch_nvmpg_bucket_alloc(c, k);
++ }
++
++ return ret;
++}
++
++
+ /*
+ * We keep multiple buckets open for writes, and try to segregate different
+ * write streams for better cache utilization: first we try to segregate flash
+@@ -631,7 +681,8 @@ bool bch_alloc_sectors(struct cache_set *c,
+
+ spin_unlock(&c->data_bucket_lock);
+
+- if (bch_bucket_alloc_set(c, watermark, &alloc.key, wait))
++ if (bch_bucket_alloc_set(c, watermark, &alloc.key,
++ wait, BCH_DATA_BUCKET))
+ return false;
+
+ spin_lock(&c->data_bucket_lock);
+diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
+index 9ed9c955add7..d54c3c3d8d7e 100644
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -979,11 +979,12 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait);
+ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
+ struct bkey *k, bool wait);
+ int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
+- struct bkey *k, bool wait);
++ struct bkey *k, bool wait, int bucket_type);
+ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k,
+ unsigned int sectors, unsigned int write_point,
+ unsigned int write_prio, bool wait);
+ bool bch_cached_dev_error(struct cached_dev *dc);
++int __bch_nvmpg_bucket_alloc(struct cache_set *c, struct bkey *k);
+
+ __printf(2, 3)
+ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...);
+diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h
+index 97413586195b..6c890f632197 100644
+--- a/drivers/md/bcache/bcache_ondisk.h
++++ b/drivers/md/bcache/bcache_ondisk.h
+@@ -45,7 +45,7 @@ static inline void SET_##name(struct bkey *k, unsigned int i, __u64 v) \
+ KEY_FIELD(KEY_PTRS, high, 60, 3)
+ KEY_FIELD(__PAD0, high, 58, 2)
+ KEY_FIELD(KEY_CSUM, high, 56, 2)
+-KEY_FIELD(__PAD1, high, 55, 1)
++KEY_FIELD(KEY_NVMPG, high, 55, 1)
+ KEY_FIELD(KEY_DIRTY, high, 36, 1)
+
+ KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS)
+diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
+index f7f844c321c3..b8854905b93e 100644
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -25,6 +25,8 @@
+ #include "btree.h"
+ #include "debug.h"
+ #include "extents.h"
++#include "features.h"
++#include "nvmpg.h"
+
+ #include <linux/slab.h>
+ #include <linux/bitops.h>
+@@ -240,14 +242,12 @@ static void btree_node_read_endio(struct bio *bio)
+ closure_put(cl);
+ }
+
+-static void bch_btree_node_read(struct btree *b)
++static void __bch_btree_node_read(struct btree *b)
+ {
+ uint64_t start_time = local_clock();
+ struct closure cl;
+ struct bio *bio;
+
+- trace_bcache_btree_read(b);
+-
+ closure_init_stack(&cl);
+
+ bio = bch_bbio_alloc(b->c);
+@@ -278,6 +278,35 @@ static void bch_btree_node_read(struct btree *b)
+ PTR_BUCKET_NR(b->c, &b->key, 0));
+ }
+
++static void __bch_nvmpg_btree_node_read(struct btree *b)
++{
++ uint64_t start_time = local_clock();
++ void *ptr;
++
++ ptr = bkey_offset_to_nvmpg_ptr(PTR_OFFSET(&b->key, 0));
++ memcpy(b->keys.set[0].data, ptr, KEY_SIZE(&b->key) << 9);
++
++ if (btree_node_io_error(b))
++ goto err;
++
++ bch_btree_node_read_done(b);
++	bch_time_stats_update(&b->c->btree_read_time, start_time);
++	return;
++
++err:
++ bch_cache_set_error(b->c,
++ "io error reading NVDIMM pages at 0x%p\n", ptr);
++}
++
++static void bch_btree_node_read(struct btree *b)
++{
++ trace_bcache_btree_read(b);
++
++ if (!KEY_NVMPG(&b->key))
++ __bch_btree_node_read(b);
++ else
++ __bch_nvmpg_btree_node_read(b);
++}
++
+ static void btree_complete_write(struct btree *b, struct btree_write *w)
+ {
+ if (w->prio_blocked &&
+@@ -335,7 +364,7 @@ static void btree_node_write_endio(struct bio *bio)
+ closure_put(cl);
+ }
+
+-static void do_btree_node_write(struct btree *b)
++static void __do_btree_node_write(struct btree *b)
+ {
+ struct closure *cl = &b->io;
+ struct bset *i = btree_bset_last(b);
+@@ -400,6 +429,68 @@ static void do_btree_node_write(struct btree *b)
+ }
+ }
+
++static void btree_nvmpg_complete_write(struct btree *b, struct btree_write *w)
++{
++ atomic_sub(w->prio_blocked, &b->c->prio_blocked);
++
++ if (w->journal) {
++ atomic_dec_bug(w->journal);
++ __closure_wake_up(&b->c->journal.wait);
++ }
++
++ w->prio_blocked = 0;
++ w->journal = NULL;
++}
++
++static void btree_nvmpg_node_write_done(struct closure *cl)
++{
++ struct btree *b = container_of(cl, struct btree, io);
++ struct btree_write *w = btree_prev_write(b);
++
++ btree_nvmpg_complete_write(b, w);
++
++ if (btree_node_dirty(b))
++ queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
++
++ closure_return_with_destructor(cl, btree_node_write_unlock);
++}
++
++static void __do_nvmpg_btree_node_write(struct btree *b)
++{
++ struct closure *cl = &b->io;
++ struct bset *i = btree_bset_last(b);
++ unsigned long nvmpg_offset;
++ void *nvmpg_ptr;
++
++ i->version = BCACHE_BSET_VERSION;
++ i->csum = btree_csum_set(b, i);
++
++ BUG_ON(b->bio);
++
++ /* Calculate location to write */
++ nvmpg_offset = bkey_offset_to_nvmpg_offset(PTR_OFFSET(&b->key, 0));
++ nvmpg_offset += roundup(set_bytes(i), block_bytes(b->c->cache));
++ nvmpg_ptr = bch_nvmpg_offset_to_ptr(nvmpg_offset);
++
++ memcpy_flushcache(nvmpg_ptr, i,
++ roundup(set_bytes(i), block_bytes(b->c->cache)) << 9);
++
++	/* Update b->key to the written location */
++ SET_PTR_OFFSET(&b->key, 0,
++ nvmpg_offset_to_bkey_offset(nvmpg_offset));
++
++ closure_sync(cl);
++ continue_at_nobarrier(cl, btree_nvmpg_node_write_done, NULL);
++}
++
++static void do_btree_node_write(struct btree *b)
++{
++ if (!KEY_NVMPG(&b->key))
++ __do_btree_node_write(b);
++ else
++ __do_nvmpg_btree_node_write(b);
++}
++
+ void __bch_btree_node_write(struct btree *b, struct closure *parent)
+ {
+ struct bset *i = btree_bset_last(b);
+@@ -1094,10 +1185,19 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+
+ mutex_lock(&c->bucket_lock);
+ retry:
+- if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait))
++ /*
++	 * If the nvdimm_meta feature is enabled, try to allocate the btree
++	 * node from NVDIMM pages; on success the KEY_NVMPG bit is set.
++ */
++ if (bch_has_feature_nvdimm_meta(&(c->cache->sb)))
++ __bch_nvmpg_bucket_alloc(c, &k.key);
++
++ if (!KEY_NVMPG(&k.key) &&
++ __bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait))
+ goto err;
+
+- bkey_put(c, &k.key);
++ if (!KEY_NVMPG(&k.key))
++ bkey_put(c, &k.key);
+ SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
+
+ b = mca_alloc(c, op, &k.key, level);
+@@ -1118,7 +1218,7 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+ trace_bcache_btree_node_alloc(b);
+ return b;
+ err_free:
+- bch_bucket_free(c, &k.key);
++ bch_bucket_free(c, &k.key);
+ err:
+ mutex_unlock(&c->bucket_lock);
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index 142ad41e9c15..12d67e535854 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -91,6 +91,56 @@ static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r)
+ return bch_nvmpg_pgoff_to_ptr(ns, pgoff);
+ }
+
++static void bug_on_bkey_offset_limit(unsigned long sector)
++{
++ if (sector >= ((1UL << BCH_BKEY_OFFSET_BITS) - 1)) {
++ pr_err("Invalid NVDIMM offset: too large as 0x%lx\n",
++ sector);
++ pr_err("Such condition should never happen. Panic.\n");
++ BUG();
++ }
++}
++
++int bkey_offset_to_nvmpg_ns_id(unsigned long bkey_offset)
++{
++ return (bkey_offset >> BCH_BKEY_OFFSET_BITS) &
++ BCH_BKEY_OFFSET_NS_ID_MASK;
++}
++
++unsigned long bkey_offset_to_nvmpg_offset(unsigned long bkey_offset)
++{
++ int ns_id;
++ unsigned long offset;
++
++ ns_id = (bkey_offset >> BCH_BKEY_OFFSET_BITS) &
++ BCH_BKEY_OFFSET_NS_ID_MASK;
++
++ offset = (bkey_offset & BCH_BKEY_OFFSET_MASK) << 9;
++
++ return BCH_NVMPG_OFFSET(ns_id, offset);
++}
++
++unsigned long nvmpg_offset_to_bkey_offset(unsigned long nvmpg_offset)
++{
++ int ns_id;
++ unsigned long sector;
++
++ ns_id = BCH_NVMPG_GET_NS_ID(nvmpg_offset);
++ sector = BCH_NVMPG_GET_OFFSET(nvmpg_offset) >> 9;
++ bug_on_bkey_offset_limit(sector);
++
++ return ((sector & BCH_BKEY_OFFSET_MASK) |
++ ((ns_id & BCH_BKEY_OFFSET_NS_ID_MASK) << BCH_BKEY_OFFSET_BITS));
++}
++
++void *bkey_offset_to_nvmpg_ptr(unsigned long bkey_offset)
++{
++ unsigned long nvmpg_offset;
++
++ nvmpg_offset = bkey_offset_to_nvmpg_offset(bkey_offset);
++ return bch_nvmpg_offset_to_ptr(nvmpg_offset);
++}
++
+ static inline void reserve_nvmpg_pages(struct bch_nvmpg_ns *ns,
+ pgoff_t pgoff, u64 nr)
+ {
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index f7b7177cced3..7f6d8e6f9dff 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -84,6 +84,21 @@ struct bch_nvmpg_set {
+ (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \
+ sizeof(struct bch_nvmpg_recs)))
+
++
++/* For bkey PTR_OFFSET to nvmpg namespace ID and offset conversion.
++ *
++ * PTR_OFFSET is 43 bits: the most significant 3 bits are the
++ * namespace ID, and the remaining 40 bits are the per-namespace
++ * offset in sectors.
++ */
++#define BCH_BKEY_OFFSET_NS_ID_BITS 3
++#define BCH_BKEY_OFFSET_NS_ID_MASK ((1UL<<BCH_BKEY_OFFSET_NS_ID_BITS) - 1)
++#define BCH_BKEY_OFFSET_BITS 40
++#define BCH_BKEY_OFFSET_MASK ((1UL<<BCH_BKEY_OFFSET_BITS) - 1)
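++
++/*
++ * Illustrative example (not part of the original patch): for ns_id 1
++ * and a byte offset of 0x2000 (sector 0x10), the two encodings are
++ * expected to be,
++ *   nvmpg offset: BCH_NVMPG_OFFSET(1, 0x2000) == (1UL << 61) | 0x2000
++ *   bkey offset:  (1UL << BCH_BKEY_OFFSET_BITS) | 0x10
++ * nvmpg_offset_to_bkey_offset() and bkey_offset_to_nvmpg_offset()
++ * convert between these two forms.
++ */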
++
++#define BCH_DATA_BUCKET 0
++#define BCH_META_BUCKET 1
++
+ void *bch_nvmpg_offset_to_ptr(unsigned long offset);
+ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr);
+
+@@ -96,6 +111,12 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid);
+ void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid);
+ struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid);
+ struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id);
++void *bkey_offset_to_nvmpg_ptr(unsigned long bkey_offset);
++unsigned long nvmpg_offset_to_bkey_offset(unsigned long nvmpg_offset);
++unsigned long bkey_offset_to_nvmpg_offset(unsigned long bkey_offset);
+
+ #else
+
+@@ -123,6 +144,37 @@ static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid)
+ return NULL;
+ }
+
++static inline void *bkey_offset_to_nvmpg_ptr(unsigned long bkey_offset)
++{
++ return NULL;
++}
++
++static inline struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id)
++{
++ return NULL;
++}
++
++static inline unsigned long nvmpg_offset_to_bkey_offset(unsigned long nvmpg_offset)
++{
++ return 0;
++}
++
++static inline unsigned long bkey_offset_to_nvmpg_offset(unsigned long bkey_offset)
++{
++ return 0;
++}
++
++static inline void *bch_nvmpg_offset_to_ptr(unsigned long offset)
++{
++ return NULL;
++}
++
++static inline unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr)
++{
++ return 0;
++}
++
++
+ #endif /* CONFIG_BCACHE_NVM_PAGES */
+
+ #endif /* _BCACHE_NVM_PAGES_H */
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 723791250070..64b517e8136a 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -512,7 +512,8 @@ static int __uuid_write(struct cache_set *c)
+ closure_init_stack(&cl);
+ lockdep_assert_held(&bch_register_lock);
+
+- if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true))
++ if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key,
++ true, BCH_META_BUCKET))
+ return 1;
+
+ size = meta_bucket_pages(&ca->sb) * PAGE_SECTORS;
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0001-bcache-add-initial-data-structures-for-nvm-pages.patch b/for-next/nvmpg-bcache-journaling-v13/old/0001-bcache-add-initial-data-structures-for-nvm-pages.patch
new file mode 100644
index 0000000..a3700f6
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/old/0001-bcache-add-initial-data-structures-for-nvm-pages.patch
@@ -0,0 +1,343 @@
+From bbb3b719dfc6070a5807bf6494f858e9e2f4f609 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 26 Jul 2021 00:26:28 +0800
+Subject: [PATCH 01/12] bcache: add initial data structures for nvm pages
+
+This patch initializes the prototype data structures for the nvm pages
+allocator:
+
+- struct bch_nvmpg_sb
+ This is the super block allocated on each nvdimm namespace for the nvm
+pages allocator. An nvdimm pages allocator set may have multiple
+namespaces; bch_nvmpg_sb->set_uuid is used to mark which nvdimm set
+this namespace belongs to.
+
+- struct bch_nvmpg_header
+ This is a table for all heads of all allocation record lists. An allo-
+cation record list traces all page(s) allocated from nvdimm namespace(s)
+to a specific requester (identified by uuid). After system reboot, a
+requester can retrieve all previously allocated nvdimm pages from its
+record list by a pre-defined uuid.
+
+- struct bch_nvmpg_head
+ This is a head of an allocation record list. Each nvdimm pages
+requester (typically it's a driver) has and only has one allocation
+record list, and an allocated nvdimm page only belongs to a specific
+allocation record list. Member uuid[] will be set as the requester's
+uuid, e.g. for bcache it is the cache set uuid. Member label is not
+mandatory, it is a human-readable string for debug purpose. The nvm
+offset format pointers recs_offset[] point to the location of actual
+allocator record lists on each namespace of the nvdimm pages allocator
+set. Each per namespace record list is represented by the following
+struct bch_nvmpg_recs.
+
+- struct bch_nvmpg_recs
+ This structure represents a requester's allocation record list. Member
+uuid is the same value as the uuid of its corresponding struct
+bch_nvmpg_head. Member recs[] is a table of struct bch_pgalloc_rec
+objects to trace all allocated nvdimm pages. If the table recs[] is
+full, the nvmpg format offset is a pointer pointing to the next struct
+bch_nvmpg_recs object, where the nvm pages allocator will look for an
+available free allocation record. All the linked struct bch_nvmpg_recs
+objects compose a requester's allocation record list which is headed
+by the above struct bch_nvmpg_head.
+
+- struct bch_nvmpg_rec
+ This structure records a range of allocated nvdimm pages. Member pgoff
+is offset in unit of page size of this allocation range. Member order
+indicates size of the allocation range by (1 << order) in unit of page
+size. Because the nvdimm pages allocator set may have multiple nvdimm
+namespaces, member ns_id is used to identify which namespace the pgoff
+belongs to.
+ - Bits 0 - 51: pgoff - is pages offset of the allocated pages.
+ - Bits 52 - 57: order - allocated size in page_size * order-of-2
+ - Bits 58 - 60: ns_id - identify which namespace the pages stays on
+ - Bits 61 - 63: reserved.
+Since each allocation is a power-of-2 number of pages, the 6-bit order
+field can describe up to (1 << 63) pages, i.e. roughly a 76-bit wide
+range in bytes for a 4KB page size, which is large enough currently.
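+
+As an illustration (not part of the original description, and assuming
+the usual little-endian bitfield layout), a record for 8 pages (order 3)
+starting at page offset 0x1000 on namespace 0 packs into one 64-bit
+value as,
+
+	struct bch_nvmpg_rec r = {
+		.pgoff = 0x1000,	/* bits 0 - 51  */
+		.order = 3,		/* bits 52 - 57 */
+		.ns_id = 0,		/* bits 58 - 60 */
+	};
+	/* r._v == 0x1000UL | (3UL << 52) */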
+
+All the structure members having _offset suffix are in a special format.
+E.g. bch_nvmpg_sb.{sb_offset, pages_offset, set_header_offset},
+bch_nvmpg_head.recs_offset, bch_nvmpg_recs.{head_offset, next_offset},
+the offset value is 64 bits wide: the most significant 3 bits are used
+to identify which namespace this offset belongs to, and the remaining
+61 bits are the actual offset inside that namespace. Following patches
+will have helper routines to do the conversion between memory pointer
+and offset.
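+
+A minimal sketch of that conversion, assuming the in-memory helpers the
+later patches introduce (BCH_NVMPG_GET_NS_ID, BCH_NVMPG_GET_OFFSET and
+a per-namespace base_addr), could look like,
+
+	/* illustrative only, mirrors bch_nvmpg_offset_to_ptr() */
+	static void *nvmpg_offset_to_ptr(unsigned long offset)
+	{
+		int ns_id = BCH_NVMPG_GET_NS_ID(offset);
+		struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[ns_id];
+
+		return ns->base_addr + BCH_NVMPG_GET_OFFSET(offset);
+	}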
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Ying Huang <ying.huang@intel.com>
+---
+ drivers/md/bcache/nvmpg_format.h | 253 +++++++++++++++++++++++++++++++
+ 1 file changed, 253 insertions(+)
+ create mode 100644 drivers/md/bcache/nvmpg_format.h
+
+diff --git a/drivers/md/bcache/nvmpg_format.h b/drivers/md/bcache/nvmpg_format.h
+new file mode 100644
+index 000000000000..e9eb6371fd78
+--- /dev/null
++++ b/drivers/md/bcache/nvmpg_format.h
+@@ -0,0 +1,253 @@
++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
++
++#ifndef _NVMPG_FORMAT_H
++#define _NVMPG_FORMAT_H
++
++/*
++ * Bcache on NVDIMM data structures
++ */
++
++/*
++ * - struct bch_nvmpg_sb
++ * This is the super block allocated on each nvdimm namespace for the nvm
++ * pages allocator. A nvdimm pages allocator set may have multiple namespaces,
++ * bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this name space
++ * belongs to.
++ *
++ * - struct bch_nvmpg_header
++ * This is a table for all heads of all allocation record lists. An allo-
++ * cation record list traces all page(s) allocated from nvdimm namespace(s) to
++ * a specific requester (identified by uuid). After system reboot, a requester
++ * can retrieve all previously allocated nvdimm pages from its record list by a
++ * pre-defined uuid.
++ *
++ * - struct bch_nvmpg_head
++ * This is a head of an allocation record list. Each nvdimm pages requester
++ * (typically it's a driver) has and only has one allocation record list, and
++ * an allocated nvdimm page only belongs to a specific allocation record list.
++ * Member uuid[] will be set as the requester's uuid, e.g. for bcache it is the
++ * cache set uuid. Member label is not mandatory, it is a human-readable string
++ * for debug purpose. The nvm offset format pointers recs_offset[] point to the
++ * location of actual allocator record lists on each name space of the nvdimm
++ * pages allocator set. Each per name space record list is represented by the
++ * following struct bch_nvmpg_recs.
++ *
++ * - struct bch_nvmpg_recs
++ * This structure represents a requester's allocation record list. Member uuid
++ * is the same value as the uuid of its corresponding struct bch_nvmpg_head.
++ * Member recs[] is a table of struct bch_pgalloc_rec objects to trace all
++ * allocated nvdimm pages. If the table recs[] is full, the nvmpg format offset
++ * is a pointer pointing to the next struct bch_nvmpg_recs object, where the
++ * nvm pages allocator will look for an available free allocation record. All
++ * the linked
++ * struct bch_nvmpg_recs objects compose a requester's allocation record list
++ * which is headed by the above struct bch_nvmpg_head.
++ *
++ * - struct bch_nvmpg_rec
++ * This structure records a range of allocated nvdimm pages. Member pgoff is
++ * offset in unit of page size of this allocation range. Member order indicates
++ * size of the allocation range by (1 << order) in unit of page size. Because
++ * the nvdimm pages allocator set may have multiple nvdimm name spaces, member
++ * ns_id is used to identify which name space the pgoff belongs to.
++ *
++ * All allocation record lists are stored on the first initialized nvdimm name-
++ * space (ns_id 0). The meta data default layout of nvm pages allocator on
++ * namespace 0 is,
++ *
++ * 0 +---------------------------------+
++ * | |
++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET
++ * | bch_nvmpg_sb |
++ * 8KB +---------------------------------+ <-- BCH_NVMPG_RECLIST_HEAD_OFFSET
++ * | bch_nvmpg_header |
++ * | |
++ * 16KB +---------------------------------+ <-- BCH_NVMPG_SYSRECS_OFFSET
++ * | bch_nvmpg_recs |
++ * | (nvm pages internal usage) |
++ * 24KB +---------------------------------+
++ * | |
++ * | |
++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START
++ * | allocable nvm pages |
++ * | for buddy allocator |
++ * end +---------------------------------+
++ *
++ *
++ *
++ * Meta data default layout on the remaining nvdimm namespaces,
++ *
++ * 0 +---------------------------------+
++ * | |
++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET
++ * | bch_nvmpg_sb |
++ * 8KB +---------------------------------+
++ * | |
++ * | |
++ * | |
++ * | |
++ * | |
++ * | |
++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START
++ * | allocable nvm pages |
++ * | for buddy allocator |
++ * end +---------------------------------+
++ *
++ *
++ * - The nvmpg offset format pointer
++ * All member names ending with _offset in this header are nvmpg offset
++ * format pointer. The offset format is,
++ * [highest 3 bits: ns_id]
++ *    [remaining 61 bits: offset inside namespace No. ns_id]
++ *
++ * The above offset is byte unit, the procedure to reference a nvmpg offset
++ * format pointer is,
++ * 1) Identify the namespace related in-memory structure by ns_id from the
++ * highest 3 bits of offset value.
++ * 2) Get the DAX mapping base address from the in-memory structure.
++ * 3) Calculate the actual memory address on nvdimm by adding the offset
++ *    value from the remaining low 61 bits to the DAX base address.
++ * All related in-memory structure and conversion routines don't belong to
++ * user space api, they are defined by nvm-pages allocator code in
++ * drivers/md/bcache/nvm-pages.{c,h}
++ *
++ */
++
++#include <linux/types.h>
++
++/* In bytes */
++#define BCH_NVMPG_SB_OFFSET 4096
++#define BCH_NVMPG_START (16 << 20)
++
++#define BCH_NVMPG_LBL_SIZE 32
++#define BCH_NVMPG_NS_MAX 8
++
++#define BCH_NVMPG_RECLIST_HEAD_OFFSET (8<<10)
++#define BCH_NVMPG_SYSRECS_OFFSET (16<<10)
++
++#define BCH_NVMPG_SB_VERSION 0
++#define BCH_NVMPG_SB_VERSION_MAX 0
++
++static const __u8 bch_nvmpg_magic[] = {
++ 0x17, 0xbd, 0x53, 0x7f, 0x1b, 0x23, 0xd6, 0x83,
++ 0x46, 0xa4, 0xf8, 0x28, 0x17, 0xda, 0xec, 0xa9 };
++static const __u8 bch_nvmpg_recs_magic[] = {
++ 0x39, 0x25, 0x3f, 0xf7, 0x27, 0x17, 0xd0, 0xb9,
++ 0x10, 0xe6, 0xd2, 0xda, 0x38, 0x68, 0x26, 0xae };
++
++/* takes 64bit width */
++struct bch_nvmpg_rec {
++ union {
++ struct {
++ __u64 pgoff:52;
++ __u64 order:6;
++ __u64 ns_id:3;
++ __u64 reserved:3;
++ };
++ __u64 _v;
++ };
++};
++
++struct bch_nvmpg_recs {
++ union {
++ struct {
++ /*
++ * A nvmpg offset format pointer to
++ * struct bch_nvmpg_head
++ */
++ __u64 head_offset;
++ /*
++ * A nvmpg offset format pointer to
++ * struct bch_nvm_pgalloc_recs which contains
++ * the next recs[] array.
++ */
++ __u64 next_offset;
++ __u8 magic[16];
++ __u8 uuid[16];
++ __u32 size;
++ __u32 used;
++ __u64 _pad[4];
++ struct bch_nvmpg_rec recs[];
++ };
++ __u8 pad[8192];
++ };
++};
++
++#define BCH_NVMPG_MAX_RECS \
++ ((sizeof(struct bch_nvmpg_recs) - \
++ offsetof(struct bch_nvmpg_recs, recs)) / \
++ sizeof(struct bch_nvmpg_rec))
++
++#define BCH_NVMPG_HD_STAT_FREE 0x0
++#define BCH_NVMPG_HD_STAT_ALLOC 0x1
++struct bch_nvmpg_head {
++ __u8 uuid[16];
++ __u8 label[BCH_NVMPG_LBL_SIZE];
++ __u32 state;
++ __u32 flags;
++ /*
++ * Array of offset values from the nvmpg offset format
++ * pointers, each of the pointer points to a per-namespace
++ * struct bch_nvmpg_recs.
++ */
++ __u64 recs_offset[BCH_NVMPG_NS_MAX];
++};
++
++/* heads[0] is always for nvm_pages internal usage */
++struct bch_nvmpg_set_header {
++ union {
++ struct {
++ __u32 size;
++ __u32 used;
++ __u64 _pad[4];
++ struct bch_nvmpg_head heads[];
++ };
++ __u8 pad[8192];
++ };
++};
++
++#define BCH_NVMPG_MAX_HEADS \
++ ((sizeof(struct bch_nvmpg_set_header) - \
++ offsetof(struct bch_nvmpg_set_header, heads)) / \
++ sizeof(struct bch_nvmpg_head))
++
++/* The on-media bit order is local CPU order */
++struct bch_nvmpg_sb {
++ __u64 csum;
++ __u64 sb_offset;
++ __u64 ns_start;
++ __u64 version;
++ __u8 magic[16];
++ __u8 uuid[16];
++ __u32 page_size;
++ __u32 total_ns;
++ __u32 this_ns;
++ union {
++ __u8 set_uuid[16];
++ __u64 set_magic;
++ };
++
++ __u64 flags;
++ __u64 seq;
++
++ __u64 feature_compat;
++ __u64 feature_incompat;
++ __u64 feature_ro_compat;
++
++ /* For allocable nvm pages from buddy systems */
++ __u64 pages_offset;
++ __u64 pages_total;
++
++ __u64 pad[8];
++
++ /*
++ * A nvmpg offset format pointer, it points
++ * to struct bch_nvmpg_set_header which is
++ * stored only on the first name space.
++ */
++ __u64 set_header_offset;
++
++ /* Just for csum_set() */
++ __u32 keys;
++ __u64 d[0];
++};
++
++#endif /* _NVMPG_FORMAT_H */
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0002-bcache-initialize-the-nvm-pages-allocator.patch b/for-next/nvmpg-bcache-journaling-v13/old/0002-bcache-initialize-the-nvm-pages-allocator.patch
new file mode 100644
index 0000000..ff4445c
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/old/0002-bcache-initialize-the-nvm-pages-allocator.patch
@@ -0,0 +1,542 @@
+From a13fa68537fa67df106e366c0e1cd35d4e715feb Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Mon, 26 Jul 2021 10:33:30 +0800
+Subject: [PATCH 02/12] bcache: initialize the nvm pages allocator
+
+This patch defines the prototype data structures in memory and
+initializes the nvm pages allocator.
+
+The nvm address space managed by this allocator can consist of many
+nvm namespaces, and several namespaces can be composed into one nvm
+set, similar to a cache set. For this initial implementation, only one
+set is supported.
+
+The users of this nvm pages allocator need to call
+bch_register_namespace() to register the nvdimm device (like
+/dev/pmemX) into this allocator as an instance of struct bch_nvmpg_ns.
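+
+A rough usage sketch (illustrative only, error handling trimmed), as
+the sysfs registration code later in this series does,
+
+	struct bch_nvmpg_ns *ns;
+
+	ns = bch_register_namespace("/dev/pmem0");
+	if (IS_ERR(ns))
+		pr_err("failed to register nvdimm namespace\n");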
+
+Reported-by: Randy Dunlap <rdunlap@infradead.org>
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/Kconfig | 10 ++
+ drivers/md/bcache/Makefile | 1 +
+ drivers/md/bcache/nvmpg.c | 340 +++++++++++++++++++++++++++++++++++++
+ drivers/md/bcache/nvmpg.h | 97 +++++++++++
+ drivers/md/bcache/super.c | 3 +
+ 5 files changed, 451 insertions(+)
+ create mode 100644 drivers/md/bcache/nvmpg.c
+ create mode 100644 drivers/md/bcache/nvmpg.h
+
+diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
+index cf3e8096942a..4a7c13e882bb 100644
+--- a/drivers/md/bcache/Kconfig
++++ b/drivers/md/bcache/Kconfig
+@@ -36,3 +36,13 @@ config BCACHE_ASYNC_REGISTRATION
+ device path into this file will returns immediately and the real
+ registration work is handled in kernel work queue in asynchronous
+ way.
++
++config BCACHE_NVM_PAGES
++ bool "NVDIMM support for bcache (EXPERIMENTAL)"
++ depends on BCACHE
++ depends on 64BIT
++ depends on LIBNVDIMM
++ depends on DAX
++ help
++ Allocate/release NV-memory pages for bcache and provide allocated pages
++ for each requestor after system reboot.
+diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
+index 5b87e59676b8..276b33be5ad5 100644
+--- a/drivers/md/bcache/Makefile
++++ b/drivers/md/bcache/Makefile
+@@ -5,3 +5,4 @@ obj-$(CONFIG_BCACHE) += bcache.o
+ bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
+ io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+ util.o writeback.o features.o
++bcache-$(CONFIG_BCACHE_NVM_PAGES) += nvmpg.o
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+new file mode 100644
+index 000000000000..1dd321e4c280
+--- /dev/null
++++ b/drivers/md/bcache/nvmpg.c
+@@ -0,0 +1,340 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Nvdimm page-buddy allocator
++ *
++ * Copyright (c) 2021, Intel Corporation.
++ * Copyright (c) 2021, Qiaowei Ren <qiaowei.ren@intel.com>.
++ * Copyright (c) 2021, Jianpeng Ma <jianpeng.ma@intel.com>.
++ */
++
++#include "bcache.h"
++#include "nvmpg.h"
++
++#include <linux/slab.h>
++#include <linux/list.h>
++#include <linux/mutex.h>
++#include <linux/dax.h>
++#include <linux/pfn_t.h>
++#include <linux/libnvdimm.h>
++#include <linux/mm_types.h>
++#include <linux/err.h>
++#include <linux/pagemap.h>
++#include <linux/bitmap.h>
++#include <linux/blkdev.h>
++
++struct bch_nvmpg_set *global_nvmpg_set;
++
++void *bch_nvmpg_offset_to_ptr(unsigned long offset)
++{
++	int ns_id;
++	struct bch_nvmpg_ns *ns;
++
++	if (offset == 0)
++		return NULL;
++
++	ns_id = BCH_NVMPG_GET_NS_ID(offset);
++	ns = global_nvmpg_set->ns_tbl[ns_id];
++
++ if (ns)
++ return (void *)(ns->base_addr + BCH_NVMPG_GET_OFFSET(offset));
++
++ pr_err("Invalid ns_id %u\n", ns_id);
++ return NULL;
++}
++
++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr)
++{
++ int ns_id = ns->ns_id;
++ unsigned long offset = (unsigned long)(ptr - ns->base_addr);
++
++ return BCH_NVMPG_OFFSET(ns_id, offset);
++}
++
++static void release_ns_tbl(struct bch_nvmpg_set *set)
++{
++ int i;
++ struct bch_nvmpg_ns *ns;
++
++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
++ ns = set->ns_tbl[i];
++ if (ns) {
++ fs_put_dax(ns->dax_dev);
++ blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
++ set->ns_tbl[i] = NULL;
++ set->attached_ns--;
++ kfree(ns);
++ }
++ }
++
++ if (set->attached_ns)
++ pr_err("unexpected attached_ns: %u\n", set->attached_ns);
++}
++
++static void release_nvmpg_set(struct bch_nvmpg_set *set)
++{
++ release_ns_tbl(set);
++ kfree(set);
++}
++
++/* Namespace 0 contains all meta data of the nvmpg allocation set */
++static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns)
++{
++ struct bch_nvmpg_set_header *set_header;
++
++ if (ns->ns_id != 0) {
++ pr_err("unexpected ns_id %u for first nvmpg namespace.\n",
++ ns->ns_id);
++ return -EINVAL;
++ }
++
++ set_header = bch_nvmpg_offset_to_ptr(ns->sb->set_header_offset);
++
++ mutex_lock(&global_nvmpg_set->lock);
++ global_nvmpg_set->set_header = set_header;
++ global_nvmpg_set->heads_size = set_header->size;
++ global_nvmpg_set->heads_used = set_header->used;
++ mutex_unlock(&global_nvmpg_set->lock);
++
++ return 0;
++}
++
++static int attach_nvmpg_set(struct bch_nvmpg_ns *ns)
++{
++ struct bch_nvmpg_sb *sb = ns->sb;
++ int rc = 0;
++
++ mutex_lock(&global_nvmpg_set->lock);
++
++ if (global_nvmpg_set->ns_tbl[sb->this_ns]) {
++ pr_err("ns_id %u already attached.\n", ns->ns_id);
++ rc = -EEXIST;
++ goto unlock;
++ }
++
++ if (ns->ns_id != 0) {
++ pr_err("unexpected ns_id %u for first namespace.\n", ns->ns_id);
++ rc = -EINVAL;
++ goto unlock;
++ }
++
++ if (global_nvmpg_set->attached_ns > 0) {
++ pr_err("multiple namespace attaching not supported yet\n");
++ rc = -EOPNOTSUPP;
++ goto unlock;
++ }
++
++ if ((global_nvmpg_set->attached_ns + 1) > sb->total_ns) {
++ pr_err("namespace counters error: attached %u > total %u\n",
++ global_nvmpg_set->attached_ns,
++ global_nvmpg_set->total_ns);
++ rc = -EINVAL;
++ goto unlock;
++ }
++
++ memcpy(global_nvmpg_set->set_uuid, sb->set_uuid, 16);
++ global_nvmpg_set->ns_tbl[sb->this_ns] = ns;
++ global_nvmpg_set->attached_ns++;
++ global_nvmpg_set->total_ns = sb->total_ns;
++
++unlock:
++ mutex_unlock(&global_nvmpg_set->lock);
++ return rc;
++}
++
++static int read_nvdimm_meta_super(struct block_device *bdev,
++ struct bch_nvmpg_ns *ns)
++{
++ struct page *page;
++ struct bch_nvmpg_sb *sb;
++ uint64_t expected_csum = 0;
++ int r;
++
++ page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
++ BCH_NVMPG_SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
++
++ if (IS_ERR(page))
++ return -EIO;
++
++ sb = (struct bch_nvmpg_sb *)
++ (page_address(page) + offset_in_page(BCH_NVMPG_SB_OFFSET));
++
++ r = -EINVAL;
++ expected_csum = csum_set(sb);
++ if (expected_csum != sb->csum) {
++ pr_info("csum is not match with expected one\n");
++ goto put_page;
++ }
++
++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) {
++ pr_info("invalid bch_nvmpg_magic\n");
++ goto put_page;
++ }
++
++ if (sb->sb_offset !=
++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_SB_OFFSET)) {
++ pr_info("invalid superblock offset 0x%llx\n", sb->sb_offset);
++ goto put_page;
++ }
++
++ r = -EOPNOTSUPP;
++ if (sb->total_ns != 1) {
++ pr_info("multiple name space not supported yet.\n");
++ goto put_page;
++ }
++
++
++ r = 0;
++ /* Necessary for DAX mapping */
++ ns->page_size = sb->page_size;
++ ns->pages_total = sb->pages_total;
++
++put_page:
++ put_page(page);
++ return r;
++}
++
++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
++{
++ struct bch_nvmpg_ns *ns = NULL;
++ struct bch_nvmpg_sb *sb = NULL;
++ char buf[BDEVNAME_SIZE];
++ struct block_device *bdev;
++ pgoff_t pgoff;
++ int id, err;
++ char *path;
++ long dax_ret = 0;
++
++ path = kstrndup(dev_path, 512, GFP_KERNEL);
++ if (!path) {
++ pr_err("kstrndup failed\n");
++ return ERR_PTR(-ENOMEM);
++ }
++
++ bdev = blkdev_get_by_path(strim(path),
++ FMODE_READ|FMODE_WRITE|FMODE_EXCL,
++ global_nvmpg_set);
++ if (IS_ERR(bdev)) {
++ pr_err("get %s error: %ld\n", dev_path, PTR_ERR(bdev));
++ kfree(path);
++ return ERR_PTR(PTR_ERR(bdev));
++ }
++
++ err = -ENOMEM;
++ ns = kzalloc(sizeof(struct bch_nvmpg_ns), GFP_KERNEL);
++ if (!ns)
++ goto bdput;
++
++ err = -EIO;
++ if (read_nvdimm_meta_super(bdev, ns)) {
++ pr_err("%s read nvdimm meta super block failed.\n",
++ bdevname(bdev, buf));
++ goto free_ns;
++ }
++
++ err = -EOPNOTSUPP;
++ ns->dax_dev = fs_dax_get_by_bdev(bdev);
++ if (!ns->dax_dev) {
++ pr_err("can't get dax device by %s\n", bdevname(bdev, buf));
++ goto free_ns;
++ }
++
++ if (!dax_supported(ns->dax_dev, bdev, ns->page_size, 0,
++ bdev_nr_sectors(bdev))) {
++ pr_err("%s don't support DAX\n", bdevname(bdev, buf));
++ goto free_ns;
++ }
++
++ err = -EINVAL;
++ if (bdev_dax_pgoff(bdev, 0, ns->page_size, &pgoff)) {
++ pr_err("invalid offset of %s\n", bdevname(bdev, buf));
++ goto free_ns;
++ }
++
++ err = -EINVAL;
++ id = dax_read_lock();
++ dax_ret = dax_direct_access(ns->dax_dev, pgoff, ns->pages_total,
++ &ns->base_addr, &ns->start_pfn);
++ if (dax_ret <= 0) {
++ pr_err("dax_direct_access error\n");
++ dax_read_unlock(id);
++ goto free_ns;
++ }
++
++ if (dax_ret < ns->pages_total) {
++ pr_warn("currently first %ld pages (from %lu in total) are used\n",
++ dax_ret, ns->pages_total);
++ }
++ dax_read_unlock(id);
++
++ sb = (struct bch_nvmpg_sb *)(ns->base_addr + BCH_NVMPG_SB_OFFSET);
++
++ err = -EINVAL;
++ /* Check magic again to make sure DAX mapping is correct */
++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) {
++ pr_err("invalid bch_nvmpg_magic after DAX mapping\n");
++ goto free_ns;
++ }
++
++ if ((global_nvmpg_set->attached_ns > 0) &&
++ memcmp(sb->set_uuid, global_nvmpg_set->set_uuid, 16)) {
++ pr_err("set uuid does not match with ns_id %u\n", ns->ns_id);
++ goto free_ns;
++ }
++
++ if (sb->set_header_offset !=
++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_RECLIST_HEAD_OFFSET)) {
++ pr_err("Invalid header offset: this_ns %u, ns_id %llu, offset 0x%llx\n",
++ sb->this_ns,
++ BCH_NVMPG_GET_NS_ID(sb->set_header_offset),
++ BCH_NVMPG_GET_OFFSET(sb->set_header_offset));
++ goto free_ns;
++ }
++
++ ns->page_size = sb->page_size;
++ ns->pages_offset = sb->pages_offset;
++ ns->pages_total = sb->pages_total;
++ ns->sb = sb;
++ ns->free = 0;
++ ns->bdev = bdev;
++ ns->set = global_nvmpg_set;
++
++ err = attach_nvmpg_set(ns);
++ if (err < 0)
++ goto free_ns;
++
++ mutex_init(&ns->lock);
++
++ err = init_nvmpg_set_header(ns);
++ if (err < 0)
++ goto free_ns;
++
++ kfree(path);
++ return ns;
++
++free_ns:
++ fs_put_dax(ns->dax_dev);
++ kfree(ns);
++bdput:
++ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
++ kfree(path);
++ return ERR_PTR(err);
++}
++
++int __init bch_nvmpg_init(void)
++{
++ global_nvmpg_set = kzalloc(sizeof(*global_nvmpg_set), GFP_KERNEL);
++ if (!global_nvmpg_set)
++ return -ENOMEM;
++
++ global_nvmpg_set->total_ns = 0;
++ mutex_init(&global_nvmpg_set->lock);
++
++ pr_info("bcache nvm init\n");
++ return 0;
++}
++
++void bch_nvmpg_exit(void)
++{
++ release_nvmpg_set(global_nvmpg_set);
++ pr_info("bcache nvm exit\n");
++}
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+new file mode 100644
+index 000000000000..698c890b2d15
+--- /dev/null
++++ b/drivers/md/bcache/nvmpg.h
+@@ -0,0 +1,97 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++
++#ifndef _BCACHE_NVM_PAGES_H
++#define _BCACHE_NVM_PAGES_H
++
++#include <linux/libnvdimm.h>
++
++#include "nvmpg_format.h"
++
++/*
++ * Bcache NVDIMM in memory data structures
++ */
++
++/*
++ * The following three structures in memory records which page(s) allocated
++ * to which owner. After reboot from power failure, they will be initialized
++ * based on nvm pages superblock in NVDIMM device.
++ */
++struct bch_nvmpg_ns {
++ struct bch_nvmpg_sb *sb;
++ void *base_addr;
++
++ unsigned char uuid[16];
++ int ns_id;
++ unsigned int page_size;
++ unsigned long free;
++ unsigned long pages_offset;
++ unsigned long pages_total;
++ pfn_t start_pfn;
++
++ struct dax_device *dax_dev;
++ struct block_device *bdev;
++ struct bch_nvmpg_set *set;
++
++ struct mutex lock;
++};
++
++/*
++ * A set of namespaces. Currently only one set can be supported.
++ */
++struct bch_nvmpg_set {
++ unsigned char set_uuid[16];
++
++ int heads_size;
++ int heads_used;
++ struct bch_nvmpg_set_header *set_header;
++
++ struct bch_nvmpg_ns *ns_tbl[BCH_NVMPG_NS_MAX];
++ int total_ns;
++ int attached_ns;
++
++ struct mutex lock;
++};
++
++#define BCH_NVMPG_NS_ID_BITS 3
++#define BCH_NVMPG_OFFSET_BITS 61
++#define BCH_NVMPG_NS_ID_MASK ((1UL<<BCH_NVMPG_NS_ID_BITS) - 1)
++#define BCH_NVMPG_OFFSET_MASK ((1UL<<BCH_NVMPG_OFFSET_BITS) - 1)
++
++#define BCH_NVMPG_GET_NS_ID(offset) \
++ (((offset) >> BCH_NVMPG_OFFSET_BITS) & BCH_NVMPG_NS_ID_MASK)
++
++#define BCH_NVMPG_GET_OFFSET(offset) ((offset) & BCH_NVMPG_OFFSET_MASK)
++
++#define BCH_NVMPG_OFFSET(ns_id, offset) \
++ ((((ns_id) & BCH_NVMPG_NS_ID_MASK) << BCH_NVMPG_OFFSET_BITS) | \
++ ((offset) & BCH_NVMPG_OFFSET_MASK))
++
++/* Indicate which field in bch_nvmpg_sb to be updated */
++#define BCH_NVMPG_TOTAL_NS 0 /* total_ns */
++
++void *bch_nvmpg_offset_to_ptr(unsigned long offset);
++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr);
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++
++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path);
++int bch_nvmpg_init(void);
++void bch_nvmpg_exit(void);
++
++#else
++
++static inline struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
++{
++ return NULL;
++}
++
++static inline int bch_nvmpg_init(void)
++{
++ return 0;
++}
++
++static inline void bch_nvmpg_exit(void) { }
++
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
++#endif /* _BCACHE_NVM_PAGES_H */
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 86b9e355c583..74d51a0b806f 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -14,6 +14,7 @@
+ #include "request.h"
+ #include "writeback.h"
+ #include "features.h"
++#include "nvmpg.h"
+
+ #include <linux/blkdev.h>
+ #include <linux/pagemap.h>
+@@ -2818,6 +2819,7 @@ static void bcache_exit(void)
+ {
+ bch_debug_exit();
+ bch_request_exit();
++ bch_nvmpg_exit();
+ if (bcache_kobj)
+ kobject_put(bcache_kobj);
+ if (bcache_wq)
+@@ -2916,6 +2918,7 @@ static int __init bcache_init(void)
+
+ bch_debug_init();
+ closure_debug_init();
++ bch_nvmpg_init();
+
+ bcache_is_reboot = false;
+
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0003-bcache-initialization-of-the-buddy.patch b/for-next/nvmpg-bcache-journaling-v13/old/0003-bcache-initialization-of-the-buddy.patch
new file mode 100644
index 0000000..784b84b
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/old/0003-bcache-initialization-of-the-buddy.patch
@@ -0,0 +1,359 @@
+From eabc025702499684f588f362099f47998d0fde63 Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Thu, 21 Oct 2021 19:45:57 +0800
+Subject: [PATCH 03/12] bcache: initialization of the buddy
+
+This nvm pages allocator will implement the simple buddy allocator to
+manage the nvm address space. This patch initializes this buddy
+allocator for a new namespace.
+
+The unit of alloc/free in the buddy allocator is a page. DAX devices
+have their own struct page (in DRAM or PMEM).
+
+ struct { /* ZONE_DEVICE pages */
+ /** @pgmap: Points to the hosting device page map. */
+ struct dev_pagemap *pgmap;
+ void *zone_device_data;
+ /*
+ * ZONE_DEVICE private pages are counted as being
+ * mapped so the next 3 words hold the mapping, index,
+ * and private fields from the source anonymous or
+ * page cache page while the page is migrated to device
+ * private memory.
+ * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
+ * use the mapping, index, and private fields when
+ * pmem backed DAX files are mapped.
+ */
+ };
+
+ZONE_DEVICE pages only use pgmap; the other four words [16/32 bytes]
+are unused. So the second/third words are used as a 'struct list_head'
+to link the page into the buddy free lists. The fourth word (normally
+struct page::index) stores pgoff, the page offset in the dax device,
+and the fifth word (normally struct page::private) stores the buddy
+order. page_type will be used to store buddy flags.
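+
+A minimal sketch of how a free range enters the buddy free lists
+(mirroring bch_nvmpg_init_free_space() in this patch, illustrative
+only),
+
+	page = virt_to_page(addr);
+	set_page_private(page, order);	/* buddy order */
+	page->index = pgoff;		/* page offset in the dax device */
+	__SetPageBuddy(page);
+	list_add((struct list_head *)&page->zone_device_data,
+		 &ns->free_area[order]);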
+
+Reported-by: kernel test robot <lkp@intel.com>
+Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/nvmpg.c | 212 +++++++++++++++++++++++++++++++++++++-
+ drivers/md/bcache/nvmpg.h | 12 +++
+ 2 files changed, 221 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index 1dd321e4c280..80e12e06f6d3 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -50,6 +50,36 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr)
+ return BCH_NVMPG_OFFSET(ns_id, offset);
+ }
+
++static struct page *bch_nvmpg_va_to_pg(void *addr)
++{
++ return virt_to_page(addr);
++}
++
++static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff)
++{
++ return ns->base_addr + (pgoff << PAGE_SHIFT);
++}
++
++static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r)
++{
++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id];
++ pgoff_t pgoff = r->pgoff;
++
++ return bch_nvmpg_pgoff_to_ptr(ns, pgoff);
++}
++
++static inline void reserve_nvmpg_pages(struct bch_nvmpg_ns *ns,
++ pgoff_t pgoff, u64 nr)
++{
++ while (nr > 0) {
++ unsigned int num = nr > UINT_MAX ? UINT_MAX : nr;
++
++ bitmap_set(ns->pages_bitmap, pgoff, num);
++ nr -= num;
++ pgoff += num;
++ }
++}
++
+ static void release_ns_tbl(struct bch_nvmpg_set *set)
+ {
+ int i;
+@@ -58,6 +88,10 @@ static void release_ns_tbl(struct bch_nvmpg_set *set)
+ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
+ ns = set->ns_tbl[i];
+ if (ns) {
++ kvfree(ns->pages_bitmap);
++ if (ns->recs_bitmap)
++ bitmap_free(ns->recs_bitmap);
++
+ fs_put_dax(ns->dax_dev);
+ blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ set->ns_tbl[i] = NULL;
+@@ -76,10 +110,73 @@ static void release_nvmpg_set(struct bch_nvmpg_set *set)
+ kfree(set);
+ }
+
++static int validate_recs(int ns_id,
++ struct bch_nvmpg_head *head,
++ struct bch_nvmpg_recs *recs)
++{
++ if (memcmp(recs->magic, bch_nvmpg_recs_magic, 16)) {
++ pr_err("Invalid bch_nvmpg_recs magic\n");
++ return -EINVAL;
++ }
++
++ if (memcmp(recs->uuid, head->uuid, 16)) {
++ pr_err("Invalid bch_nvmpg_recs uuid\n");
++ return -EINVAL;
++ }
++
++ if (recs->head_offset !=
++ bch_nvmpg_ptr_to_offset(global_nvmpg_set->ns_tbl[ns_id], head)) {
++ pr_err("Invalid recs head_offset\n");
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
++static int reserve_nvmpg_recs(struct bch_nvmpg_recs *recs)
++{
++ int i, used = 0;
++
++ for (i = 0; i < recs->size; i++) {
++ struct bch_nvmpg_rec *r = &recs->recs[i];
++ struct bch_nvmpg_ns *ns;
++ struct page *page;
++ void *addr;
++
++ if (r->pgoff == 0)
++ continue;
++
++ ns = global_nvmpg_set->ns_tbl[r->ns_id];
++ addr = bch_nvmpg_rec_to_ptr(r);
++ if (addr < ns->base_addr) {
++ pr_err("Invalid recorded address\n");
++ return -EINVAL;
++ }
++
++ /* init struct page: index/private */
++ page = bch_nvmpg_va_to_pg(addr);
++ set_page_private(page, r->order);
++ page->index = r->pgoff;
++
++ reserve_nvmpg_pages(ns, r->pgoff, 1L << r->order);
++ used++;
++ }
++
++ if (used != recs->used) {
++ pr_err("used %d doesn't match recs->used %d\n",
++ used, recs->used);
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
+ /* Namespace 0 contains all meta data of the nvmpg allocation set */
+ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns)
+ {
+ struct bch_nvmpg_set_header *set_header;
++ struct bch_nvmpg_recs *sys_recs;
++ int i, j, used = 0, rc = 0;
+
+ if (ns->ns_id != 0) {
+ pr_err("unexpected ns_id %u for first nvmpg namespace.\n",
+@@ -93,9 +190,83 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns)
+ global_nvmpg_set->set_header = set_header;
+ global_nvmpg_set->heads_size = set_header->size;
+ global_nvmpg_set->heads_used = set_header->used;
++
++ /* Reserve the used space from buddy allocator */
++ reserve_nvmpg_pages(ns, 0, div_u64(ns->pages_offset, ns->page_size));
++
++ sys_recs = ns->base_addr + BCH_NVMPG_SYSRECS_OFFSET;
++ for (i = 0; i < set_header->size; i++) {
++ struct bch_nvmpg_head *head;
++
++ head = &set_header->heads[i];
++ if (head->state == BCH_NVMPG_HD_STAT_FREE)
++ continue;
++
++ used++;
++ if (used > global_nvmpg_set->heads_size) {
++ pr_err("used heads %d > heads size %d.\n",
++ used, global_nvmpg_set->heads_size);
++			rc = -EINVAL;
++			goto unlock;
++ }
++
++ for (j = 0; j < BCH_NVMPG_NS_MAX; j++) {
++ struct bch_nvmpg_recs *recs;
++
++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[j]);
++
++ /* Iterate the recs list */
++ while (recs) {
++ rc = validate_recs(j, head, recs);
++ if (rc < 0)
++ goto unlock;
++
++ rc = reserve_nvmpg_recs(recs);
++ if (rc < 0)
++ goto unlock;
++
++ bitmap_set(ns->recs_bitmap, recs - sys_recs, 1);
++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
++ }
++ }
++ }
++unlock:
+ mutex_unlock(&global_nvmpg_set->lock);
++ return rc;
++}
+
+- return 0;
++static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns)
++{
++ unsigned int start, end, pages;
++ int i;
++ struct page *page;
++ pgoff_t pgoff_start;
++
++ bitmap_for_each_clear_region(ns->pages_bitmap,
++ start, end, 0, ns->pages_total) {
++ pgoff_start = start;
++ pages = end - start;
++
++ while (pages) {
++ void *addr;
++
++ for (i = BCH_MAX_ORDER - 1; i >= 0; i--) {
++ if ((pgoff_start % (1L << i) == 0) &&
++ (pages >= (1L << i)))
++ break;
++ }
++
++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start);
++ page = bch_nvmpg_va_to_pg(addr);
++ set_page_private(page, i);
++ page->index = pgoff_start;
++ __SetPageBuddy(page);
++ list_add((struct list_head *)&page->zone_device_data,
++ &ns->free_area[i]);
++
++ pgoff_start += 1L << i;
++ pages -= 1L << i;
++ }
++ }
+ }
+
+ static int attach_nvmpg_set(struct bch_nvmpg_ns *ns)
+@@ -200,7 +371,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
+ char buf[BDEVNAME_SIZE];
+ struct block_device *bdev;
+ pgoff_t pgoff;
+- int id, err;
++ int id, i, err;
+ char *path;
+ long dax_ret = 0;
+
+@@ -304,13 +475,48 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
+
+ mutex_init(&ns->lock);
+
++ /*
++ * parameters of bitmap_set/clear are unsigned int.
++	 * Given that the current nvm size is far from exceeding this limit,
++	 * only add a WARN_ON message here.
++ */
++ WARN_ON(BITS_TO_LONGS(ns->pages_total) > UINT_MAX);
++ ns->pages_bitmap = kvcalloc(BITS_TO_LONGS(ns->pages_total),
++ sizeof(unsigned long), GFP_KERNEL);
++ if (!ns->pages_bitmap) {
++ err = -ENOMEM;
++ goto clear_ns_nr;
++ }
++
++ if (ns->sb->this_ns == 0) {
++ ns->recs_bitmap =
++ bitmap_zalloc(BCH_MAX_PGALLOC_RECS, GFP_KERNEL);
++ if (ns->recs_bitmap == NULL) {
++ err = -ENOMEM;
++ goto free_pages_bitmap;
++ }
++ }
++
++ for (i = 0; i < BCH_MAX_ORDER; i++)
++ INIT_LIST_HEAD(&ns->free_area[i]);
++
+ err = init_nvmpg_set_header(ns);
+ if (err < 0)
+- goto free_ns;
++ goto free_recs_bitmap;
++
++ if (ns->sb->this_ns == 0)
++ /* init buddy allocator */
++ bch_nvmpg_init_free_space(ns);
+
+ kfree(path);
+ return ns;
+
++free_recs_bitmap:
++ bitmap_free(ns->recs_bitmap);
++free_pages_bitmap:
++ kvfree(ns->pages_bitmap);
++clear_ns_nr:
++ global_nvmpg_set->ns_tbl[sb->this_ns] = NULL;
+ free_ns:
+ fs_put_dax(ns->dax_dev);
+ kfree(ns);
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index 698c890b2d15..55778d4db7da 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -11,6 +11,8 @@
+ * Bcache NVDIMM in memory data structures
+ */
+
++#define BCH_MAX_ORDER 20
++
+ /*
+ * The following three structures in memory records which page(s) allocated
+ * to which owner. After reboot from power failure, they will be initialized
+@@ -28,6 +30,11 @@ struct bch_nvmpg_ns {
+ unsigned long pages_total;
+ pfn_t start_pfn;
+
++ unsigned long *pages_bitmap;
++ struct list_head free_area[BCH_MAX_ORDER];
++
++ unsigned long *recs_bitmap;
++
+ struct dax_device *dax_dev;
+ struct block_device *bdev;
+ struct bch_nvmpg_set *set;
+@@ -69,6 +76,11 @@ struct bch_nvmpg_set {
+ /* Indicate which field in bch_nvmpg_sb to be updated */
+ #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */
+
++#define BCH_MAX_PGALLOC_RECS \
++ (min_t(unsigned int, 64, \
++ (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \
++ sizeof(struct bch_nvmpg_recs)))
++
+ void *bch_nvmpg_offset_to_ptr(unsigned long offset);
+ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr);
+
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch b/for-next/nvmpg-bcache-journaling-v13/old/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch
new file mode 100644
index 0000000..94dc417
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/old/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch
@@ -0,0 +1,308 @@
+From badd2b9151913efdc34e68b532ca0e6360d5ba1b Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Wed, 4 Aug 2021 22:41:20 +0800
+Subject: [PATCH 04/12] bcache: bch_nvmpg_alloc_pages() of the buddy
+
+This patch implements bch_nvmpg_alloc_pages() of the nvm pages buddy
+allocator. In terms of function, it is similar to the current page
+buddy allocator, but with these differences:
+a: it takes an owner_uuid parameter which records owner info, and it
+makes that info persistent.
+b: it doesn't need flags like GFP_*; all allocations are equal.
+c: it doesn't trigger other operations such as swap/recycle.
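+
+A rough usage sketch (illustrative only; matching the helper
+declarations this series adds to nvmpg.h),
+
+	unsigned long nvmpg_offset;
+	void *buf;
+
+	/* allocate 1 << order pages, recorded under owner_uuid */
+	nvmpg_offset = bch_nvmpg_alloc_pages(order, owner_uuid);
+	if (!nvmpg_offset)
+		return -ENOMEM;
+	buf = bch_nvmpg_offset_to_ptr(nvmpg_offset);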
+
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/nvmpg.c | 221 ++++++++++++++++++++++++++++++++++++++
+ drivers/md/bcache/nvmpg.h | 9 ++
+ 2 files changed, 230 insertions(+)
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index 80e12e06f6d3..ca8ffcec9b2c 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -42,6 +42,11 @@ void *bch_nvmpg_offset_to_ptr(unsigned long offset)
+ return NULL;
+ }
+
++static unsigned long bch_nvmpg_offset_to_pgoff(unsigned long nvmpg_offset)
++{
++ return BCH_NVMPG_GET_OFFSET(nvmpg_offset) >> PAGE_SHIFT;
++}
++
+ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr)
+ {
+ int ns_id = ns->ns_id;
+@@ -60,6 +65,15 @@ static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff)
+ return ns->base_addr + (pgoff << PAGE_SHIFT);
+ }
+
++static unsigned long bch_nvmpg_pgoff_to_offset(struct bch_nvmpg_ns *ns,
++ pgoff_t pgoff)
++{
++ int ns_id = ns->ns_id;
++ unsigned long offset = pgoff << PAGE_SHIFT;
++
++ return BCH_NVMPG_OFFSET(ns_id, offset);
++}
++
+ static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r)
+ {
+ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id];
+@@ -269,6 +283,213 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns)
+ }
+ }
+
++
++/* If not found, create a new head when create == true */
++static struct bch_nvmpg_head *find_nvmpg_head(const char *uuid, bool create)
++{
++ struct bch_nvmpg_set_header *set_header = global_nvmpg_set->set_header;
++ struct bch_nvmpg_head *head = NULL;
++ int i;
++
++ if (set_header == NULL)
++ goto out;
++
++ for (i = 0; i < set_header->size; i++) {
++ struct bch_nvmpg_head *h = &set_header->heads[i];
++
++ if (h->state != BCH_NVMPG_HD_STAT_ALLOC)
++ continue;
++
++ if (!memcmp(uuid, h->uuid, 16)) {
++ head = h;
++ break;
++ }
++ }
++
++ if (!head && create) {
++ u32 used = set_header->used;
++
++ if (set_header->size > used) {
++ head = &set_header->heads[used];
++ memset(head, 0, sizeof(struct bch_nvmpg_head));
++ head->state = BCH_NVMPG_HD_STAT_ALLOC;
++ memcpy(head->uuid, uuid, 16);
++ global_nvmpg_set->heads_used++;
++ set_header->used++;
++ } else
++ pr_info("No free bch_nvmpg_head\n");
++ }
++
++out:
++ return head;
++}
++
++static struct bch_nvmpg_recs *find_empty_nvmpg_recs(void)
++{
++ unsigned int start;
++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[0];
++ struct bch_nvmpg_recs *recs;
++
++ start = bitmap_find_next_zero_area(ns->recs_bitmap,
++ BCH_MAX_PGALLOC_RECS, 0, 1, 0);
++ if (start > BCH_MAX_PGALLOC_RECS) {
++ pr_info("No free struct bch_nvmpg_recs\n");
++ return NULL;
++ }
++
++ bitmap_set(ns->recs_bitmap, start, 1);
++ recs = (struct bch_nvmpg_recs *)
++ bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET)
++ + start;
++
++ memset(recs, 0, sizeof(struct bch_nvmpg_recs));
++ return recs;
++}
++
++
++static struct bch_nvmpg_recs *find_nvmpg_recs(struct bch_nvmpg_ns *ns,
++ struct bch_nvmpg_head *head,
++ bool create)
++{
++ int ns_id = ns->sb->this_ns;
++ struct bch_nvmpg_recs *prev_recs = NULL, *recs = NULL;
++
++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[ns_id]);
++
++ /* If create=false, just return the current recs (may be NULL) */
++ if (!create)
++ return recs;
++
++ /*
++ * If create=true, it means we need an empty struct bch_nvmpg_rec.
++ * So we should find a struct bch_nvmpg_recs with a free slot, or
++ * allocate a new struct bch_nvmpg_recs, and return it.
++ */
++ while (recs && (recs->used == recs->size)) {
++ prev_recs = recs;
++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
++ }
++
++ /* Found empty struct bch_nvmpg_recs */
++ if (recs)
++ return recs;
++
++ /* Need alloc new struct bch_nvmpg_recs */
++ recs = find_empty_nvmpg_recs();
++ if (recs) {
++ unsigned long offset;
++
++ recs->next_offset = 0;
++ recs->head_offset = bch_nvmpg_ptr_to_offset(ns, head);
++ memcpy(recs->magic, bch_nvmpg_recs_magic, 16);
++ memcpy(recs->uuid, head->uuid, 16);
++ recs->size = BCH_NVMPG_MAX_RECS;
++ recs->used = 0;
++
++ offset = bch_nvmpg_ptr_to_offset(ns, recs);
++ if (prev_recs)
++ prev_recs->next_offset = offset;
++ else
++ head->recs_offset[ns_id] = offset;
++ }
++
++ return recs;
++}
++
++static void add_nvmpg_rec(struct bch_nvmpg_ns *ns,
++ struct bch_nvmpg_recs *recs,
++ unsigned long nvmpg_offset,
++ int order)
++{
++ int i, ns_id;
++ unsigned long pgoff;
++
++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset);
++ ns_id = ns->sb->this_ns;
++
++ for (i = 0; i < recs->size; i++) {
++ if (recs->recs[i].pgoff == 0) {
++ recs->recs[i].pgoff = pgoff;
++ recs->recs[i].order = order;
++ recs->recs[i].ns_id = ns_id;
++ recs->used++;
++ break;
++ }
++ }
++ BUG_ON(i == recs->size);
++}
++
++
++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
++{
++ unsigned long nvmpg_offset = 0;
++ struct bch_nvmpg_head *head;
++ int n, o;
++
++ mutex_lock(&global_nvmpg_set->lock);
++ head = find_nvmpg_head(uuid, true);
++
++ if (!head) {
++ pr_err("Cannot find bch_nvmpg_recs by uuid.\n");
++ goto unlock;
++ }
++
++ for (n = 0; n < global_nvmpg_set->total_ns; n++) {
++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[n];
++
++ if (!ns || (ns->free < (1L << order)))
++ continue;
++
++ for (o = order; o < BCH_MAX_ORDER; o++) {
++ struct list_head *list;
++ struct page *page, *buddy_page;
++
++ if (list_empty(&ns->free_area[o]))
++ continue;
++
++ list = ns->free_area[o].next;
++ page = container_of((void *)list, struct page,
++ zone_device_data);
++
++ list_del(list);
++
++ while (o != order) {
++ void *addr;
++ pgoff_t pgoff;
++
++ pgoff = page->index + (1L << (o - 1));
++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff);
++ buddy_page = bch_nvmpg_va_to_pg(addr);
++ set_page_private(buddy_page, o - 1);
++ buddy_page->index = pgoff;
++ __SetPageBuddy(buddy_page);
++ list_add((struct list_head *)&buddy_page->zone_device_data,
++ &ns->free_area[o - 1]);
++ o--;
++ }
++
++ set_page_private(page, order);
++ __ClearPageBuddy(page);
++ ns->free -= 1L << order;
++ nvmpg_offset = bch_nvmpg_pgoff_to_offset(ns, page->index);
++ break;
++ }
++
++ if (o < BCH_MAX_ORDER) {
++ struct bch_nvmpg_recs *recs;
++
++ recs = find_nvmpg_recs(ns, head, true);
++ /* ToDo: handle recs == NULL */
++ add_nvmpg_rec(ns, recs, nvmpg_offset, order);
++ break;
++ }
++ }
++
++unlock:
++ mutex_unlock(&global_nvmpg_set->lock);
++ return nvmpg_offset;
++}
++
+ static int attach_nvmpg_set(struct bch_nvmpg_ns *ns)
+ {
+ struct bch_nvmpg_sb *sb = ns->sb;
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index 55778d4db7da..d03f3241b45a 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -76,6 +76,9 @@ struct bch_nvmpg_set {
+ /* Indicate which field in bch_nvmpg_sb to be updated */
+ #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */
+
++#define BCH_PGOFF_TO_KVADDR(pgoff) \
++ ((void *)((unsigned long)(pgoff) << PAGE_SHIFT))
++
+ #define BCH_MAX_PGALLOC_RECS \
+ (min_t(unsigned int, 64, \
+ (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \
+@@ -89,6 +92,7 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr);
+ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path);
+ int bch_nvmpg_init(void);
+ void bch_nvmpg_exit(void);
++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid);
+
+ #else
+
+@@ -104,6 +108,11 @@ static inline int bch_nvmpg_init(void)
+
+ static inline void bch_nvmpg_exit(void) { }
+
++static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
++{
++ return 0;
++}
++
+ #endif /* CONFIG_BCACHE_NVM_PAGES */
+
+ #endif /* _BCACHE_NVM_PAGES_H */
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch b/for-next/nvmpg-bcache-journaling-v13/old/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch
new file mode 100644
index 0000000..4ac1234
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/old/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch
@@ -0,0 +1,251 @@
+From 7eac3b1797acdd2ff3c684c9fabd7fe12bd671c6 Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Thu, 21 Oct 2021 19:06:35 +0800
+Subject: [PATCH 05/12] bcache: bch_nvmpg_free_pages() of the buddy allocator
+
+This patch implements the bch_nvmpg_free_pages() of the buddy allocator.
+
+The difference between this and the page buddy free path is that it
+needs owner_uuid to free the pages allocated to that owner, and the
+change must be persistent after the free.
+
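+As an illustration of the buddy merge performed by __free_space() in
+this patch: the buddy of a 2^order page range starting at pgoff is found
+by flipping bit 'order' of pgoff, and the merged parent range starts at
+pgoff with that bit cleared. A sketch of just that arithmetic (helper
+names are illustrative, not part of the patch):
+
+  static inline unsigned long buddy_pgoff(unsigned long pgoff, int order)
+  {
+          return pgoff ^ (1UL << order);   /* e.g. pgoff 8, order 3 -> buddy at 0 */
+  }
+
+  static inline unsigned long parent_pgoff(unsigned long pgoff, int order)
+  {
+          return pgoff & ~(1UL << order);  /* start of the merged 2^(order+1) range */
+  }
+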
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/nvmpg.c | 164 ++++++++++++++++++++++++++++++++++++--
+ drivers/md/bcache/nvmpg.h | 3 +
+ 2 files changed, 160 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index ca8ffcec9b2c..9864436a45cc 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -248,6 +248,57 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns)
+ return rc;
+ }
+
++static void __free_space(struct bch_nvmpg_ns *ns, unsigned long nvmpg_offset,
++ int order)
++{
++ unsigned long add_pages = (1L << order);
++ pgoff_t pgoff;
++ struct page *page;
++ void *va;
++
++ if (nvmpg_offset == 0) {
++ pr_err("free pages on offset 0\n");
++ return;
++ }
++
++ page = bch_nvmpg_va_to_pg(bch_nvmpg_offset_to_ptr(nvmpg_offset));
++ WARN_ON((!page) || (page->private != order));
++ pgoff = page->index;
++
++ while (order < BCH_MAX_ORDER - 1) {
++ struct page *buddy_page;
++
++ pgoff_t buddy_pgoff = pgoff ^ (1L << order);
++ pgoff_t parent_pgoff = pgoff & ~(1L << order);
++
++ if ((parent_pgoff + (1L << (order + 1)) > ns->pages_total))
++ break;
++
++ va = bch_nvmpg_pgoff_to_ptr(ns, buddy_pgoff);
++ buddy_page = bch_nvmpg_va_to_pg(va);
++ WARN_ON(!buddy_page);
++
++ if (PageBuddy(buddy_page) && (buddy_page->private == order)) {
++ list_del((struct list_head *)&buddy_page->zone_device_data);
++ __ClearPageBuddy(buddy_page);
++ pgoff = parent_pgoff;
++ order++;
++ continue;
++ }
++ break;
++ }
++
++ va = bch_nvmpg_pgoff_to_ptr(ns, pgoff);
++ page = bch_nvmpg_va_to_pg(va);
++ WARN_ON(!page);
++ list_add((struct list_head *)&page->zone_device_data,
++ &ns->free_area[order]);
++ page->index = pgoff;
++ set_page_private(page, order);
++ __SetPageBuddy(page);
++ ns->free += add_pages;
++}
++
+ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns)
+ {
+ unsigned int start, end, pages;
+@@ -261,21 +312,19 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns)
+ pages = end - start;
+
+ while (pages) {
+- void *addr;
+-
+ for (i = BCH_MAX_ORDER - 1; i >= 0; i--) {
+ if ((pgoff_start % (1L << i) == 0) &&
+ (pages >= (1L << i)))
+ break;
+ }
+
+- addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start);
+- page = bch_nvmpg_va_to_pg(addr);
++ page = bch_nvmpg_va_to_pg(
++ bch_nvmpg_pgoff_to_ptr(ns, pgoff_start));
+ set_page_private(page, i);
+ page->index = pgoff_start;
+- __SetPageBuddy(page);
+- list_add((struct list_head *)&page->zone_device_data,
+- &ns->free_area[i]);
++
++ /* In order to update ns->free */
++ __free_space(ns, pgoff_start, i);
+
+ pgoff_start += 1L << i;
+ pages -= 1L << i;
+@@ -490,6 +539,106 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
+ return nvmpg_offset;
+ }
+
++static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns)
++{
++ return ns->base_addr + (ns->pages_total << PAGE_SHIFT);
++}
++
++static inline bool in_nvmpg_ns_range(struct bch_nvmpg_ns *ns,
++ void *start_addr, void *end_addr)
++{
++ return (start_addr >= ns->base_addr) && (end_addr < nvm_end_addr(ns));
++}
++
++static int remove_nvmpg_rec(struct bch_nvmpg_recs *recs, int ns_id,
++ unsigned long nvmpg_offset, int order)
++{
++ struct bch_nvmpg_head *head;
++ struct bch_nvmpg_recs *prev_recs, *sys_recs;
++ struct bch_nvmpg_ns *ns;
++ unsigned long pgoff;
++ int i;
++
++ ns = global_nvmpg_set->ns_tbl[0];
++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset);
++
++ head = bch_nvmpg_offset_to_ptr(recs->head_offset);
++ prev_recs = recs;
++ sys_recs = bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET);
++ while (recs) {
++ for (i = 0; i < recs->size; i++) {
++ struct bch_nvmpg_rec *rec = &(recs->recs[i]);
++
++ if ((rec->pgoff == pgoff) && (rec->ns_id == ns_id)) {
++ WARN_ON(rec->order != order);
++ rec->_v = 0;
++ recs->used--;
++
++ if (recs->used == 0) {
++ int recs_pos = recs - sys_recs;
++
++ if (recs == prev_recs)
++ head->recs_offset[ns_id] =
++ recs->next_offset;
++ else
++ prev_recs->next_offset =
++ recs->next_offset;
++
++ recs->next_offset = 0;
++ recs->head_offset = 0;
++
++ bitmap_clear(ns->recs_bitmap, recs_pos, 1);
++ }
++ goto out;
++ }
++ }
++ prev_recs = recs;
++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
++ }
++out:
++ return (recs ? 0 : -ENOENT);
++}
++
++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order,
++ const char *uuid)
++{
++ struct bch_nvmpg_ns *ns;
++ struct bch_nvmpg_head *head;
++ struct bch_nvmpg_recs *recs;
++ int r;
++
++ mutex_lock(&global_nvmpg_set->lock);
++
++ ns = global_nvmpg_set->ns_tbl[BCH_NVMPG_GET_NS_ID(nvmpg_offset)];
++ if (!ns) {
++ pr_err("can't find namespace by given kaddr from namespace\n");
++ goto unlock;
++ }
++
++ head = find_nvmpg_head(uuid, false);
++ if (!head) {
++ pr_err("can't found bch_nvmpg_head by uuid\n");
++ goto unlock;
++ }
++
++ recs = find_nvmpg_recs(ns, head, false);
++ if (!recs) {
++ pr_err("can't find bch_nvmpg_recs by uuid\n");
++ goto unlock;
++ }
++
++ r = remove_nvmpg_rec(recs, ns->sb->this_ns, nvmpg_offset, order);
++ if (r < 0) {
++ pr_err("can't find bch_nvmpg_rec\n");
++ goto unlock;
++ }
++
++ __free_space(ns, nvmpg_offset, order);
++
++unlock:
++ mutex_unlock(&global_nvmpg_set->lock);
++}
++
+ static int attach_nvmpg_set(struct bch_nvmpg_ns *ns)
+ {
+ struct bch_nvmpg_sb *sb = ns->sb;
+@@ -686,6 +835,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
+ ns->pages_offset = sb->pages_offset;
+ ns->pages_total = sb->pages_total;
+ ns->sb = sb;
++ /* increased by __free_space() */
+ ns->free = 0;
+ ns->bdev = bdev;
+ ns->set = global_nvmpg_set;
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index d03f3241b45a..e089936e7f13 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -93,6 +93,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path);
+ int bch_nvmpg_init(void);
+ void bch_nvmpg_exit(void);
+ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid);
++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid);
+
+ #else
+
+@@ -113,6 +114,8 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
+ return 0;
+ }
+
++static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { }
++
+ #endif /* CONFIG_BCACHE_NVM_PAGES */
+
+ #endif /* _BCACHE_NVM_PAGES_H */
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch b/for-next/nvmpg-bcache-journaling-v13/old/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch
new file mode 100644
index 0000000..0a77f35
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/old/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch
@@ -0,0 +1,66 @@
+From 3440789a920beb6e63493eecde279b6902ac0a1a Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Thu, 21 Oct 2021 21:06:03 +0800
+Subject: [PATCH 06/12] bcache: get recs list head for allocated pages by
+ specific uuid
+
+This patch implements bch_get_nvmpg_head() of the buddy allocator
+to be used to get recs list head for allocated pages by specific
+uuid. Then the requester (owner) can find all previously allocated
+nvdimm pages by iterating the recs list.
+
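+A sketch of how an owner could walk its record list from the returned
+head (the journal code in a later patch of this series does the same
+walk; the helper name here is illustrative):
+
+  static void walk_owner_recs(struct bch_nvmpg_head *head, int ns_id)
+  {
+          struct bch_nvmpg_recs *recs;
+          int i;
+
+          recs = bch_nvmpg_offset_to_ptr(head->recs_offset[ns_id]);
+          while (recs) {
+                  for (i = 0; i < recs->size; i++) {
+                          struct bch_nvmpg_rec *rec = &recs->recs[i];
+
+                          if (rec->pgoff == 0)
+                                  continue;  /* unused slot */
+                          /* rec->pgoff, rec->order, rec->ns_id describe one range */
+                  }
+                  recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
+          }
+  }
+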
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/nvmpg.c | 5 +++++
+ drivers/md/bcache/nvmpg.h | 6 ++++++
+ 2 files changed, 11 insertions(+)
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index 9864436a45cc..3c50cb09bb7a 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -539,6 +539,11 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
+ return nvmpg_offset;
+ }
+
++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid)
++{
++ return find_nvmpg_head(uuid, false);
++}
++
+ static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns)
+ {
+ return ns->base_addr + (ns->pages_total << PAGE_SHIFT);
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index e089936e7f13..2361cabf18be 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -94,6 +94,7 @@ int bch_nvmpg_init(void);
+ void bch_nvmpg_exit(void);
+ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid);
+ void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid);
++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid);
+
+ #else
+
+@@ -116,6 +117,11 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
+
+ static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { }
+
++static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid)
++{
++ return NULL;
++}
++
+ #endif /* CONFIG_BCACHE_NVM_PAGES */
+
+ #endif /* _BCACHE_NVM_PAGES_H */
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch b/for-next/nvmpg-bcache-journaling-v13/old/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch
new file mode 100644
index 0000000..f2880af
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/old/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch
@@ -0,0 +1,48 @@
+From 80d34e8aba0591ad58f1c3336333b48c715e3a69 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Fri, 25 Jun 2021 00:17:02 +0800
+Subject: [PATCH 07/12] bcache: use bucket index to set GC_MARK_METADATA for
+ journal buckets in bch_btree_gc_finish()
+
+Currently the meta data bucket locations on the cache device are still
+reserved even after the meta data is stored on NVDIMM pages, to keep the
+meta data layout consistent for now. So these buckets are still marked
+as meta data by SET_GC_MARK() in bch_btree_gc_finish().
+
+When BCH_FEATURE_INCOMPAT_NVDIMM_META is set, sb.d[] stores linear
+addresses of NVDIMM pages and not bucket indexes anymore. Therefore we
+should avoid looking up bucket indexes from sb.d[], and directly use the
+bucket indexes from ca->sb.first_bucket to (ca->sb.first_bucket +
+ca->sb.njournal_buckets) for setting the gc mark of the journal buckets.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/btree.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
+index 88c573eeb598..1a0ff117373f 100644
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -1761,8 +1761,10 @@ static void bch_btree_gc_finish(struct cache_set *c)
+ ca = c->cache;
+ ca->invalidate_needs_gc = 0;
+
+- for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
+- SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
++ /* Range [first_bucket, first_bucket + keys) is for journal buckets */
++ for (i = ca->sb.first_bucket;
++ i < ca->sb.first_bucket + ca->sb.njournal_buckets; i++)
++ SET_GC_MARK(ca->buckets + i, GC_MARK_METADATA);
+
+ for (k = ca->prio_buckets;
+ k < ca->prio_buckets + prio_buckets(ca) * 2; k++)
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch b/for-next/nvmpg-bcache-journaling-v13/old/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch
new file mode 100644
index 0000000..30de10c
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/old/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch
@@ -0,0 +1,60 @@
+From c006ab9655e4834a858bb399e1bcd8a51668d79c Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Fri, 25 Jun 2021 00:18:31 +0800
+Subject: [PATCH 08/12] bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into
+ incompat feature set
+
+This patch adds BCH_FEATURE_INCOMPAT_NVDIMM_META (value 0x0004) into the
+incompat feature set. When this bit is set by bcache-tools, it indicates
+bcache meta data should be stored on specific NVDIMM meta device.
+
+The bcache meta data mainly includes the journal and btree nodes. When
+this bit is set in the incompat feature set, bcache will ask the
+nvm-pages allocator for NVDIMM space to store the meta data.
+
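+Later patches in this series gate every NVDIMM code path on this bit via
+the bch_has_feature_nvdimm_meta() helper; the typical pattern looks like
+the following sketch:
+
+  if (bch_has_feature_nvdimm_meta(&ca->sb)) {
+          /* sb.d[i] holds an nvmpg offset into NVDIMM pages */
+  } else {
+          /* sb.d[i] holds a journal bucket index on the cache device */
+  }
+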
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/features.h | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h
+index 09161b89c63e..fab92678be76 100644
+--- a/drivers/md/bcache/features.h
++++ b/drivers/md/bcache/features.h
+@@ -18,11 +18,19 @@
+ #define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET 0x0001
+ /* real bucket size is (1 << bucket_size) */
+ #define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE 0x0002
++/* store bcache meta data on nvdimm */
++#define BCH_FEATURE_INCOMPAT_NVDIMM_META 0x0004
+
+ #define BCH_FEATURE_COMPAT_SUPP 0
+ #define BCH_FEATURE_RO_COMPAT_SUPP 0
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++#define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \
++ BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE| \
++ BCH_FEATURE_INCOMPAT_NVDIMM_META)
++#else
+ #define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \
+ BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE)
++#endif
+
+ #define BCH_HAS_COMPAT_FEATURE(sb, mask) \
+ ((sb)->feature_compat & (mask))
+@@ -90,6 +98,7 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \
+
+ BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET);
+ BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE);
++BCH_FEATURE_INCOMPAT_FUNCS(nvdimm_meta, NVDIMM_META);
+
+ static inline bool bch_has_unknown_compat_features(struct cache_sb *sb)
+ {
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch b/for-next/nvmpg-bcache-journaling-v13/old/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch
new file mode 100644
index 0000000..a56c25c
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/old/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch
@@ -0,0 +1,255 @@
+From 09fdf9edf79edd718035e6d9afa75f80f1d3a330 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 21 Oct 2021 21:39:18 +0800
+Subject: [PATCH 09/12] bcache: initialize bcache journal for NVDIMM meta
+ device
+
+The nvm-pages allocator may store and index the NVDIMM pages allocated
+for bcache journal. This patch adds the initialization to store bcache
+journal space on NVDIMM pages if BCH_FEATURE_INCOMPAT_NVDIMM_META bit is
+set by bcache-tools.
+
+If BCH_FEATURE_INCOMPAT_NVDIMM_META is set, get_nvdimm_journal_space()
+will return the nvmpg_offset of NVDIMM pages for bcache journal,
+- If there is previously allocated space, find it from the nvm-pages
+ owner list and return it to bch_journal_init().
+- If there is no previously allocated space, request a new NVDIMM range
+ from the nvm-pages allocator, and return it to bch_journal_init().
+
+And in bch_journal_init(), the keys in sb.d[] store the corresponding
+nvmpg offsets from NVDIMM in sb.d[i].ptr[0], where 'i' is the bucket
+index used to iterate all journal buckets.
+
+Later, when the bcache journaling code stores a journaling jset, the
+target NVDIMM nvmpg offset stored (and updated) in sb.d[i].ptr[0] can be
+used to calculate the linear address for the memory copy from DRAM pages
+into NVDIMM pages.
+
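+For reference, the size of the NVDIMM range requested by
+get_journal_nvmpg_space() follows from the journal geometry; a hedged
+worked example of that calculation, assuming 4KB pages (PAGE_SECTORS ==
+8) and illustrative superblock values:
+
+  /*
+   * Example numbers (illustrative, not taken from this patch):
+   *   bucket_size      = 1024 sectors (512 KiB)
+   *   njournal_buckets = 512
+   *   pages = 1024 * 512 / 8 = 65536
+   *   order = ilog2(65536)   = 16  ->  2^16 pages = 256 MiB
+   */
+  order = ilog2((ca->sb.bucket_size * ca->sb.njournal_buckets) / PAGE_SECTORS);
+  nvmpg_offset = bch_nvmpg_alloc_pages(order, ca->sb.set_uuid);
+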
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/journal.c | 113 ++++++++++++++++++++++++++++++++++++
+ drivers/md/bcache/journal.h | 2 +-
+ drivers/md/bcache/nvmpg.c | 9 +++
+ drivers/md/bcache/nvmpg.h | 1 +
+ drivers/md/bcache/super.c | 18 +++---
+ 5 files changed, 132 insertions(+), 11 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index 61bd79babf7a..d887557c718e 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -9,6 +9,8 @@
+ #include "btree.h"
+ #include "debug.h"
+ #include "extents.h"
++#include "nvmpg.h"
++#include "features.h"
+
+ #include <trace/events/bcache.h>
+
+@@ -982,3 +984,114 @@ int bch_journal_alloc(struct cache_set *c)
+
+ return 0;
+ }
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++
++static unsigned long find_journal_nvmpg_base(struct bch_nvmpg_head *nvmpg_head,
++ struct cache *ca)
++{
++ unsigned long jnl_offset, jnl_pgoff, jnl_ns_id;
++ unsigned long ret_offset = 0;
++ int i;
++
++ jnl_offset = (unsigned long)ca->sb.d[0];
++ jnl_ns_id = BCH_NVMPG_GET_NS_ID(jnl_offset);
++ jnl_pgoff = BCH_NVMPG_GET_OFFSET(jnl_offset) >> PAGE_SHIFT;
++
++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
++ struct bch_nvmpg_recs *recs;
++ struct bch_nvmpg_rec *rec;
++ unsigned long recs_offset = 0;
++ int j;
++
++ recs_offset = nvmpg_head->recs_offset[i];
++ recs = bch_nvmpg_offset_to_ptr(recs_offset);
++ while (recs) {
++ for (j = 0; j < recs->size; j++) {
++ rec = &recs->recs[j];
++ if ((rec->pgoff != jnl_pgoff) ||
++ (rec->ns_id != jnl_ns_id))
++ continue;
++
++ ret_offset = jnl_offset;
++ goto out;
++ }
++ recs_offset = recs->next_offset;
++ recs = bch_nvmpg_offset_to_ptr(recs_offset);
++ }
++ }
++
++out:
++ return ret_offset;
++}
++
++static unsigned long get_journal_nvmpg_space(struct cache *ca)
++{
++ struct bch_nvmpg_head *head = NULL;
++ unsigned long nvmpg_offset;
++ int order;
++
++ head = bch_get_nvmpg_head(ca->sb.set_uuid);
++ if (head) {
++ nvmpg_offset = find_journal_nvmpg_base(head, ca);
++ if (nvmpg_offset)
++ goto found;
++ }
++
++ order = ilog2((ca->sb.bucket_size *
++ ca->sb.njournal_buckets) / PAGE_SECTORS);
++ nvmpg_offset = bch_nvmpg_alloc_pages(order, ca->sb.set_uuid);
++ if (nvmpg_offset)
++ memset(bch_nvmpg_offset_to_ptr(nvmpg_offset),
++ 0, (1 << order) * PAGE_SIZE);
++found:
++ return nvmpg_offset;
++}
++
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
++static int __bch_journal_nvdimm_init(struct cache *ca)
++{
++ int ret = -1;
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ int i;
++ unsigned long jnl_base = 0;
++
++ jnl_base = get_journal_nvmpg_space(ca);
++ if (!jnl_base) {
++ pr_err("Failed to get journal space from nvdimm\n");
++ goto out;
++ }
++
++ /* Initialized and reloaded from on-disk super block already */
++ if (ca->sb.d[0] != 0)
++ goto out;
++
++ for (i = 0; i < ca->sb.keys; i++)
++ ca->sb.d[i] = jnl_base + (bucket_bytes(ca) * i);
++
++ ret = 0;
++out:
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
++ return ret;
++}
++
++
++int bch_journal_init(struct cache_set *c)
++{
++ int i, ret = 0;
++ struct cache *ca = c->cache;
++
++ ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
++ 2, SB_JOURNAL_BUCKETS);
++
++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) {
++ for (i = 0; i < ca->sb.keys; i++)
++ ca->sb.d[i] = ca->sb.first_bucket + i;
++ } else
++ ret = __bch_journal_nvdimm_init(ca);
++
++ return ret;
++}
+diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
+index f2ea34d5f431..e3a7fa5a8fda 100644
+--- a/drivers/md/bcache/journal.h
++++ b/drivers/md/bcache/journal.h
+@@ -179,7 +179,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list);
+ void bch_journal_meta(struct cache_set *c, struct closure *cl);
+ int bch_journal_read(struct cache_set *c, struct list_head *list);
+ int bch_journal_replay(struct cache_set *c, struct list_head *list);
+-
++int bch_journal_init(struct cache_set *c);
+ void bch_journal_free(struct cache_set *c);
+ int bch_journal_alloc(struct cache_set *c);
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index 3c50cb09bb7a..2d0808a83f86 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -24,6 +24,15 @@
+
+ struct bch_nvmpg_set *global_nvmpg_set;
+
++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id)
++{
++ if ((ns_id >= 0) && (ns_id < BCH_NVMPG_NS_MAX))
++ return global_nvmpg_set->ns_tbl[ns_id];
++
++ pr_emerg("Invalid ns_id: %d\n", ns_id);
++ return NULL;
++}
++
+ void *bch_nvmpg_offset_to_ptr(unsigned long offset)
+ {
+ int ns_id = BCH_NVMPG_GET_NS_ID(offset);
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index 2361cabf18be..f7b7177cced3 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -95,6 +95,7 @@ void bch_nvmpg_exit(void);
+ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid);
+ void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid);
+ struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid);
++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id);
+
+ #else
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 74d51a0b806f..a27fa65d8832 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -147,9 +147,11 @@ static const char *read_super_common(struct cache_sb *sb, struct block_device *
+ goto err;
+
+ err = "Journal buckets not sequential";
+- for (i = 0; i < sb->keys; i++)
+- if (sb->d[i] != sb->first_bucket + i)
+- goto err;
++ if (!bch_has_feature_nvdimm_meta(sb)) {
++ for (i = 0; i < sb->keys; i++)
++ if (sb->d[i] != sb->first_bucket + i)
++ goto err;
++ }
+
+ err = "Too many journal buckets";
+ if (sb->first_bucket + sb->keys > sb->nbuckets)
+@@ -2068,14 +2070,10 @@ static int run_cache_set(struct cache_set *c)
+ if (bch_journal_replay(c, &journal))
+ goto err;
+ } else {
+- unsigned int j;
+-
+ pr_notice("invalidating existing data\n");
+- ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
+- 2, SB_JOURNAL_BUCKETS);
+-
+- for (j = 0; j < ca->sb.keys; j++)
+- ca->sb.d[j] = ca->sb.first_bucket + j;
++ err = "error initializing journal";
++ if (bch_journal_init(c))
++ goto err;
+
+ bch_initial_gc_finish(c);
+
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch b/for-next/nvmpg-bcache-journaling-v13/old/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch
new file mode 100644
index 0000000..99e53f3
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/old/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch
@@ -0,0 +1,231 @@
+From ab08690b14942f881d545539e83762a6fa794131 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 24 Jul 2021 00:45:23 +0800
+Subject: [PATCH 10/12] bcache: support storing bcache journal into NVDIMM meta
+ device
+
+This patch implements two methods to store the bcache journal,
+1) __journal_write_unlocked() for block interface device
+ The legacy method to compose a bio and issue the jset bio to the cache
+ device (e.g. SSD). c->journal.key.ptr[0] indicates the LBA on the cache
+ device to store the journal jset.
+2) __journal_nvdimm_write_unlocked() for memory interface NVDIMM
+ Use memory interface to access NVDIMM pages and store the jset by
+ memcpy_flushcache(). c->journal.key.ptr[0] indicates the linear
+ address from the NVDIMM pages to store the journal jset.
+
+For legacy configuration without NVDIMM meta device, journal I/O is
+handled by __journal_write_unlocked() with existing code logic. If the
+NVDIMM meta device is used (by bcache-tools), the journal I/O will
+be handled by __journal_nvdimm_write_unlocked() and go into the NVDIMM
+pages.
+
+And when the NVDIMM meta device is used, sb.d[] stores the linear
+addresses of NVDIMM pages (no more bucket index). In journal_reclaim()
+the journaling location in c->journal.key.ptr[0] should also be updated
+with the linear address of NVDIMM pages (no more LBA composed from the
+sector offset and bucket index).
+
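+Condensed to its essentials, the NVDIMM write path replaces the bio
+submission with a cache-flushing memory copy; a sketch of what
+__journal_nvdimm_write_unlocked() below boils down to:
+
+  /* Sketch: the jset is copied into NVDIMM instead of submitted as a bio. */
+  sectors = set_blocks(w->data, block_bytes(ca)) * ca->sb.block_size;
+  memcpy_flushcache((void *)c->journal.key.ptr[0], w->data, sectors << 9);
+  c->journal.key.ptr[0] += sectors << 9;   /* advance within the NVDIMM range */
+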
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/journal.c | 120 +++++++++++++++++++++++++-----------
+ drivers/md/bcache/super.c | 3 +-
+ 2 files changed, 85 insertions(+), 38 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index d887557c718e..7d5c5ed18890 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -596,6 +596,8 @@ static void do_journal_discard(struct cache *ca)
+ return;
+ }
+
++ BUG_ON(bch_has_feature_nvdimm_meta(&ca->sb));
++
+ switch (atomic_read(&ja->discard_in_flight)) {
+ case DISCARD_IN_FLIGHT:
+ return;
+@@ -661,9 +663,16 @@ static void journal_reclaim(struct cache_set *c)
+ goto out;
+
+ ja->cur_idx = next;
+- k->ptr[0] = MAKE_PTR(0,
+- bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
+- ca->sb.nr_this_dev);
++ if (!bch_has_feature_nvdimm_meta(&ca->sb))
++ k->ptr[0] = MAKE_PTR(0,
++ bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
++ ca->sb.nr_this_dev);
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ else
++ k->ptr[0] = (unsigned long)bch_nvmpg_offset_to_ptr(
++ ca->sb.d[ja->cur_idx]);
++#endif
++
+ atomic_long_inc(&c->reclaimed_journal_buckets);
+
+ bkey_init(k);
+@@ -729,46 +738,21 @@ static void journal_write_unlock(struct closure *cl)
+ spin_unlock(&c->journal.lock);
+ }
+
+-static void journal_write_unlocked(struct closure *cl)
++
++static void __journal_write_unlocked(struct cache_set *c)
+ __releases(c->journal.lock)
+ {
+- struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+- struct cache *ca = c->cache;
+- struct journal_write *w = c->journal.cur;
+ struct bkey *k = &c->journal.key;
+- unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) *
+- ca->sb.block_size;
+-
++ struct journal_write *w = c->journal.cur;
++ struct closure *cl = &c->journal.io;
++ struct cache *ca = c->cache;
+ struct bio *bio;
+ struct bio_list list;
++ unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) *
++ ca->sb.block_size;
+
+ bio_list_init(&list);
+
+- if (!w->need_write) {
+- closure_return_with_destructor(cl, journal_write_unlock);
+- return;
+- } else if (journal_full(&c->journal)) {
+- journal_reclaim(c);
+- spin_unlock(&c->journal.lock);
+-
+- btree_flush_write(c);
+- continue_at(cl, journal_write, bch_journal_wq);
+- return;
+- }
+-
+- c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca));
+-
+- w->data->btree_level = c->root->level;
+-
+- bkey_copy(&w->data->btree_root, &c->root->key);
+- bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
+-
+- w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
+- w->data->magic = jset_magic(&ca->sb);
+- w->data->version = BCACHE_JSET_VERSION;
+- w->data->last_seq = last_seq(&c->journal);
+- w->data->csum = csum_set(w->data);
+-
+ for (i = 0; i < KEY_PTRS(k); i++) {
+ ca = c->cache;
+ bio = &ca->journal.bio;
+@@ -793,7 +777,6 @@ static void journal_write_unlocked(struct closure *cl)
+
+ ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
+ }
+-
+ /* If KEY_PTRS(k) == 0, this jset gets lost in air */
+ BUG_ON(i == 0);
+
+@@ -805,6 +788,71 @@ static void journal_write_unlocked(struct closure *cl)
+
+ while ((bio = bio_list_pop(&list)))
+ closure_bio_submit(c, bio, cl);
++}
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++
++static void __journal_nvdimm_write_unlocked(struct cache_set *c)
++ __releases(c->journal.lock)
++{
++ struct journal_write *w = c->journal.cur;
++ struct cache *ca = c->cache;
++ unsigned int sectors;
++
++ sectors = set_blocks(w->data, block_bytes(ca)) * ca->sb.block_size;
++ atomic_long_add(sectors, &ca->meta_sectors_written);
++
++ memcpy_flushcache((void *)c->journal.key.ptr[0], w->data, sectors << 9);
++
++ c->journal.key.ptr[0] += sectors << 9;
++ ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
++
++ atomic_dec_bug(&fifo_back(&c->journal.pin));
++ bch_journal_next(&c->journal);
++ journal_reclaim(c);
++
++ spin_unlock(&c->journal.lock);
++}
++
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
++static void journal_write_unlocked(struct closure *cl)
++{
++ struct cache_set *c = container_of(cl, struct cache_set, journal.io);
++ struct cache *ca = c->cache;
++ struct journal_write *w = c->journal.cur;
++
++ if (!w->need_write) {
++ closure_return_with_destructor(cl, journal_write_unlock);
++ return;
++ } else if (journal_full(&c->journal)) {
++ journal_reclaim(c);
++ spin_unlock(&c->journal.lock);
++
++ btree_flush_write(c);
++ continue_at(cl, journal_write, bch_journal_wq);
++ return;
++ }
++
++ c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca));
++
++ w->data->btree_level = c->root->level;
++
++ bkey_copy(&w->data->btree_root, &c->root->key);
++ bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
++
++ w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
++ w->data->magic = jset_magic(&ca->sb);
++ w->data->version = BCACHE_JSET_VERSION;
++ w->data->last_seq = last_seq(&c->journal);
++ w->data->csum = csum_set(w->data);
++
++ if (!bch_has_feature_nvdimm_meta(&ca->sb))
++ __journal_write_unlocked(c);
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ else
++ __journal_nvdimm_write_unlocked(c);
++#endif
+
+ continue_at(cl, journal_write_done, NULL);
+ }
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index a27fa65d8832..45b69ddc9cfa 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -1679,7 +1679,7 @@ void bch_cache_set_release(struct kobject *kobj)
+ static void cache_set_free(struct closure *cl)
+ {
+ struct cache_set *c = container_of(cl, struct cache_set, cl);
+- struct cache *ca;
++ struct cache *ca = c->cache;
+
+ debugfs_remove(c->debug);
+
+@@ -1691,7 +1691,6 @@ static void cache_set_free(struct closure *cl)
+ bch_bset_sort_state_free(&c->sort);
+ free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb)));
+
+- ca = c->cache;
+ if (ca) {
+ ca->set = NULL;
+ c->cache = NULL;
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch b/for-next/nvmpg-bcache-journaling-v13/old/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch
new file mode 100644
index 0000000..77a4ae4
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/old/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch
@@ -0,0 +1,181 @@
+From 5b9accf31b16f6cc138754d8e77982092094a4ee Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 24 Jul 2021 00:54:12 +0800
+Subject: [PATCH 11/12] bcache: read jset from NVDIMM pages for journal replay
+
+This patch implements two methods to read jset from media for journal
+replay,
+- __jnl_rd_bkt() for block device
+ This is the legacy method to read jset via block device interface.
+- __jnl_rd_nvm_bkt() for NVDIMM
+ This is the method to read the jset via the NVDIMM memory interface,
+ i.e. memcpy() from NVDIMM pages to DRAM pages.
+
+If BCH_FEATURE_INCOMPAT_NVDIMM_META is set in the incompat feature set,
+journal_read_bucket() of a running cache set will read the journal
+content from NVDIMM by __jnl_rd_nvm_bkt(). The linear addresses of
+NVDIMM pages to read jset are stored in sb.d[SB_JOURNAL_BUCKETS], which
+were initialized and maintained in previous runs of the cache set.
+
+One thing to notice is that when bch_journal_read() is called, the
+linear addresses of the NVDIMM pages are not loaded and initialized yet,
+so it is necessary to call __bch_journal_nvdimm_init() before reading
+the jset from NVDIMM pages.
+
+The code comment added in journal_read_bucket() addresses a report from
+the kernel test robot and Dan Carpenter; it explains why it is safe to
+only check the !bch_has_feature_nvdimm_meta() condition in the if()
+statement when CONFIG_BCACHE_NVM_PAGES is not configured, and avoids
+confusion from the bogus warning message of the static checking tool.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reported-by: kernel test robot <lkp@intel.com>
+Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/journal.c | 88 ++++++++++++++++++++++++++++++-------
+ 1 file changed, 71 insertions(+), 17 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index 7d5c5ed18890..902992be9191 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -34,18 +34,60 @@ static void journal_read_endio(struct bio *bio)
+ closure_put(cl);
+ }
+
++static struct jset *__jnl_rd_bkt(struct cache *ca, unsigned int bkt_idx,
++ unsigned int len, unsigned int offset,
++ struct closure *cl)
++{
++ sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bkt_idx]);
++ struct bio *bio = &ca->journal.bio;
++ struct jset *data = ca->set->journal.w[0].data;
++
++ bio_reset(bio);
++ bio->bi_iter.bi_sector = bucket + offset;
++ bio_set_dev(bio, ca->bdev);
++ bio->bi_iter.bi_size = len << 9;
++
++ bio->bi_end_io = journal_read_endio;
++ bio->bi_private = cl;
++ bio_set_op_attrs(bio, REQ_OP_READ, 0);
++ bch_bio_map(bio, data);
++
++ closure_bio_submit(ca->set, bio, cl);
++ closure_sync(cl);
++
++ /* Indeed journal.w[0].data */
++ return data;
++}
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++
++static struct jset *__jnl_rd_nvm_bkt(struct cache *ca, unsigned int bkt_idx,
++ unsigned int len, unsigned int offset)
++{
++ void *jset_addr;
++ struct jset *data;
++
++ jset_addr = bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9);
++ data = ca->set->journal.w[0].data;
++
++ memcpy(data, jset_addr, len << 9);
++
++ /* Indeed journal.w[0].data */
++ return data;
++}
++
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
+ static int journal_read_bucket(struct cache *ca, struct list_head *list,
+ unsigned int bucket_index)
+ {
+ struct journal_device *ja = &ca->journal;
+- struct bio *bio = &ja->bio;
+
+ struct journal_replay *i;
+- struct jset *j, *data = ca->set->journal.w[0].data;
++ struct jset *j;
+ struct closure cl;
+ unsigned int len, left, offset = 0;
+ int ret = 0;
+- sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
+
+ closure_init_stack(&cl);
+
+@@ -55,26 +97,27 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
+ reread: left = ca->sb.bucket_size - offset;
+ len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS);
+
+- bio_reset(bio);
+- bio->bi_iter.bi_sector = bucket + offset;
+- bio_set_dev(bio, ca->bdev);
+- bio->bi_iter.bi_size = len << 9;
+-
+- bio->bi_end_io = journal_read_endio;
+- bio->bi_private = &cl;
+- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+- bch_bio_map(bio, data);
+-
+- closure_bio_submit(ca->set, bio, &cl);
+- closure_sync(&cl);
++ if (!bch_has_feature_nvdimm_meta(&ca->sb))
++ j = __jnl_rd_bkt(ca, bucket_index, len, offset, &cl);
++ /*
++ * If CONFIG_BCACHE_NVM_PAGES is not defined, the feature bit
++ * BCH_FEATURE_INCOMPAT_NVDIMM_META won't be in the incompatible
++ * supported feature set, so a cache device formatted with feature
++ * bit BCH_FEATURE_INCOMPAT_NVDIMM_META will fail much earlier in
++ * read_super() by bch_has_unknown_incompat_features().
++ * Therefore when CONFIG_BCACHE_NVM_PAGES is not defined, it is
++ * safe to ignore the bch_has_feature_nvdimm_meta() condition.
++ */
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ else
++ j = __jnl_rd_nvm_bkt(ca, bucket_index, len, offset);
++#endif
+
+ /* This function could be simpler now since we no longer write
+ * journal entries that overlap bucket boundaries; this means
+ * the start of a bucket will always have a valid journal entry
+ * if it has any journal entries at all.
+ */
+-
+- j = data;
+ while (len) {
+ struct list_head *where;
+ size_t blocks, bytes = set_bytes(j);
+@@ -170,6 +213,8 @@ reread: left = ca->sb.bucket_size - offset;
+ return ret;
+ }
+
++static int __bch_journal_nvdimm_init(struct cache *ca);
++
+ int bch_journal_read(struct cache_set *c, struct list_head *list)
+ {
+ #define read_bucket(b) \
+@@ -188,6 +233,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
+ unsigned int i, l, r, m;
+ uint64_t seq;
+
++ /*
++ * Linear addresses of NVDIMM pages for journaling is not
++ * initialized yet, do it before read jset from NVDIMM pages.
++ */
++ if (bch_has_feature_nvdimm_meta(&ca->sb)) {
++ if (__bch_journal_nvdimm_init(ca) < 0)
++ return -ENXIO;
++ }
++
+ bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
+ pr_debug("%u journal buckets\n", ca->sb.njournal_buckets);
+
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch b/for-next/nvmpg-bcache-journaling-v13/old/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch
new file mode 100644
index 0000000..0ffc9a7
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/old/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch
@@ -0,0 +1,84 @@
+From 55b8876f5fc3a3f097bca7f2b518e0dccd112905 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 24 Jul 2021 00:55:25 +0800
+Subject: [PATCH 12/12] bcache: add sysfs interface register_nvdimm_meta to
+ register NVDIMM meta device
+
+This patch adds a sysfs interface register_nvdimm_meta to register an
+NVDIMM meta device. The sysfs interface file only shows up when
+CONFIG_BCACHE_NVM_PAGES=y. Then an NVDIMM name space formatted by
+bcache-tools can be registered into bcache by e.g.,
+ echo /dev/pmem0 > /sys/fs/bcache/register_nvdimm_meta
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/super.c | 29 +++++++++++++++++++++++++++++
+ 1 file changed, 29 insertions(+)
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 45b69ddc9cfa..2b9cde44879b 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -2405,10 +2405,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
+ struct kobj_attribute *attr,
+ const char *buffer, size_t size);
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++static ssize_t register_nvdimm_meta(struct kobject *k,
++ struct kobj_attribute *attr,
++ const char *buffer, size_t size);
++#endif
+
+ kobj_attribute_write(register, register_bcache);
+ kobj_attribute_write(register_quiet, register_bcache);
+ kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup);
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++kobj_attribute_write(register_nvdimm_meta, register_nvdimm_meta);
++#endif
+
+ static bool bch_is_open_backing(dev_t dev)
+ {
+@@ -2522,6 +2530,24 @@ static void register_device_async(struct async_reg_args *args)
+ queue_delayed_work(system_wq, &args->reg_work, 10);
+ }
+
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++static ssize_t register_nvdimm_meta(struct kobject *k, struct kobj_attribute *attr,
++ const char *buffer, size_t size)
++{
++ ssize_t ret = size;
++
++ struct bch_nvmpg_ns *ns = bch_register_namespace(buffer);
++
++ if (IS_ERR(ns)) {
++ pr_err("register nvdimm namespace %s for meta device failed.\n",
++ buffer);
++ ret = -EINVAL;
++ }
++
++ return ret;
++}
++#endif
++
+ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+ const char *buffer, size_t size)
+ {
+@@ -2864,6 +2890,9 @@ static int __init bcache_init(void)
+ static const struct attribute *files[] = {
+ &ksysfs_register.attr,
+ &ksysfs_register_quiet.attr,
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ &ksysfs_register_nvdimm_meta.attr,
++#endif
+ &ksysfs_pendings_cleanup.attr,
+ NULL
+ };
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0000-cover-letter.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0000-cover-letter.patch
new file mode 100644
index 0000000..fa696e6
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/v13-0000-cover-letter.patch
@@ -0,0 +1,125 @@
+From e1f37c78f682ca8d7d0dee51ee8a0ee884f92df5 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sun, 12 Dec 2021 23:13:09 +0800
+Subject: [PATCH v13 00/12] bcache for 5.17: enable NVDIMM for bcache journal
+
+Hi Jens,
+
+This is the v13 effort of enabling NVDIMM for the bcache journal. The
+code has been under testing for months and is quite stable now. Please
+consider taking it for the Linux v5.17 merge window.
+
+All current code logic and the on-media format are consistent with the
+previous v12 series. The major differences from the v12 series include,
+- more typos in code comments and commit logs are fixed.
+- a kernel message is added to indicate that only the first range is
+ used currently if the NVDIMM namespace has multiple mapping ranges.
+- the nvm-pages allocator APIs are not exported; it is unnecessary since
+ currently only bcache uses them.
+
+Now all the previous bcache related UAPI headers are moved into the
+bcache private code directory; no global headers are exported to either
+kernel or user source code.
+
+Bcache uses the nvm-pages allocator to allocate pages from an NVDIMM
+namespace for its journaling space. The nvm-pages allocator is a
+buddy-like allocator, which allocates power-of-2 sized page ranges from
+the NVDIMM namespace. The user space tool 'bcache' has a newly added
+'-M' option to format an NVDIMM namespace and register it via the sysfs
+interface as a bcache meta device. The nvm-pages allocator code does a
+DAX mapping to map the whole namespace into the system's memory address
+range, and allocates the pages to requesters like a typical buddy
+allocator does. The major difference is that the nvm-pages allocator
+maintains the pages allocated to each requester in an allocation list
+which is stored on NVDIMM too. The allocation list of each requester is
+tracked by a pre-defined UUID; all the pages tracked in all allocation
+lists are treated as allocated busy pages and won't be put back into the
+buddy system after the system reboots.
+
+The bcache journal code may request a block of power-of-2 sized pages
+from the nvm-pages allocator, normally a continuous range of 256MB or
+512MB. During meta data journaling, the in-memory jsets go into the
+calculated nvdimm pages location via the kernel memcpy routine. So the
+journaling I/Os won't go into the block device (e.g. SSD) anymore; the
+writes and reads of journal jsets happen on NVDIMM.
+
+Intel developers Jianpeng Ma and Qiaowei Ren composed the initial code
+of the nvm-pages allocator; the related patches are,
+- bcache: initialize the nvm-pages allocator
+- bcache: initialization of the buddy
+- bcache: bch_nvm_alloc_pages() of the buddy
+- bcache: bch_nvm_free_pages() of the buddy
+- bcache: get recs list head for allocated pages by specific uuid
+All the code depends on the Linux libnvdimm and dax drivers; the bcache
+nvm-pages allocator can be treated as a user of these two drivers.
+
+I modify the bcache code to recognize the nvm meta device feature,
+initialize journal on NVDIMM, and do journal I/Os on NVDIMM in the
+following patches,
+- bcache: add initial data structures for nvm pages
+- bcache: use bucket index to set GC_MARK_METADATA for journal buckets
+ in bch_btree_gc_finish()
+- bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into incompat feature set
+- bcache: initialize bcache journal for NVDIMM meta device
+- bcache: support storing bcache journal into NVDIMM meta device
+- bcache: read jset from NVDIMM pages for journal replay
+- bcache: add sysfs interface register_nvdimm_meta to register NVDIMM
+ meta device
+
+All the code is EXPERIMENTAL; it won't be enabled by default until we
+feel the NVDIMM support is complete and stable. The current code has
+been tested internally for months, and we have not observed any issue
+during all tests with or without the configuration enabled.
+
+Please consider picking this series for the Linux v5.17 merge window. If
+any issue is detected, we will respond in time and fix it ASAP.
+
+Thank you in advance.
+
+Coly Li
+
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Ying Huang <ying.huang@intel.com>
+---
+
+Coly Li (7):
+ bcache: add initial data structures for nvm pages
+ bcache: use bucket index to set GC_MARK_METADATA for journal buckets
+ in bch_btree_gc_finish()
+ bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into incompat feature set
+ bcache: initialize bcache journal for NVDIMM meta device
+ bcache: support storing bcache journal into NVDIMM meta device
+ bcache: read jset from NVDIMM pages for journal replay
+ bcache: add sysfs interface register_nvdimm_meta to register NVDIMM
+ meta device
+
+Jianpeng Ma (5):
+ bcache: initialize the nvm pages allocator
+ bcache: initialization of the buddy
+ bcache: bch_nvmpg_alloc_pages() of the buddy
+ bcache: bch_nvmpg_free_pages() of the buddy allocator
+ bcache: get recs list head for allocated pages by specific uuid
+
+ drivers/md/bcache/Kconfig | 10 +
+ drivers/md/bcache/Makefile | 1 +
+ drivers/md/bcache/btree.c | 6 +-
+ drivers/md/bcache/features.h | 9 +
+ drivers/md/bcache/journal.c | 321 +++++++++--
+ drivers/md/bcache/journal.h | 2 +-
+ drivers/md/bcache/nvmpg.c | 931 +++++++++++++++++++++++++++++++
+ drivers/md/bcache/nvmpg.h | 128 +++++
+ drivers/md/bcache/nvmpg_format.h | 253 +++++++++
+ drivers/md/bcache/super.c | 53 +-
+ 10 files changed, 1646 insertions(+), 68 deletions(-)
+ create mode 100644 drivers/md/bcache/nvmpg.c
+ create mode 100644 drivers/md/bcache/nvmpg.h
+ create mode 100644 drivers/md/bcache/nvmpg_format.h
+
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0001-bcache-add-initial-data-structures-for-nvm-pages.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0001-bcache-add-initial-data-structures-for-nvm-pages.patch
new file mode 100644
index 0000000..14b3695
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/v13-0001-bcache-add-initial-data-structures-for-nvm-pages.patch
@@ -0,0 +1,343 @@
+From 0ecd02239e1e7fc12115fda644810ee88bf26dff Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 26 Jul 2021 00:26:28 +0800
+Subject: [PATCH v13 01/12] bcache: add initial data structures for nvm pages
+
+This patch initializes the prototype data structures for nvm pages
+allocator,
+
+- struct bch_nvmpg_sb
+ This is the super block allocated on each nvdimm namespace for the nvm
+pages allocator. An nvdimm pages allocator set may have multiple name-
+spaces; bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this
+namespace belongs to.
+
+- struct bch_nvmpg_header
+ This is a table for all heads of all allocation record lists. An allo-
+cation record list traces all page(s) allocated from nvdimm namespace(s)
+to a specific requester (identified by uuid). After system reboot, a
+requester can retrieve all previously allocated nvdimm pages from its
+record list by a pre-defined uuid.
+
+- struct bch_nvmpg_head
+ This is a head of an allocation record list. Each nvdimm pages
+requester (typically it's a driver) has and only has one allocation
+record list, and an allocated nvdimm page only belongs to a specific
+allocation record list. Member uuid[] will be set as the requester's
+uuid, e.g. for bcache it is the cache set uuid. Member label is not
+mandatory, it is a human-readable string for debug purpose. The nvm
+offset format pointers recs_offset[] point to the location of actual
+allocator record lists on each namespace of the nvdimm pages allocator
+set. Each per namespace record list is represented by the following
+struct bch_nvmpg_recs.
+
+- struct bch_nvmpg_recs
+ This structure represents a requester's allocation record list. Member
+uuid holds the same value as the uuid of its corresponding struct
+bch_nvmpg_head. Member recs[] is a table of struct bch_pgalloc_rec
+objects to trace all allocated nvdimm pages. If the table recs[] is
+full, the nvmpg format offset is a pointer pointing to the next struct
+bch_nvmpg_recs object, where the nvm pages allocator will look for an
+available free allocation record. All the linked struct bch_nvmpg_recs
+objects compose a requester's allocation record list, which is headed by
+the above struct bch_nvmpg_head.
+
+- struct bch_nvmpg_rec
+ This structure records a range of allocated nvdimm pages. Member pgoff
+is the offset, in units of page size, of this allocation range. Member
+order indicates the size of the allocation range as (1 << order) in
+units of page size. Because the nvdimm pages allocator set may have
+multiple nvdimm namespaces, member ns_id is used to identify which
+namespace the pgoff belongs to.
+ - Bits 0 - 51: pgoff - page offset of the allocated pages.
+ - Bits 52 - 57: order - allocated size is page_size * (2 ^ order).
+ - Bits 58 - 60: ns_id - identifies which namespace the pages stay on.
+ - Bits 61 - 63: reserved.
+Since each allocated nvm page range is a power-of-2 number of pages,
+using 6 bits to represent the allocated size gives a maximum value of
+(1 << 63) * PAGE_SIZE. That is a 76-bit wide range size in bytes for a
+4KB page size, which is large enough currently.
+
+All the structure members having _offset suffix are in a special format.
+E.g. bch_nvmpg_sb.{sb_offset, pages_offset, set_header_offset},
+bch_nvmpg_head.recs_offset, bch_nvmpg_recs.{head_offset, next_offset},
+the offset value is 64-bit, the most significant 3 bits are used to
+identify which namespace this offset belongs to, and the remaining 61
+bits are the actual offset inside the namespace. Following patches add
+helper routines to convert between memory pointers and offsets.
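+
+As an illustration only (the real helpers land in a later patch as the
+BCH_NVMPG_OFFSET/BCH_NVMPG_GET_NS_ID/BCH_NVMPG_GET_OFFSET macros in
+nvmpg.h), the encode/decode arithmetic of this offset format is roughly,
+
+	/* sketch: pack/unpack a nvmpg offset format pointer */
+	#define SKETCH_OFF_BITS	61
+
+	static inline __u64 nvmpg_pack(__u64 ns_id, __u64 off)
+	{
+		return (ns_id << SKETCH_OFF_BITS) |
+		       (off & ((1ULL << SKETCH_OFF_BITS) - 1));
+	}
+
+	static inline __u64 nvmpg_ns_id(__u64 v)
+	{
+		return v >> SKETCH_OFF_BITS;	/* most significant 3 bits */
+	}
+
+	static inline __u64 nvmpg_off(__u64 v)
+	{
+		return v & ((1ULL << SKETCH_OFF_BITS) - 1);	/* low 61 bits */
+	}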
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Ying Huang <ying.huang@intel.com>
+---
+ drivers/md/bcache/nvmpg_format.h | 253 +++++++++++++++++++++++++++++++
+ 1 file changed, 253 insertions(+)
+ create mode 100644 drivers/md/bcache/nvmpg_format.h
+
+diff --git a/drivers/md/bcache/nvmpg_format.h b/drivers/md/bcache/nvmpg_format.h
+new file mode 100644
+index 000000000000..414bcafa31ee
+--- /dev/null
++++ b/drivers/md/bcache/nvmpg_format.h
+@@ -0,0 +1,253 @@
++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
++
++#ifndef _NVMPG_FORMAT_H
++#define _NVMPG_FORMAT_H
++
++/*
++ * Bcache on NVDIMM data structures
++ */
++
++/*
++ * - struct bch_nvmpg_sb
++ * This is the super block allocated on each nvdimm namespace for the nvm
++ * pages allocator. A nvdimm pages allocator set may have multiple namespaces,
++ * bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this name space
++ * belongs to.
++ *
++ * - struct bch_nvmpg_header
++ * This is a table for all heads of all allocation record lists. An allo-
++ * cation record list traces all page(s) allocated from nvdimm namespace(s) to
++ * a specific requester (identified by uuid). After system reboot, a requester
++ * can retrieve all previously allocated nvdimm pages from its record list by a
++ * pre-defined uuid.
++ *
++ * - struct bch_nvmpg_head
++ * This is a head of an allocation record list. Each nvdimm pages requester
++ * (typically it's a driver) has and only has one allocation record list, and
++ * an allocated nvdimm page only belongs to a specific allocation record list.
++ * Member uuid[] will be set as the requester's uuid, e.g. for bcache it is the
++ * cache set uuid. Member label is not mandatory, it is a human-readable string
++ * for debug purpose. The nvm offset format pointers recs_offset[] point to the
++ * location of actual allocator record lists on each name space of the nvdimm
++ * pages allocator set. Each per name space record list is represented by the
++ * following struct bch_nvmpg_recs.
++ *
++ * - struct bch_nvmpg_recs
++ * This structure represents a requester's allocation record list. Member uuid
++ * has the same value as the uuid of its corresponding struct bch_nvmpg_head.
++ * Member recs[] is a table of struct bch_nvmpg_rec objects to trace all
++ * allocated nvdimm pages. If the table recs[] is full, the nvmpg format offset
++ * next_offset points to the next struct bch_nvmpg_recs object, where the nvm
++ * pages allocator will look for a free allocation record. All the linked
++ * struct bch_nvmpg_recs objects compose a requester's allocation record list
++ * which is headed by the above struct bch_nvmpg_head.
++ *
++ * - struct bch_nvmpg_rec
++ * This structure records a range of allocated nvdimm pages. Member pgoff is
++ * the offset of this allocation range in units of page size. Member order
++ * indicates the size of the allocation range as (1 << order) pages. Because
++ * the nvdimm pages allocator set may have multiple nvdimm name spaces, member
++ * ns_id is used to identify which name space the pgoff belongs to.
++ *
++ * All allocation record lists are stored on the first initialized nvdimm name-
++ * space (ns_id 0). The meta data default layout of nvm pages allocator on
++ * namespace 0 is,
++ *
++ * 0 +---------------------------------+
++ * | |
++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET
++ * | bch_nvmpg_sb |
++ * 8KB +---------------------------------+ <-- BCH_NVMPG_RECLIST_HEAD_OFFSET
++ * | bch_nvmpg_header |
++ * | |
++ * 16KB +---------------------------------+ <-- BCH_NVMPG_SYSRECS_OFFSET
++ * | bch_nvmpg_recs |
++ * | (nvm pages internal usage) |
++ * 24KB +---------------------------------+
++ * | |
++ * | |
++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START
++ * | allocable nvm pages |
++ * | for buddy allocator |
++ * end +---------------------------------+
++ *
++ *
++ *
++ * Meta data default layout on the remaining nvdimm namespaces,
++ *
++ * 0 +---------------------------------+
++ * | |
++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET
++ * | bch_nvmpg_sb |
++ * 8KB +---------------------------------+
++ * | |
++ * | |
++ * | |
++ * | |
++ * | |
++ * | |
++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START
++ * | allocable nvm pages |
++ * | for buddy allocator |
++ * end +---------------------------------+
++ *
++ *
++ * - The nvmpg offset format pointer
++ * All member names ending with _offset in this header are nvmpg offset
++ * format pointers. The offset format is,
++ * [highest 3 bits: ns_id]
++ * [remaining 61 bits: offset in No. ns_id namespace]
++ *
++ * The above offset is in bytes. The procedure to dereference a nvmpg
++ * offset format pointer is,
++ * 1) Identify the namespace related in-memory structure by ns_id from the
++ * highest 3 bits of offset value.
++ * 2) Get the DAX mapping base address from the in-memory structure.
++ * 3) Calculate the actual memory address on nvdimm by adding the offset
++ *    value in the remaining low 61 bits to the DAX base address.
++ * All related in-memory structures and conversion routines don't belong to
++ * the user space API, they are defined by the nvm-pages allocator code in
++ * drivers/md/bcache/nvmpg.{c,h}
++ *
++ */
++
++#include <linux/types.h>
++
++/* In bytes */
++#define BCH_NVMPG_SB_OFFSET 4096
++#define BCH_NVMPG_START (16 << 20)
++
++#define BCH_NVMPG_LBL_SIZE 32
++#define BCH_NVMPG_NS_MAX 8
++
++#define BCH_NVMPG_RECLIST_HEAD_OFFSET (8<<10)
++#define BCH_NVMPG_SYSRECS_OFFSET (16<<10)
++
++#define BCH_NVMPG_SB_VERSION 0
++#define BCH_NVMPG_SB_VERSION_MAX 0
++
++static const __u8 bch_nvmpg_magic[] = {
++ 0x17, 0xbd, 0x53, 0x7f, 0x1b, 0x23, 0xd6, 0x83,
++ 0x46, 0xa4, 0xf8, 0x28, 0x17, 0xda, 0xec, 0xa9 };
++static const __u8 bch_nvmpg_recs_magic[] = {
++ 0x39, 0x25, 0x3f, 0xf7, 0x27, 0x17, 0xd0, 0xb9,
++ 0x10, 0xe6, 0xd2, 0xda, 0x38, 0x68, 0x26, 0xae };
++
++/* takes 64bit width */
++struct bch_nvmpg_rec {
++ union {
++ struct {
++ __u64 pgoff:52;
++ __u64 order:6;
++ __u64 ns_id:3;
++ __u64 reserved:3;
++ };
++ __u64 _v;
++ };
++};
++
++struct bch_nvmpg_recs {
++ union {
++ struct {
++ /*
++ * A nvmpg offset format pointer to
++ * struct bch_nvmpg_head
++ */
++ __u64 head_offset;
++ /*
++ * A nvmpg offset format pointer to
++			 * struct bch_nvmpg_recs which contains
++ * the next recs[] array.
++ */
++ __u64 next_offset;
++ __u8 magic[16];
++ __u8 uuid[16];
++ __u32 size;
++ __u32 used;
++ __u64 _pad[4];
++ struct bch_nvmpg_rec recs[];
++ };
++ __u8 pad[8192];
++ };
++};
++
++#define BCH_NVMPG_MAX_RECS \
++ ((sizeof(struct bch_nvmpg_recs) - \
++ offsetof(struct bch_nvmpg_recs, recs)) / \
++ sizeof(struct bch_nvmpg_rec))
++
++#define BCH_NVMPG_HD_STAT_FREE 0x0
++#define BCH_NVMPG_HD_STAT_ALLOC 0x1
++struct bch_nvmpg_head {
++ __u8 uuid[16];
++ __u8 label[BCH_NVMPG_LBL_SIZE];
++ __u32 state;
++ __u32 flags;
++ /*
++ * Array of offset values from the nvmpg offset format
++ * pointers, each of the pointer points to a per-namespace
++ * struct bch_nvmpg_recs.
++ */
++ __u64 recs_offset[BCH_NVMPG_NS_MAX];
++};
++
++/* heads[0] is always for nvm_pages internal usage */
++struct bch_nvmpg_set_header {
++ union {
++ struct {
++ __u32 size;
++ __u32 used;
++ __u64 _pad[4];
++ struct bch_nvmpg_head heads[];
++ };
++ __u8 pad[8192];
++ };
++};
++
++#define BCH_NVMPG_MAX_HEADS \
++ ((sizeof(struct bch_nvmpg_set_header) - \
++ offsetof(struct bch_nvmpg_set_header, heads)) / \
++ sizeof(struct bch_nvmpg_head))
++
++/* The on-media bit order is local CPU order */
++struct bch_nvmpg_sb {
++ __u64 csum;
++ __u64 sb_offset;
++ __u64 ns_start;
++ __u64 version;
++ __u8 magic[16];
++ __u8 uuid[16];
++ __u32 page_size;
++ __u32 total_ns;
++ __u32 this_ns;
++ union {
++ __u8 set_uuid[16];
++ __u64 set_magic;
++ };
++
++ __u64 flags;
++ __u64 seq;
++
++ __u64 feature_compat;
++ __u64 feature_incompat;
++ __u64 feature_ro_compat;
++
++ /* For allocable nvm pages from buddy systems */
++ __u64 pages_offset;
++ __u64 pages_total;
++
++ __u64 pad[8];
++
++ /*
++ * A nvmpg offset format pointer, it points
++ * to struct bch_nvmpg_set_header which is
++ * stored only on the first name space.
++ */
++ __u64 set_header_offset;
++
++ /* Just for csum_set() */
++ __u32 keys;
++ __u64 d[0];
++};
++
++#endif /* _NVMPG_FORMAT_H */
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0002-bcache-initialize-the-nvm-pages-allocator.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0002-bcache-initialize-the-nvm-pages-allocator.patch
new file mode 100644
index 0000000..54243a6
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/v13-0002-bcache-initialize-the-nvm-pages-allocator.patch
@@ -0,0 +1,542 @@
+From e75f8de4ca87db06507e173d795f42d1c98468d4 Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Mon, 26 Jul 2021 10:33:30 +0800
+Subject: [PATCH v13 02/12] bcache: initialize the nvm pages allocator
+
+This patch defines the prototype in-memory data structures and
+initializes the nvm pages allocator.
+
+The nvm address space which is managed by this allocator can consist of
+many nvm namespaces, and several namespaces can be composed into one nvm
+set, similar to a cache set. For this initial implementation, only one
+set is supported.
+
+The users of this nvm pages allocator need to call bch_register_namespace()
+to register the nvdimm device (like /dev/pmemX) into this allocator as
+an instance of struct bch_nvmpg_ns.
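+
+A minimal caller-side sketch (the device path here is only an example,
+and error handling is trimmed),
+
+	struct bch_nvmpg_ns *ns;
+
+	ns = bch_register_namespace("/dev/pmem0");
+	if (IS_ERR_OR_NULL(ns))
+		return ns ? PTR_ERR(ns) : -EOPNOTSUPP;
+
+	/* ns->base_addr is the DAX mapped base address of the namespace */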
+
+Reported-by: Randy Dunlap <rdunlap@infradead.org>
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/Kconfig | 10 ++
+ drivers/md/bcache/Makefile | 1 +
+ drivers/md/bcache/nvmpg.c | 340 +++++++++++++++++++++++++++++++++++++
+ drivers/md/bcache/nvmpg.h | 97 +++++++++++
+ drivers/md/bcache/super.c | 3 +
+ 5 files changed, 451 insertions(+)
+ create mode 100644 drivers/md/bcache/nvmpg.c
+ create mode 100644 drivers/md/bcache/nvmpg.h
+
+diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
+index cf3e8096942a..4a7c13e882bb 100644
+--- a/drivers/md/bcache/Kconfig
++++ b/drivers/md/bcache/Kconfig
+@@ -36,3 +36,13 @@ config BCACHE_ASYNC_REGISTRATION
+ device path into this file will returns immediately and the real
+ registration work is handled in kernel work queue in asynchronous
+ way.
++
++config BCACHE_NVM_PAGES
++ bool "NVDIMM support for bcache (EXPERIMENTAL)"
++ depends on BCACHE
++ depends on 64BIT
++ depends on LIBNVDIMM
++ depends on DAX
++ help
++ Allocate/release NV-memory pages for bcache and provide allocated pages
++ for each requestor after system reboot.
+diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
+index 5b87e59676b8..276b33be5ad5 100644
+--- a/drivers/md/bcache/Makefile
++++ b/drivers/md/bcache/Makefile
+@@ -5,3 +5,4 @@ obj-$(CONFIG_BCACHE) += bcache.o
+ bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
+ io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+ util.o writeback.o features.o
++bcache-$(CONFIG_BCACHE_NVM_PAGES) += nvmpg.o
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+new file mode 100644
+index 000000000000..b654bbbda03e
+--- /dev/null
++++ b/drivers/md/bcache/nvmpg.c
+@@ -0,0 +1,340 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Nvdimm page-buddy allocator
++ *
++ * Copyright (c) 2021, Intel Corporation.
++ * Copyright (c) 2021, Qiaowei Ren <qiaowei.ren@intel.com>.
++ * Copyright (c) 2021, Jianpeng Ma <jianpeng.ma@intel.com>.
++ */
++
++#include "bcache.h"
++#include "nvmpg.h"
++
++#include <linux/slab.h>
++#include <linux/list.h>
++#include <linux/mutex.h>
++#include <linux/dax.h>
++#include <linux/pfn_t.h>
++#include <linux/libnvdimm.h>
++#include <linux/mm_types.h>
++#include <linux/err.h>
++#include <linux/pagemap.h>
++#include <linux/bitmap.h>
++#include <linux/blkdev.h>
++
++struct bch_nvmpg_set *global_nvmpg_set;
++
++void *bch_nvmpg_offset_to_ptr(unsigned long offset)
++{
++	int ns_id;
++	struct bch_nvmpg_ns *ns;
++
++	if (offset == 0)
++		return NULL;
++
++	ns_id = BCH_NVMPG_GET_NS_ID(offset);
++	ns = global_nvmpg_set->ns_tbl[ns_id];
++
++ if (ns)
++ return (void *)(ns->base_addr + BCH_NVMPG_GET_OFFSET(offset));
++
++ pr_err("Invalid ns_id %u\n", ns_id);
++ return NULL;
++}
++
++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr)
++{
++ int ns_id = ns->ns_id;
++ unsigned long offset = (unsigned long)(ptr - ns->base_addr);
++
++ return BCH_NVMPG_OFFSET(ns_id, offset);
++}
++
++static void release_ns_tbl(struct bch_nvmpg_set *set)
++{
++ int i;
++ struct bch_nvmpg_ns *ns;
++
++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
++ ns = set->ns_tbl[i];
++ if (ns) {
++ fs_put_dax(ns->dax_dev);
++ blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
++ set->ns_tbl[i] = NULL;
++ set->attached_ns--;
++ kfree(ns);
++ }
++ }
++
++ if (set->attached_ns)
++ pr_err("unexpected attached_ns: %u\n", set->attached_ns);
++}
++
++static void release_nvmpg_set(struct bch_nvmpg_set *set)
++{
++ release_ns_tbl(set);
++ kfree(set);
++}
++
++/* Namespace 0 contains all meta data of the nvmpg allocation set */
++static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns)
++{
++ struct bch_nvmpg_set_header *set_header;
++
++ if (ns->ns_id != 0) {
++ pr_err("unexpected ns_id %u for first nvmpg namespace.\n",
++ ns->ns_id);
++ return -EINVAL;
++ }
++
++ set_header = bch_nvmpg_offset_to_ptr(ns->sb->set_header_offset);
++
++ mutex_lock(&global_nvmpg_set->lock);
++ global_nvmpg_set->set_header = set_header;
++ global_nvmpg_set->heads_size = set_header->size;
++ global_nvmpg_set->heads_used = set_header->used;
++ mutex_unlock(&global_nvmpg_set->lock);
++
++ return 0;
++}
++
++static int attach_nvmpg_set(struct bch_nvmpg_ns *ns)
++{
++ struct bch_nvmpg_sb *sb = ns->sb;
++ int rc = 0;
++
++ mutex_lock(&global_nvmpg_set->lock);
++
++ if (global_nvmpg_set->ns_tbl[sb->this_ns]) {
++ pr_err("ns_id %u already attached.\n", ns->ns_id);
++ rc = -EEXIST;
++ goto unlock;
++ }
++
++ if (ns->ns_id != 0) {
++ pr_err("unexpected ns_id %u for first namespace.\n", ns->ns_id);
++ rc = -EINVAL;
++ goto unlock;
++ }
++
++ if (global_nvmpg_set->attached_ns > 0) {
++ pr_err("multiple namespace attaching not supported yet\n");
++ rc = -EOPNOTSUPP;
++ goto unlock;
++ }
++
++ if ((global_nvmpg_set->attached_ns + 1) > sb->total_ns) {
++ pr_err("namespace counters error: attached %u > total %u\n",
++ global_nvmpg_set->attached_ns,
++ global_nvmpg_set->total_ns);
++ rc = -EINVAL;
++ goto unlock;
++ }
++
++ memcpy(global_nvmpg_set->set_uuid, sb->set_uuid, 16);
++ global_nvmpg_set->ns_tbl[sb->this_ns] = ns;
++ global_nvmpg_set->attached_ns++;
++ global_nvmpg_set->total_ns = sb->total_ns;
++
++unlock:
++ mutex_unlock(&global_nvmpg_set->lock);
++ return rc;
++}
++
++static int read_nvdimm_meta_super(struct block_device *bdev,
++ struct bch_nvmpg_ns *ns)
++{
++ struct page *page;
++ struct bch_nvmpg_sb *sb;
++ uint64_t expected_csum = 0;
++ int r;
++
++ page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
++ BCH_NVMPG_SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
++
++ if (IS_ERR(page))
++ return -EIO;
++
++ sb = (struct bch_nvmpg_sb *)
++ (page_address(page) + offset_in_page(BCH_NVMPG_SB_OFFSET));
++
++ r = -EINVAL;
++ expected_csum = csum_set(sb);
++ if (expected_csum != sb->csum) {
++		pr_info("csum does not match the expected value\n");
++ goto put_page;
++ }
++
++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) {
++ pr_info("invalid bch_nvmpg_magic\n");
++ goto put_page;
++ }
++
++ if (sb->sb_offset !=
++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_SB_OFFSET)) {
++ pr_info("invalid superblock offset 0x%llx\n", sb->sb_offset);
++ goto put_page;
++ }
++
++ r = -EOPNOTSUPP;
++ if (sb->total_ns != 1) {
++		pr_info("multiple namespaces not supported yet.\n");
++ goto put_page;
++ }
++
++
++ r = 0;
++ /* Necessary for DAX mapping */
++ ns->page_size = sb->page_size;
++ ns->pages_total = sb->pages_total;
++
++put_page:
++ put_page(page);
++ return r;
++}
++
++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
++{
++ struct bch_nvmpg_ns *ns = NULL;
++ struct bch_nvmpg_sb *sb = NULL;
++ char buf[BDEVNAME_SIZE];
++ struct block_device *bdev;
++ pgoff_t pgoff;
++ int id, err;
++ char *path;
++ long dax_ret = 0;
++
++ path = kstrndup(dev_path, 512, GFP_KERNEL);
++ if (!path) {
++ pr_err("kstrndup failed\n");
++ return ERR_PTR(-ENOMEM);
++ }
++
++ bdev = blkdev_get_by_path(strim(path),
++ FMODE_READ|FMODE_WRITE|FMODE_EXCL,
++ global_nvmpg_set);
++ if (IS_ERR(bdev)) {
++ pr_err("get %s error: %ld\n", dev_path, PTR_ERR(bdev));
++ kfree(path);
++ return ERR_PTR(PTR_ERR(bdev));
++ }
++
++ err = -ENOMEM;
++ ns = kzalloc(sizeof(struct bch_nvmpg_ns), GFP_KERNEL);
++ if (!ns)
++ goto bdput;
++
++ err = -EIO;
++ if (read_nvdimm_meta_super(bdev, ns)) {
++ pr_err("%s read nvdimm meta super block failed.\n",
++ bdevname(bdev, buf));
++ goto free_ns;
++ }
++
++ err = -EOPNOTSUPP;
++ ns->dax_dev = fs_dax_get_by_bdev(bdev);
++ if (!ns->dax_dev) {
++ pr_err("can't get dax device by %s\n", bdevname(bdev, buf));
++ goto free_ns;
++ }
++
++ if (!dax_supported(ns->dax_dev, bdev, ns->page_size, 0,
++ bdev_nr_sectors(bdev))) {
++ pr_err("%s don't support DAX\n", bdevname(bdev, buf));
++ goto free_ns;
++ }
++
++ err = -EINVAL;
++ if (bdev_dax_pgoff(bdev, 0, ns->page_size, &pgoff)) {
++ pr_err("invalid offset of %s\n", bdevname(bdev, buf));
++ goto free_ns;
++ }
++
++ err = -EINVAL;
++ id = dax_read_lock();
++ dax_ret = dax_direct_access(ns->dax_dev, pgoff, ns->pages_total,
++ &ns->base_addr, &ns->start_pfn);
++ if (dax_ret <= 0) {
++ pr_err("dax_direct_access error\n");
++ dax_read_unlock(id);
++ goto free_ns;
++ }
++
++ if (dax_ret < ns->pages_total) {
++ pr_warn("currently first %ld pages (from %lu in total) are used\n",
++ dax_ret, ns->pages_total);
++ }
++ dax_read_unlock(id);
++
++ sb = (struct bch_nvmpg_sb *)(ns->base_addr + BCH_NVMPG_SB_OFFSET);
++
++ err = -EINVAL;
++ /* Check magic again to make sure DAX mapping is correct */
++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) {
++ pr_err("invalid bch_nvmpg_magic after DAX mapping\n");
++ goto free_ns;
++ }
++
++ if ((global_nvmpg_set->attached_ns > 0) &&
++ memcmp(sb->set_uuid, global_nvmpg_set->set_uuid, 16)) {
++ pr_err("set uuid does not match with ns_id %u\n", ns->ns_id);
++ goto free_ns;
++ }
++
++ if (sb->set_header_offset !=
++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_RECLIST_HEAD_OFFSET)) {
++ pr_err("Invalid header offset: this_ns %u, ns_id %llu, offset 0x%llx\n",
++ sb->this_ns,
++ BCH_NVMPG_GET_NS_ID(sb->set_header_offset),
++ BCH_NVMPG_GET_OFFSET(sb->set_header_offset));
++ goto free_ns;
++ }
++
++ ns->page_size = sb->page_size;
++ ns->pages_offset = sb->pages_offset;
++ ns->pages_total = sb->pages_total;
++ ns->sb = sb;
++ ns->free = 0;
++ ns->bdev = bdev;
++ ns->set = global_nvmpg_set;
++
++ err = attach_nvmpg_set(ns);
++ if (err < 0)
++ goto free_ns;
++
++ mutex_init(&ns->lock);
++
++ err = init_nvmpg_set_header(ns);
++ if (err < 0)
++ goto free_ns;
++
++ kfree(path);
++ return ns;
++
++free_ns:
++ fs_put_dax(ns->dax_dev);
++ kfree(ns);
++bdput:
++ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
++ kfree(path);
++ return ERR_PTR(err);
++}
++
++int __init bch_nvmpg_init(void)
++{
++ global_nvmpg_set = kzalloc(sizeof(*global_nvmpg_set), GFP_KERNEL);
++ if (!global_nvmpg_set)
++ return -ENOMEM;
++
++ global_nvmpg_set->total_ns = 0;
++ mutex_init(&global_nvmpg_set->lock);
++
++ pr_info("bcache nvm init\n");
++ return 0;
++}
++
++void bch_nvmpg_exit(void)
++{
++ release_nvmpg_set(global_nvmpg_set);
++ pr_info("bcache nvm exit\n");
++}
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+new file mode 100644
+index 000000000000..698c890b2d15
+--- /dev/null
++++ b/drivers/md/bcache/nvmpg.h
+@@ -0,0 +1,97 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++
++#ifndef _BCACHE_NVM_PAGES_H
++#define _BCACHE_NVM_PAGES_H
++
++#include <linux/libnvdimm.h>
++
++#include "nvmpg_format.h"
++
++/*
++ * Bcache NVDIMM in memory data structures
++ */
++
++/*
++ * The following in-memory structures record which page(s) are allocated
++ * to which owner. After reboot from a power failure, they will be initialized
++ * based on the nvm pages superblock in the NVDIMM device.
++ */
++struct bch_nvmpg_ns {
++ struct bch_nvmpg_sb *sb;
++ void *base_addr;
++
++ unsigned char uuid[16];
++ int ns_id;
++ unsigned int page_size;
++ unsigned long free;
++ unsigned long pages_offset;
++ unsigned long pages_total;
++ pfn_t start_pfn;
++
++ struct dax_device *dax_dev;
++ struct block_device *bdev;
++ struct bch_nvmpg_set *set;
++
++ struct mutex lock;
++};
++
++/*
++ * A set of namespaces. Currently only one set can be supported.
++ */
++struct bch_nvmpg_set {
++ unsigned char set_uuid[16];
++
++ int heads_size;
++ int heads_used;
++ struct bch_nvmpg_set_header *set_header;
++
++ struct bch_nvmpg_ns *ns_tbl[BCH_NVMPG_NS_MAX];
++ int total_ns;
++ int attached_ns;
++
++ struct mutex lock;
++};
++
++#define BCH_NVMPG_NS_ID_BITS 3
++#define BCH_NVMPG_OFFSET_BITS 61
++#define BCH_NVMPG_NS_ID_MASK ((1UL<<BCH_NVMPG_NS_ID_BITS) - 1)
++#define BCH_NVMPG_OFFSET_MASK ((1UL<<BCH_NVMPG_OFFSET_BITS) - 1)
++
++#define BCH_NVMPG_GET_NS_ID(offset) \
++ (((offset) >> BCH_NVMPG_OFFSET_BITS) & BCH_NVMPG_NS_ID_MASK)
++
++#define BCH_NVMPG_GET_OFFSET(offset) ((offset) & BCH_NVMPG_OFFSET_MASK)
++
++#define BCH_NVMPG_OFFSET(ns_id, offset) \
++ ((((ns_id) & BCH_NVMPG_NS_ID_MASK) << BCH_NVMPG_OFFSET_BITS) | \
++ ((offset) & BCH_NVMPG_OFFSET_MASK))
++
++/* Indicate which field in bch_nvmpg_sb to be updated */
++#define BCH_NVMPG_TOTAL_NS 0 /* total_ns */
++
++void *bch_nvmpg_offset_to_ptr(unsigned long offset);
++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr);
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++
++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path);
++int bch_nvmpg_init(void);
++void bch_nvmpg_exit(void);
++
++#else
++
++static inline struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
++{
++ return NULL;
++}
++
++static inline int bch_nvmpg_init(void)
++{
++ return 0;
++}
++
++static inline void bch_nvmpg_exit(void) { }
++
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
++#endif /* _BCACHE_NVM_PAGES_H */
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 86b9e355c583..74d51a0b806f 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -14,6 +14,7 @@
+ #include "request.h"
+ #include "writeback.h"
+ #include "features.h"
++#include "nvmpg.h"
+
+ #include <linux/blkdev.h>
+ #include <linux/pagemap.h>
+@@ -2818,6 +2819,7 @@ static void bcache_exit(void)
+ {
+ bch_debug_exit();
+ bch_request_exit();
++ bch_nvmpg_exit();
+ if (bcache_kobj)
+ kobject_put(bcache_kobj);
+ if (bcache_wq)
+@@ -2916,6 +2918,7 @@ static int __init bcache_init(void)
+
+ bch_debug_init();
+ closure_debug_init();
++ bch_nvmpg_init();
+
+ bcache_is_reboot = false;
+
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0003-bcache-initialization-of-the-buddy.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0003-bcache-initialization-of-the-buddy.patch
new file mode 100644
index 0000000..9adcb46
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/v13-0003-bcache-initialization-of-the-buddy.patch
@@ -0,0 +1,359 @@
+From ef9ee14f2d7b1dd38f8aebf190e9ed1527f688c2 Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Thu, 21 Oct 2021 19:45:57 +0800
+Subject: [PATCH v13 03/12] bcache: initialization of the buddy
+
+This nvm pages allocator implements a simple buddy allocator to
+manage the nvm address space. This patch initializes this buddy
+allocator for a new namespace.
+
+The unit of alloc/free in the buddy allocator is a page. DAX devices
+have their own struct page (in DRAM or PMEM).
+
+ struct { /* ZONE_DEVICE pages */
+ /** @pgmap: Points to the hosting device page map. */
+ struct dev_pagemap *pgmap;
+ void *zone_device_data;
+ /*
+ * ZONE_DEVICE private pages are counted as being
+ * mapped so the next 3 words hold the mapping, index,
+ * and private fields from the source anonymous or
+ * page cache page while the page is migrated to device
+ * private memory.
+ * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
+ * use the mapping, index, and private fields when
+ * pmem backed DAX files are mapped.
+ */
+ };
+
+ZONE_DEVICE pages only use pgmap; the other 4 words [16/32 bytes] are
+unused. So the second/third words are reused as a 'struct list_head'
+to link the page into a buddy free list. The fourth word (normally
+struct page::index) stores pgoff, the page offset in the dax device.
+The fifth word (normally struct page::private) stores the buddy order,
+and page_type is used to store the buddy flags.
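+
+Put together, linking a free nvm page into a buddy free list looks
+roughly like the sketch below (an illustrative helper that mirrors the
+code in bch_nvmpg_init_free_space(), not part of this patch),
+
+	static void buddy_link_page(struct bch_nvmpg_ns *ns,
+				    struct page *page, pgoff_t pgoff, int order)
+	{
+		page->index = pgoff;		/* page offset in the dax device */
+		set_page_private(page, order);	/* buddy order */
+		__SetPageBuddy(page);		/* buddy flag via page_type */
+		/* the 2nd/3rd words (zone_device_data) act as a list_head */
+		list_add((struct list_head *)&page->zone_device_data,
+			 &ns->free_area[order]);
+	}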
+
+Reported-by: kernel test robot <lkp@intel.com>
+Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/nvmpg.c | 212 +++++++++++++++++++++++++++++++++++++-
+ drivers/md/bcache/nvmpg.h | 12 +++
+ 2 files changed, 221 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index b654bbbda03e..2b70ee4a6028 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -50,6 +50,36 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr)
+ return BCH_NVMPG_OFFSET(ns_id, offset);
+ }
+
++static struct page *bch_nvmpg_va_to_pg(void *addr)
++{
++ return virt_to_page(addr);
++}
++
++static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff)
++{
++ return ns->base_addr + (pgoff << PAGE_SHIFT);
++}
++
++static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r)
++{
++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id];
++ pgoff_t pgoff = r->pgoff;
++
++ return bch_nvmpg_pgoff_to_ptr(ns, pgoff);
++}
++
++static inline void reserve_nvmpg_pages(struct bch_nvmpg_ns *ns,
++ pgoff_t pgoff, u64 nr)
++{
++ while (nr > 0) {
++ unsigned int num = nr > UINT_MAX ? UINT_MAX : nr;
++
++ bitmap_set(ns->pages_bitmap, pgoff, num);
++ nr -= num;
++ pgoff += num;
++ }
++}
++
+ static void release_ns_tbl(struct bch_nvmpg_set *set)
+ {
+ int i;
+@@ -58,6 +88,10 @@ static void release_ns_tbl(struct bch_nvmpg_set *set)
+ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
+ ns = set->ns_tbl[i];
+ if (ns) {
++ kvfree(ns->pages_bitmap);
++ if (ns->recs_bitmap)
++ bitmap_free(ns->recs_bitmap);
++
+ fs_put_dax(ns->dax_dev);
+ blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ set->ns_tbl[i] = NULL;
+@@ -76,10 +110,73 @@ static void release_nvmpg_set(struct bch_nvmpg_set *set)
+ kfree(set);
+ }
+
++static int validate_recs(int ns_id,
++ struct bch_nvmpg_head *head,
++ struct bch_nvmpg_recs *recs)
++{
++ if (memcmp(recs->magic, bch_nvmpg_recs_magic, 16)) {
++ pr_err("Invalid bch_nvmpg_recs magic\n");
++ return -EINVAL;
++ }
++
++ if (memcmp(recs->uuid, head->uuid, 16)) {
++ pr_err("Invalid bch_nvmpg_recs uuid\n");
++ return -EINVAL;
++ }
++
++ if (recs->head_offset !=
++ bch_nvmpg_ptr_to_offset(global_nvmpg_set->ns_tbl[ns_id], head)) {
++ pr_err("Invalid recs head_offset\n");
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
++static int reserve_nvmpg_recs(struct bch_nvmpg_recs *recs)
++{
++ int i, used = 0;
++
++ for (i = 0; i < recs->size; i++) {
++ struct bch_nvmpg_rec *r = &recs->recs[i];
++ struct bch_nvmpg_ns *ns;
++ struct page *page;
++ void *addr;
++
++ if (r->pgoff == 0)
++ continue;
++
++ ns = global_nvmpg_set->ns_tbl[r->ns_id];
++ addr = bch_nvmpg_rec_to_ptr(r);
++ if (addr < ns->base_addr) {
++ pr_err("Invalid recorded address\n");
++ return -EINVAL;
++ }
++
++ /* init struct page: index/private */
++ page = bch_nvmpg_va_to_pg(addr);
++ set_page_private(page, r->order);
++ page->index = r->pgoff;
++
++ reserve_nvmpg_pages(ns, r->pgoff, 1L << r->order);
++ used++;
++ }
++
++ if (used != recs->used) {
++ pr_err("used %d doesn't match recs->used %d\n",
++ used, recs->used);
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
+ /* Namespace 0 contains all meta data of the nvmpg allocation set */
+ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns)
+ {
+ struct bch_nvmpg_set_header *set_header;
++ struct bch_nvmpg_recs *sys_recs;
++ int i, j, used = 0, rc = 0;
+
+ if (ns->ns_id != 0) {
+ pr_err("unexpected ns_id %u for first nvmpg namespace.\n",
+@@ -93,9 +190,83 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns)
+ global_nvmpg_set->set_header = set_header;
+ global_nvmpg_set->heads_size = set_header->size;
+ global_nvmpg_set->heads_used = set_header->used;
++
++ /* Reserve the used space from buddy allocator */
++ reserve_nvmpg_pages(ns, 0, div_u64(ns->pages_offset, ns->page_size));
++
++ sys_recs = ns->base_addr + BCH_NVMPG_SYSRECS_OFFSET;
++ for (i = 0; i < set_header->size; i++) {
++ struct bch_nvmpg_head *head;
++
++ head = &set_header->heads[i];
++ if (head->state == BCH_NVMPG_HD_STAT_FREE)
++ continue;
++
++ used++;
++ if (used > global_nvmpg_set->heads_size) {
++ pr_err("used heads %d > heads size %d.\n",
++ used, global_nvmpg_set->heads_size);
++ goto unlock;
++ }
++
++ for (j = 0; j < BCH_NVMPG_NS_MAX; j++) {
++ struct bch_nvmpg_recs *recs;
++
++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[j]);
++
++ /* Iterate the recs list */
++ while (recs) {
++ rc = validate_recs(j, head, recs);
++ if (rc < 0)
++ goto unlock;
++
++ rc = reserve_nvmpg_recs(recs);
++ if (rc < 0)
++ goto unlock;
++
++ bitmap_set(ns->recs_bitmap, recs - sys_recs, 1);
++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
++ }
++ }
++ }
++unlock:
+ mutex_unlock(&global_nvmpg_set->lock);
++ return rc;
++}
+
+- return 0;
++static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns)
++{
++ unsigned int start, end, pages;
++ int i;
++ struct page *page;
++ pgoff_t pgoff_start;
++
++ bitmap_for_each_clear_region(ns->pages_bitmap,
++ start, end, 0, ns->pages_total) {
++ pgoff_start = start;
++ pages = end - start;
++
++ while (pages) {
++ void *addr;
++
++ for (i = BCH_MAX_ORDER - 1; i >= 0; i--) {
++ if ((pgoff_start % (1L << i) == 0) &&
++ (pages >= (1L << i)))
++ break;
++ }
++
++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start);
++ page = bch_nvmpg_va_to_pg(addr);
++ set_page_private(page, i);
++ page->index = pgoff_start;
++ __SetPageBuddy(page);
++ list_add((struct list_head *)&page->zone_device_data,
++ &ns->free_area[i]);
++
++ pgoff_start += 1L << i;
++ pages -= 1L << i;
++ }
++ }
+ }
+
+ static int attach_nvmpg_set(struct bch_nvmpg_ns *ns)
+@@ -200,7 +371,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
+ char buf[BDEVNAME_SIZE];
+ struct block_device *bdev;
+ pgoff_t pgoff;
+- int id, err;
++ int id, i, err;
+ char *path;
+ long dax_ret = 0;
+
+@@ -304,13 +475,48 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
+
+ mutex_init(&ns->lock);
+
++ /*
++	 * The parameters of bitmap_set/clear are unsigned int.
++	 * Given that the current nvm size is far from exceeding this limit,
++	 * only add a WARN_ON message here.
++ */
++ WARN_ON(BITS_TO_LONGS(ns->pages_total) > UINT_MAX);
++ ns->pages_bitmap = kvcalloc(BITS_TO_LONGS(ns->pages_total),
++ sizeof(unsigned long), GFP_KERNEL);
++ if (!ns->pages_bitmap) {
++ err = -ENOMEM;
++ goto clear_ns_nr;
++ }
++
++ if (ns->sb->this_ns == 0) {
++ ns->recs_bitmap =
++ bitmap_zalloc(BCH_MAX_PGALLOC_RECS, GFP_KERNEL);
++ if (ns->recs_bitmap == NULL) {
++ err = -ENOMEM;
++ goto free_pages_bitmap;
++ }
++ }
++
++ for (i = 0; i < BCH_MAX_ORDER; i++)
++ INIT_LIST_HEAD(&ns->free_area[i]);
++
+ err = init_nvmpg_set_header(ns);
+ if (err < 0)
+- goto free_ns;
++ goto free_recs_bitmap;
++
++ if (ns->sb->this_ns == 0)
++ /* init buddy allocator */
++ bch_nvmpg_init_free_space(ns);
+
+ kfree(path);
+ return ns;
+
++free_recs_bitmap:
++ bitmap_free(ns->recs_bitmap);
++free_pages_bitmap:
++ kvfree(ns->pages_bitmap);
++clear_ns_nr:
++ global_nvmpg_set->ns_tbl[sb->this_ns] = NULL;
+ free_ns:
+ fs_put_dax(ns->dax_dev);
+ kfree(ns);
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index 698c890b2d15..55778d4db7da 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -11,6 +11,8 @@
+ * Bcache NVDIMM in memory data structures
+ */
+
++#define BCH_MAX_ORDER 20
++
+ /*
+ * The following three structures in memory records which page(s) allocated
+ * to which owner. After reboot from power failure, they will be initialized
+@@ -28,6 +30,11 @@ struct bch_nvmpg_ns {
+ unsigned long pages_total;
+ pfn_t start_pfn;
+
++ unsigned long *pages_bitmap;
++ struct list_head free_area[BCH_MAX_ORDER];
++
++ unsigned long *recs_bitmap;
++
+ struct dax_device *dax_dev;
+ struct block_device *bdev;
+ struct bch_nvmpg_set *set;
+@@ -69,6 +76,11 @@ struct bch_nvmpg_set {
+ /* Indicate which field in bch_nvmpg_sb to be updated */
+ #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */
+
++#define BCH_MAX_PGALLOC_RECS \
++ (min_t(unsigned int, 64, \
++ (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \
++ sizeof(struct bch_nvmpg_recs)))
++
+ void *bch_nvmpg_offset_to_ptr(unsigned long offset);
+ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr);
+
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch
new file mode 100644
index 0000000..ef13f6e
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/v13-0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch
@@ -0,0 +1,308 @@
+From b09e24d84a7ae11be4bd7255648ebd5006678029 Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Wed, 4 Aug 2021 22:41:20 +0800
+Subject: [PATCH v13 04/12] bcache: bch_nvmpg_alloc_pages() of the buddy
+
+This patch implements bch_nvmpg_alloc_pages() of the nvm pages buddy
+allocator. Functionally it is similar to the current page buddy
+allocator, but with the following differences:
+a: it needs an owner_uuid parameter which records the owner info, and
+it makes that info persistent.
+b: it doesn't need flags like GFP_*; all allocations are equal.
+c: it doesn't trigger other operations such as swap/recycle.
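+
+A caller-side sketch (owner_uuid is assumed to be the caller's 16-byte
+uuid; not part of this patch),
+
+	unsigned long nvmpg_offset;
+	void *buf;
+
+	/* allocate 1 << 2 = 4 contiguous nvm pages for this owner */
+	nvmpg_offset = bch_nvmpg_alloc_pages(2, owner_uuid);
+	if (!nvmpg_offset)
+		return -ENOMEM;
+	buf = bch_nvmpg_offset_to_ptr(nvmpg_offset);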
+
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/nvmpg.c | 221 ++++++++++++++++++++++++++++++++++++++
+ drivers/md/bcache/nvmpg.h | 9 ++
+ 2 files changed, 230 insertions(+)
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index 2b70ee4a6028..a920779eb548 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -42,6 +42,11 @@ void *bch_nvmpg_offset_to_ptr(unsigned long offset)
+ return NULL;
+ }
+
++static unsigned long bch_nvmpg_offset_to_pgoff(unsigned long nvmpg_offset)
++{
++ return BCH_NVMPG_GET_OFFSET(nvmpg_offset) >> PAGE_SHIFT;
++}
++
+ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr)
+ {
+ int ns_id = ns->ns_id;
+@@ -60,6 +65,15 @@ static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff)
+ return ns->base_addr + (pgoff << PAGE_SHIFT);
+ }
+
++static unsigned long bch_nvmpg_pgoff_to_offset(struct bch_nvmpg_ns *ns,
++ pgoff_t pgoff)
++{
++ int ns_id = ns->ns_id;
++ unsigned long offset = pgoff << PAGE_SHIFT;
++
++ return BCH_NVMPG_OFFSET(ns_id, offset);
++}
++
+ static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r)
+ {
+ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id];
+@@ -269,6 +283,213 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns)
+ }
+ }
+
++
++/* If not found, it will create if create == true */
++static struct bch_nvmpg_head *find_nvmpg_head(const char *uuid, bool create)
++{
++ struct bch_nvmpg_set_header *set_header = global_nvmpg_set->set_header;
++ struct bch_nvmpg_head *head = NULL;
++ int i;
++
++ if (set_header == NULL)
++ goto out;
++
++ for (i = 0; i < set_header->size; i++) {
++ struct bch_nvmpg_head *h = &set_header->heads[i];
++
++ if (h->state != BCH_NVMPG_HD_STAT_ALLOC)
++ continue;
++
++ if (!memcmp(uuid, h->uuid, 16)) {
++ head = h;
++ break;
++ }
++ }
++
++ if (!head && create) {
++ u32 used = set_header->used;
++
++ if (set_header->size > used) {
++ head = &set_header->heads[used];
++ memset(head, 0, sizeof(struct bch_nvmpg_head));
++ head->state = BCH_NVMPG_HD_STAT_ALLOC;
++ memcpy(head->uuid, uuid, 16);
++ global_nvmpg_set->heads_used++;
++ set_header->used++;
++ } else
++ pr_info("No free bch_nvmpg_head\n");
++ }
++
++out:
++ return head;
++}
++
++static struct bch_nvmpg_recs *find_empty_nvmpg_recs(void)
++{
++ unsigned int start;
++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[0];
++ struct bch_nvmpg_recs *recs;
++
++ start = bitmap_find_next_zero_area(ns->recs_bitmap,
++ BCH_MAX_PGALLOC_RECS, 0, 1, 0);
++ if (start > BCH_MAX_PGALLOC_RECS) {
++ pr_info("No free struct bch_nvmpg_recs\n");
++ return NULL;
++ }
++
++ bitmap_set(ns->recs_bitmap, start, 1);
++ recs = (struct bch_nvmpg_recs *)
++ bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET)
++ + start;
++
++ memset(recs, 0, sizeof(struct bch_nvmpg_recs));
++ return recs;
++}
++
++
++static struct bch_nvmpg_recs *find_nvmpg_recs(struct bch_nvmpg_ns *ns,
++ struct bch_nvmpg_head *head,
++ bool create)
++{
++ int ns_id = ns->sb->this_ns;
++ struct bch_nvmpg_recs *prev_recs = NULL, *recs = NULL;
++
++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[ns_id]);
++
++	/* If create=false, return the first recs in the list (may be NULL) */
++ if (!create)
++ return recs;
++
++ /*
++	 * If create=true, it means we need an empty struct bch_nvmpg_rec slot,
++	 * so we should find a struct bch_nvmpg_recs which is not full, or
++	 * allocate a new struct bch_nvmpg_recs, and return that bch_nvmpg_recs.
++ */
++ while (recs && (recs->used == recs->size)) {
++ prev_recs = recs;
++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
++ }
++
++	/* Found a struct bch_nvmpg_recs with free record slots */
++ if (recs)
++ return recs;
++
++ /* Need alloc new struct bch_nvmpg_recs */
++ recs = find_empty_nvmpg_recs();
++ if (recs) {
++ unsigned long offset;
++
++ recs->next_offset = 0;
++ recs->head_offset = bch_nvmpg_ptr_to_offset(ns, head);
++ memcpy(recs->magic, bch_nvmpg_recs_magic, 16);
++ memcpy(recs->uuid, head->uuid, 16);
++ recs->size = BCH_NVMPG_MAX_RECS;
++ recs->used = 0;
++
++ offset = bch_nvmpg_ptr_to_offset(ns, recs);
++ if (prev_recs)
++ prev_recs->next_offset = offset;
++ else
++ head->recs_offset[ns_id] = offset;
++ }
++
++ return recs;
++}
++
++static void add_nvmpg_rec(struct bch_nvmpg_ns *ns,
++ struct bch_nvmpg_recs *recs,
++ unsigned long nvmpg_offset,
++ int order)
++{
++ int i, ns_id;
++ unsigned long pgoff;
++
++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset);
++ ns_id = ns->sb->this_ns;
++
++ for (i = 0; i < recs->size; i++) {
++ if (recs->recs[i].pgoff == 0) {
++ recs->recs[i].pgoff = pgoff;
++ recs->recs[i].order = order;
++ recs->recs[i].ns_id = ns_id;
++ recs->used++;
++ break;
++ }
++ }
++ BUG_ON(i == recs->size);
++}
++
++
++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
++{
++ unsigned long nvmpg_offset = 0;
++ struct bch_nvmpg_head *head;
++ int n, o;
++
++ mutex_lock(&global_nvmpg_set->lock);
++ head = find_nvmpg_head(uuid, true);
++
++ if (!head) {
++ pr_err("Cannot find bch_nvmpg_recs by uuid.\n");
++ goto unlock;
++ }
++
++ for (n = 0; n < global_nvmpg_set->total_ns; n++) {
++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[n];
++
++ if (!ns || (ns->free < (1L << order)))
++ continue;
++
++ for (o = order; o < BCH_MAX_ORDER; o++) {
++ struct list_head *list;
++ struct page *page, *buddy_page;
++
++ if (list_empty(&ns->free_area[o]))
++ continue;
++
++ list = ns->free_area[o].next;
++ page = container_of((void *)list, struct page,
++ zone_device_data);
++
++ list_del(list);
++
++ while (o != order) {
++ void *addr;
++ pgoff_t pgoff;
++
++ pgoff = page->index + (1L << (o - 1));
++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff);
++ buddy_page = bch_nvmpg_va_to_pg(addr);
++ set_page_private(buddy_page, o - 1);
++ buddy_page->index = pgoff;
++ __SetPageBuddy(buddy_page);
++ list_add((struct list_head *)&buddy_page->zone_device_data,
++ &ns->free_area[o - 1]);
++ o--;
++ }
++
++ set_page_private(page, order);
++ __ClearPageBuddy(page);
++ ns->free -= 1L << order;
++ nvmpg_offset = bch_nvmpg_pgoff_to_offset(ns, page->index);
++ break;
++ }
++
++ if (o < BCH_MAX_ORDER) {
++ struct bch_nvmpg_recs *recs;
++
++ recs = find_nvmpg_recs(ns, head, true);
++			/* ToDo: handle recs == NULL */
++ add_nvmpg_rec(ns, recs, nvmpg_offset, order);
++ break;
++ }
++ }
++
++unlock:
++ mutex_unlock(&global_nvmpg_set->lock);
++ return nvmpg_offset;
++}
++
+ static int attach_nvmpg_set(struct bch_nvmpg_ns *ns)
+ {
+ struct bch_nvmpg_sb *sb = ns->sb;
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index 55778d4db7da..d03f3241b45a 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -76,6 +76,9 @@ struct bch_nvmpg_set {
+ /* Indicate which field in bch_nvmpg_sb to be updated */
+ #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */
+
++#define BCH_PGOFF_TO_KVADDR(pgoff) \
++ ((void *)((unsigned long)(pgoff) << PAGE_SHIFT))
++
+ #define BCH_MAX_PGALLOC_RECS \
+ (min_t(unsigned int, 64, \
+ (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \
+@@ -89,6 +92,7 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr);
+ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path);
+ int bch_nvmpg_init(void);
+ void bch_nvmpg_exit(void);
++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid);
+
+ #else
+
+@@ -104,6 +108,11 @@ static inline int bch_nvmpg_init(void)
+
+ static inline void bch_nvmpg_exit(void) { }
+
++static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
++{
++ return 0;
++}
++
+ #endif /* CONFIG_BCACHE_NVM_PAGES */
+
+ #endif /* _BCACHE_NVM_PAGES_H */
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocat.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocat.patch
new file mode 100644
index 0000000..fd631ae
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/v13-0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocat.patch
@@ -0,0 +1,252 @@
+From 1f1fd2517b0a3520ab3a78cabe737cfb1f628d2e Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Thu, 21 Oct 2021 19:06:35 +0800
+Subject: [PATCH v13 05/12] bcache: bch_nvmpg_free_pages() of the buddy
+ allocator
+
+This patch implements the bch_nvmpg_free_pages() of the buddy allocator.
+
+The difference between this and the page buddy free path is that
+it needs the owner_uuid to free the owner's allocated pages, and the
+ownership records must stay persistent after the free.
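+
+Paired with the allocation sketch from the previous patch, a free looks
+like (owner_uuid is again assumed to be the caller's 16-byte uuid),
+
+	/* return the 1 << 2 pages at nvmpg_offset back to the buddy */
+	bch_nvmpg_free_pages(nvmpg_offset, 2, owner_uuid);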
+
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/nvmpg.c | 164 ++++++++++++++++++++++++++++++++++++--
+ drivers/md/bcache/nvmpg.h | 3 +
+ 2 files changed, 160 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index a920779eb548..8ce0c4389b42 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -248,6 +248,57 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns)
+ return rc;
+ }
+
++static void __free_space(struct bch_nvmpg_ns *ns, unsigned long nvmpg_offset,
++ int order)
++{
++ unsigned long add_pages = (1L << order);
++ pgoff_t pgoff;
++ struct page *page;
++ void *va;
++
++ if (nvmpg_offset == 0) {
++ pr_err("free pages on offset 0\n");
++ return;
++ }
++
++ page = bch_nvmpg_va_to_pg(bch_nvmpg_offset_to_ptr(nvmpg_offset));
++ WARN_ON((!page) || (page->private != order));
++ pgoff = page->index;
++
++ while (order < BCH_MAX_ORDER - 1) {
++ struct page *buddy_page;
++
++ pgoff_t buddy_pgoff = pgoff ^ (1L << order);
++ pgoff_t parent_pgoff = pgoff & ~(1L << order);
++
++ if ((parent_pgoff + (1L << (order + 1)) > ns->pages_total))
++ break;
++
++ va = bch_nvmpg_pgoff_to_ptr(ns, buddy_pgoff);
++ buddy_page = bch_nvmpg_va_to_pg(va);
++ WARN_ON(!buddy_page);
++
++ if (PageBuddy(buddy_page) && (buddy_page->private == order)) {
++ list_del((struct list_head *)&buddy_page->zone_device_data);
++ __ClearPageBuddy(buddy_page);
++ pgoff = parent_pgoff;
++ order++;
++ continue;
++ }
++ break;
++ }
++
++ va = bch_nvmpg_pgoff_to_ptr(ns, pgoff);
++ page = bch_nvmpg_va_to_pg(va);
++ WARN_ON(!page);
++ list_add((struct list_head *)&page->zone_device_data,
++ &ns->free_area[order]);
++ page->index = pgoff;
++ set_page_private(page, order);
++ __SetPageBuddy(page);
++ ns->free += add_pages;
++}
++
+ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns)
+ {
+ unsigned int start, end, pages;
+@@ -261,21 +312,19 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns)
+ pages = end - start;
+
+ while (pages) {
+- void *addr;
+-
+ for (i = BCH_MAX_ORDER - 1; i >= 0; i--) {
+ if ((pgoff_start % (1L << i) == 0) &&
+ (pages >= (1L << i)))
+ break;
+ }
+
+- addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start);
+- page = bch_nvmpg_va_to_pg(addr);
++ page = bch_nvmpg_va_to_pg(
++ bch_nvmpg_pgoff_to_ptr(ns, pgoff_start));
+ set_page_private(page, i);
+ page->index = pgoff_start;
+- __SetPageBuddy(page);
+- list_add((struct list_head *)&page->zone_device_data,
+- &ns->free_area[i]);
++
++ /* In order to update ns->free */
++ __free_space(ns, pgoff_start, i);
+
+ pgoff_start += 1L << i;
+ pages -= 1L << i;
+@@ -490,6 +539,106 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
+ return nvmpg_offset;
+ }
+
++static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns)
++{
++ return ns->base_addr + (ns->pages_total << PAGE_SHIFT);
++}
++
++static inline bool in_nvmpg_ns_range(struct bch_nvmpg_ns *ns,
++ void *start_addr, void *end_addr)
++{
++ return (start_addr >= ns->base_addr) && (end_addr < nvm_end_addr(ns));
++}
++
++static int remove_nvmpg_rec(struct bch_nvmpg_recs *recs, int ns_id,
++ unsigned long nvmpg_offset, int order)
++{
++ struct bch_nvmpg_head *head;
++ struct bch_nvmpg_recs *prev_recs, *sys_recs;
++ struct bch_nvmpg_ns *ns;
++ unsigned long pgoff;
++ int i;
++
++ ns = global_nvmpg_set->ns_tbl[0];
++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset);
++
++ head = bch_nvmpg_offset_to_ptr(recs->head_offset);
++ prev_recs = recs;
++ sys_recs = bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET);
++ while (recs) {
++ for (i = 0; i < recs->size; i++) {
++ struct bch_nvmpg_rec *rec = &(recs->recs[i]);
++
++ if ((rec->pgoff == pgoff) && (rec->ns_id == ns_id)) {
++ WARN_ON(rec->order != order);
++ rec->_v = 0;
++ recs->used--;
++
++ if (recs->used == 0) {
++ int recs_pos = recs - sys_recs;
++
++ if (recs == prev_recs)
++ head->recs_offset[ns_id] =
++ recs->next_offset;
++ else
++ prev_recs->next_offset =
++ recs->next_offset;
++
++ recs->next_offset = 0;
++ recs->head_offset = 0;
++
++ bitmap_clear(ns->recs_bitmap, recs_pos, 1);
++ }
++ goto out;
++ }
++ }
++ prev_recs = recs;
++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
++ }
++out:
++ return (recs ? 0 : -ENOENT);
++}
++
++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order,
++ const char *uuid)
++{
++ struct bch_nvmpg_ns *ns;
++ struct bch_nvmpg_head *head;
++ struct bch_nvmpg_recs *recs;
++ int r;
++
++ mutex_lock(&global_nvmpg_set->lock);
++
++ ns = global_nvmpg_set->ns_tbl[BCH_NVMPG_GET_NS_ID(nvmpg_offset)];
++ if (!ns) {
++		pr_err("can't find namespace for the given nvmpg offset\n");
++ goto unlock;
++ }
++
++ head = find_nvmpg_head(uuid, false);
++ if (!head) {
++		pr_err("can't find bch_nvmpg_head by uuid\n");
++ goto unlock;
++ }
++
++ recs = find_nvmpg_recs(ns, head, false);
++ if (!recs) {
++ pr_err("can't find bch_nvmpg_recs by uuid\n");
++ goto unlock;
++ }
++
++ r = remove_nvmpg_rec(recs, ns->sb->this_ns, nvmpg_offset, order);
++ if (r < 0) {
++ pr_err("can't find bch_nvmpg_rec\n");
++ goto unlock;
++ }
++
++ __free_space(ns, nvmpg_offset, order);
++
++unlock:
++ mutex_unlock(&global_nvmpg_set->lock);
++}
++
+ static int attach_nvmpg_set(struct bch_nvmpg_ns *ns)
+ {
+ struct bch_nvmpg_sb *sb = ns->sb;
+@@ -686,6 +835,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
+ ns->pages_offset = sb->pages_offset;
+ ns->pages_total = sb->pages_total;
+ ns->sb = sb;
++	/* increased by __free_space() */
+ ns->free = 0;
+ ns->bdev = bdev;
+ ns->set = global_nvmpg_set;
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index d03f3241b45a..e089936e7f13 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -93,6 +93,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path);
+ int bch_nvmpg_init(void);
+ void bch_nvmpg_exit(void);
+ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid);
++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid);
+
+ #else
+
+@@ -113,6 +114,8 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
+ return 0;
+ }
+
++static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { }
++
+ #endif /* CONFIG_BCACHE_NVM_PAGES */
+
+ #endif /* _BCACHE_NVM_PAGES_H */
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0006-bcache-get-recs-list-head-for-allocated-pages-by.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0006-bcache-get-recs-list-head-for-allocated-pages-by.patch
new file mode 100644
index 0000000..f055b17
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/v13-0006-bcache-get-recs-list-head-for-allocated-pages-by.patch
@@ -0,0 +1,66 @@
+From 953f817e496a1a74b9a8403800bf1d7f0f5b4aeb Mon Sep 17 00:00:00 2001
+From: Jianpeng Ma <jianpeng.ma@intel.com>
+Date: Thu, 21 Oct 2021 21:06:03 +0800
+Subject: [PATCH v13 06/12] bcache: get recs list head for allocated pages by
+ specific uuid
+
+This patch implements bch_get_nvmpg_head() of the buddy allocator
+to get the recs list head of the pages allocated for a specific
+uuid. Then the requester (owner) can find all previously allocated
+nvdimm pages by iterating the recs list.
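+
+A rough recovery-time sketch of such an iteration (illustrative only,
+owner_uuid is assumed to be the requester's 16-byte uuid),
+
+	struct bch_nvmpg_head *head = bch_get_nvmpg_head(owner_uuid);
+	struct bch_nvmpg_recs *recs;
+	int i;
+
+	if (!head)
+		return;		/* nothing was allocated before */
+
+	recs = bch_nvmpg_offset_to_ptr(head->recs_offset[0]);
+	while (recs) {
+		for (i = 0; i < recs->size; i++) {
+			struct bch_nvmpg_rec *r = &recs->recs[i];
+
+			if (r->pgoff)	/* one previously allocated range */
+				pr_info("ns %u pgoff %llu order %u\n",
+					(unsigned int)r->ns_id,
+					(unsigned long long)r->pgoff,
+					(unsigned int)r->order);
+		}
+		recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
+	}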
+
+Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
+Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+---
+ drivers/md/bcache/nvmpg.c | 5 +++++
+ drivers/md/bcache/nvmpg.h | 6 ++++++
+ 2 files changed, 11 insertions(+)
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index 8ce0c4389b42..e26c7b578a62 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -539,6 +539,11 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
+ return nvmpg_offset;
+ }
+
++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid)
++{
++ return find_nvmpg_head(uuid, false);
++}
++
+ static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns)
+ {
+ return ns->base_addr + (ns->pages_total << PAGE_SHIFT);
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index e089936e7f13..2361cabf18be 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -94,6 +94,7 @@ int bch_nvmpg_init(void);
+ void bch_nvmpg_exit(void);
+ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid);
+ void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid);
++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid);
+
+ #else
+
+@@ -116,6 +117,11 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid)
+
+ static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { }
+
++static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid)
++{
++ return NULL;
++}
++
+ #endif /* CONFIG_BCACHE_NVM_PAGES */
+
+ #endif /* _BCACHE_NVM_PAGES_H */
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-.patch
new file mode 100644
index 0000000..4ae5f06
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/v13-0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-.patch
@@ -0,0 +1,48 @@
+From 566cc2016c7e817b8306db96d97c3e4cdbc254df Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Fri, 25 Jun 2021 00:17:02 +0800
+Subject: [PATCH v13 07/12] bcache: use bucket index to set GC_MARK_METADATA
+ for journal buckets in bch_btree_gc_finish()
+
+Currently the meta data bucket locations on the cache device are still
+reserved even after the meta data is stored on NVDIMM pages, to keep the
+meta data layout consistent for now. So these buckets are still marked
+as meta data by SET_GC_MARK() in bch_btree_gc_finish().
+
+When BCH_FEATURE_INCOMPAT_NVDIMM_META is set, sb.d[] stores linear
+addresses of NVDIMM pages and not bucket indexes anymore. Therefore we
+should avoid looking up bucket indexes from sb.d[], and directly use the
+bucket indexes from ca->sb.first_bucket to (ca->sb.first_bucket +
+ca->sb.njournal_buckets) for setting the gc mark of the journal buckets.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/btree.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
+index 88c573eeb598..1a0ff117373f 100644
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -1761,8 +1761,10 @@ static void bch_btree_gc_finish(struct cache_set *c)
+ ca = c->cache;
+ ca->invalidate_needs_gc = 0;
+
+- for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
+- SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
++ /* Range [first_bucket, first_bucket + keys) is for journal buckets */
++ for (i = ca->sb.first_bucket;
++ i < ca->sb.first_bucket + ca->sb.njournal_buckets; i++)
++ SET_GC_MARK(ca->buckets + i, GC_MARK_METADATA);
+
+ for (k = ca->prio_buckets;
+ k < ca->prio_buckets + prio_buckets(ca) * 2; k++)
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into.patch
new file mode 100644
index 0000000..1e0fb3b
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/v13-0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into.patch
@@ -0,0 +1,60 @@
+From 5da7b9cfe8c6344a6a4271bf3878d22ba87f4398 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Fri, 25 Jun 2021 00:18:31 +0800
+Subject: [PATCH v13 08/12] bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into
+ incompat feature set
+
+This patch adds BCH_FEATURE_INCOMPAT_NVDIMM_META (value 0x0004) into the
+incompat feature set. When this bit is set by bcache-tools, it indicates
+that bcache meta data should be stored on a specific NVDIMM meta device.
+
+The bcache meta data mainly includes the journal and btree nodes. When this
+bit is set in the incompat feature set, bcache will ask the nvm-pages
+allocator for NVDIMM space to store the meta data.
+
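+As a sketch of the intended use (based on the bch_has_feature_nvdimm_meta()
+helper generated by BCH_FEATURE_INCOMPAT_FUNCS() in this patch; the two
+called functions are hypothetical placeholders), later patches simply gate
+the NVDIMM code paths on this bit:
+
+  if (bch_has_feature_nvdimm_meta(&ca->sb))
+          store_meta_on_nvdimm_pages();  /* hypothetical placeholder */
+  else
+          store_meta_on_cache_device();  /* hypothetical placeholder */
+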
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/features.h | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h
+index 09161b89c63e..fab92678be76 100644
+--- a/drivers/md/bcache/features.h
++++ b/drivers/md/bcache/features.h
+@@ -18,11 +18,19 @@
+ #define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET 0x0001
+ /* real bucket size is (1 << bucket_size) */
+ #define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE 0x0002
++/* store bcache meta data on nvdimm */
++#define BCH_FEATURE_INCOMPAT_NVDIMM_META 0x0004
+
+ #define BCH_FEATURE_COMPAT_SUPP 0
+ #define BCH_FEATURE_RO_COMPAT_SUPP 0
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++#define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \
++ BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE| \
++ BCH_FEATURE_INCOMPAT_NVDIMM_META)
++#else
+ #define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \
+ BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE)
++#endif
+
+ #define BCH_HAS_COMPAT_FEATURE(sb, mask) \
+ ((sb)->feature_compat & (mask))
+@@ -90,6 +98,7 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \
+
+ BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET);
+ BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE);
++BCH_FEATURE_INCOMPAT_FUNCS(nvdimm_meta, NVDIMM_META);
+
+ static inline bool bch_has_unknown_compat_features(struct cache_sb *sb)
+ {
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0009-bcache-initialize-bcache-journal-for-NVDIMM-meta.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0009-bcache-initialize-bcache-journal-for-NVDIMM-meta.patch
new file mode 100644
index 0000000..3e63f08
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/v13-0009-bcache-initialize-bcache-journal-for-NVDIMM-meta.patch
@@ -0,0 +1,255 @@
+From 6795c385696ab16a78e7b9cce7310a50a2522af5 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 21 Oct 2021 21:39:18 +0800
+Subject: [PATCH v13 09/12] bcache: initialize bcache journal for NVDIMM meta
+ device
+
+The nvm-pages allocator may store and index the NVDIMM pages allocated
+for the bcache journal. This patch adds the initialization to store bcache
+journal space on NVDIMM pages if the BCH_FEATURE_INCOMPAT_NVDIMM_META bit
+is set by bcache-tools.
+
+If BCH_FEATURE_INCOMPAT_NVDIMM_META is set, get_journal_nvmpg_space()
+will return the nvmpg_offset of NVDIMM pages for the bcache journal,
+- If there is previously allocated space, find it in the nvm-pages owner
+  list and return it to bch_journal_init().
+- If there is no previously allocated space, request a new NVDIMM range
+  from the nvm-pages allocator, and return it to bch_journal_init().
+
+And in bch_journal_init(), each key sb.d[i] stores the corresponding nvmpg
+offset from NVDIMM, where 'i' is the bucket index used to iterate all
+journal buckets.
+
+Later when the bcache journaling code stores a jset, the target NVDIMM
+nvmpg offset stored (and updated) in sb.d[i] can be used to calculate the
+linear address for the memory copy from DRAM pages into NVDIMM pages.
+
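+As a sketch of the resulting layout (the loop mirrors the initialization
+code added below; the 512KiB bucket size is only an assumed example value),
+the journal slots are laid out back to back inside the allocated NVDIMM
+range:
+
+  for (i = 0; i < ca->sb.keys; i++)
+          ca->sb.d[i] = jnl_base + bucket_bytes(ca) * i;
+
+  /* e.g. with 512KiB buckets: d[0] = jnl_base, d[1] = jnl_base + 0x80000, ... */
+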
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/journal.c | 113 ++++++++++++++++++++++++++++++++++++
+ drivers/md/bcache/journal.h | 2 +-
+ drivers/md/bcache/nvmpg.c | 9 +++
+ drivers/md/bcache/nvmpg.h | 1 +
+ drivers/md/bcache/super.c | 18 +++---
+ 5 files changed, 132 insertions(+), 11 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index 61bd79babf7a..d887557c718e 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -9,6 +9,8 @@
+ #include "btree.h"
+ #include "debug.h"
+ #include "extents.h"
++#include "nvmpg.h"
++#include "features.h"
+
+ #include <trace/events/bcache.h>
+
+@@ -982,3 +984,114 @@ int bch_journal_alloc(struct cache_set *c)
+
+ return 0;
+ }
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++
++static unsigned long find_journal_nvmpg_base(struct bch_nvmpg_head *nvmpg_head,
++ struct cache *ca)
++{
++ unsigned long jnl_offset, jnl_pgoff, jnl_ns_id;
++ unsigned long ret_offset = 0;
++ int i;
++
++ jnl_offset = (unsigned long)ca->sb.d[0];
++ jnl_ns_id = BCH_NVMPG_GET_NS_ID(jnl_offset);
++ jnl_pgoff = BCH_NVMPG_GET_OFFSET(jnl_offset) >> PAGE_SHIFT;
++
++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
++ struct bch_nvmpg_recs *recs;
++ struct bch_nvmpg_rec *rec;
++ unsigned long recs_offset = 0;
++ int j;
++
++ recs_offset = nvmpg_head->recs_offset[i];
++ recs = bch_nvmpg_offset_to_ptr(recs_offset);
++ while (recs) {
++ for (j = 0; j < recs->size; j++) {
++ rec = &recs->recs[j];
++ if ((rec->pgoff != jnl_pgoff) ||
++ (rec->ns_id != jnl_ns_id))
++ continue;
++
++ ret_offset = jnl_offset;
++ goto out;
++ }
++ recs_offset = recs->next_offset;
++ recs = bch_nvmpg_offset_to_ptr(recs_offset);
++ }
++ }
++
++out:
++ return ret_offset;
++}
++
++static unsigned long get_journal_nvmpg_space(struct cache *ca)
++{
++ struct bch_nvmpg_head *head = NULL;
++ unsigned long nvmpg_offset;
++ int order;
++
++ head = bch_get_nvmpg_head(ca->sb.set_uuid);
++ if (head) {
++ nvmpg_offset = find_journal_nvmpg_base(head, ca);
++ if (nvmpg_offset)
++ goto found;
++ }
++
++ order = ilog2((ca->sb.bucket_size *
++ ca->sb.njournal_buckets) / PAGE_SECTORS);
++ nvmpg_offset = bch_nvmpg_alloc_pages(order, ca->sb.set_uuid);
++ if (nvmpg_offset)
++ memset(bch_nvmpg_offset_to_ptr(nvmpg_offset),
++ 0, (1 << order) * PAGE_SIZE);
++found:
++ return nvmpg_offset;
++}
++
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
++static int __bch_journal_nvdimm_init(struct cache *ca)
++{
++ int ret = -1;
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ int i;
++ unsigned long jnl_base = 0;
++
++ jnl_base = get_journal_nvmpg_space(ca);
++ if (!jnl_base) {
++ pr_err("Failed to get journal space from nvdimm\n");
++ goto out;
++ }
++
++ /* Initialized and reloaded from on-disk super block already */
++ if (ca->sb.d[0] != 0)
++ goto out;
++
++ for (i = 0; i < ca->sb.keys; i++)
++ ca->sb.d[i] = jnl_base + (bucket_bytes(ca) * i);
++
++ ret = 0;
++out:
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
++ return ret;
++}
++
++
++int bch_journal_init(struct cache_set *c)
++{
++ int i, ret = 0;
++ struct cache *ca = c->cache;
++
++ ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
++ 2, SB_JOURNAL_BUCKETS);
++
++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) {
++ for (i = 0; i < ca->sb.keys; i++)
++ ca->sb.d[i] = ca->sb.first_bucket + i;
++ } else
++ ret = __bch_journal_nvdimm_init(ca);
++
++ return ret;
++}
+diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
+index f2ea34d5f431..e3a7fa5a8fda 100644
+--- a/drivers/md/bcache/journal.h
++++ b/drivers/md/bcache/journal.h
+@@ -179,7 +179,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list);
+ void bch_journal_meta(struct cache_set *c, struct closure *cl);
+ int bch_journal_read(struct cache_set *c, struct list_head *list);
+ int bch_journal_replay(struct cache_set *c, struct list_head *list);
+-
++int bch_journal_init(struct cache_set *c);
+ void bch_journal_free(struct cache_set *c);
+ int bch_journal_alloc(struct cache_set *c);
+
+diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c
+index e26c7b578a62..1a3c6327b091 100644
+--- a/drivers/md/bcache/nvmpg.c
++++ b/drivers/md/bcache/nvmpg.c
+@@ -24,6 +24,15 @@
+
+ struct bch_nvmpg_set *global_nvmpg_set;
+
++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id)
++{
++ if ((ns_id >= 0) && (ns_id < BCH_NVMPG_NS_MAX))
++ return global_nvmpg_set->ns_tbl[ns_id];
++
++ pr_emerg("Invalid ns_id: %d\n", ns_id);
++ return NULL;
++}
++
+ void *bch_nvmpg_offset_to_ptr(unsigned long offset)
+ {
+ int ns_id = BCH_NVMPG_GET_NS_ID(offset);
+diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h
+index 2361cabf18be..f7b7177cced3 100644
+--- a/drivers/md/bcache/nvmpg.h
++++ b/drivers/md/bcache/nvmpg.h
+@@ -95,6 +95,7 @@ void bch_nvmpg_exit(void);
+ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid);
+ void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid);
+ struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid);
++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id);
+
+ #else
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 74d51a0b806f..a27fa65d8832 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -147,9 +147,11 @@ static const char *read_super_common(struct cache_sb *sb, struct block_device *
+ goto err;
+
+ err = "Journal buckets not sequential";
+- for (i = 0; i < sb->keys; i++)
+- if (sb->d[i] != sb->first_bucket + i)
+- goto err;
++ if (!bch_has_feature_nvdimm_meta(sb)) {
++ for (i = 0; i < sb->keys; i++)
++ if (sb->d[i] != sb->first_bucket + i)
++ goto err;
++ }
+
+ err = "Too many journal buckets";
+ if (sb->first_bucket + sb->keys > sb->nbuckets)
+@@ -2068,14 +2070,10 @@ static int run_cache_set(struct cache_set *c)
+ if (bch_journal_replay(c, &journal))
+ goto err;
+ } else {
+- unsigned int j;
+-
+ pr_notice("invalidating existing data\n");
+- ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
+- 2, SB_JOURNAL_BUCKETS);
+-
+- for (j = 0; j < ca->sb.keys; j++)
+- ca->sb.d[j] = ca->sb.first_bucket + j;
++ err = "error initializing journal";
++ if (bch_journal_init(c))
++ goto err;
+
+ bch_initial_gc_finish(c);
+
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0010-bcache-support-storing-bcache-journal-into-NVDIM.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0010-bcache-support-storing-bcache-journal-into-NVDIM.patch
new file mode 100644
index 0000000..977fff6
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/v13-0010-bcache-support-storing-bcache-journal-into-NVDIM.patch
@@ -0,0 +1,231 @@
+From 04919917230c65aa07f65a57a136f7994b017faf Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 24 Jul 2021 00:45:23 +0800
+Subject: [PATCH v13 10/12] bcache: support storing bcache journal into NVDIMM
+ meta device
+
+This patch implements two methods to store the bcache journal,
+1) __journal_write_unlocked() for a block interface device
+   The legacy method to compose a bio and issue the jset bio to the cache
+   device (e.g. SSD). c->journal.key.ptr[0] indicates the LBA on the cache
+   device to store the journal jset.
+2) __journal_nvdimm_write_unlocked() for a memory interface NVDIMM
+   Use the memory interface to access NVDIMM pages and store the jset by
+   memcpy_flushcache(). c->journal.key.ptr[0] indicates the linear
+   address of the NVDIMM pages to store the journal jset.
+
+For a legacy configuration without an NVDIMM meta device, journal I/O is
+handled by __journal_write_unlocked() with the existing code logic. If the
+NVDIMM meta device is used (set up by bcache-tools), the journal I/O will
+be handled by __journal_nvdimm_write_unlocked() and go into the NVDIMM
+pages.
+
+And when the NVDIMM meta device is used, sb.d[] stores the linear addresses
+of NVDIMM pages (no longer bucket indexes), and in journal_reclaim() the
+journaling location in c->journal.key.ptr[0] should also be updated with a
+linear address of NVDIMM pages (no longer an LBA combined from the sector
+offset and bucket index).
+
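+A minimal sketch of the resulting dispatch (mirroring the code added below,
+not a complete implementation):
+
+  if (!bch_has_feature_nvdimm_meta(&ca->sb))
+          __journal_write_unlocked(c);        /* compose and submit the jset bio */
+  else
+          __journal_nvdimm_write_unlocked(c); /* memcpy_flushcache() into NVDIMM */
+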
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/journal.c | 120 +++++++++++++++++++++++++-----------
+ drivers/md/bcache/super.c | 3 +-
+ 2 files changed, 85 insertions(+), 38 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index d887557c718e..7d5c5ed18890 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -596,6 +596,8 @@ static void do_journal_discard(struct cache *ca)
+ return;
+ }
+
++ BUG_ON(bch_has_feature_nvdimm_meta(&ca->sb));
++
+ switch (atomic_read(&ja->discard_in_flight)) {
+ case DISCARD_IN_FLIGHT:
+ return;
+@@ -661,9 +663,16 @@ static void journal_reclaim(struct cache_set *c)
+ goto out;
+
+ ja->cur_idx = next;
+- k->ptr[0] = MAKE_PTR(0,
+- bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
+- ca->sb.nr_this_dev);
++ if (!bch_has_feature_nvdimm_meta(&ca->sb))
++ k->ptr[0] = MAKE_PTR(0,
++ bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
++ ca->sb.nr_this_dev);
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ else
++ k->ptr[0] = (unsigned long)bch_nvmpg_offset_to_ptr(
++ ca->sb.d[ja->cur_idx]);
++#endif
++
+ atomic_long_inc(&c->reclaimed_journal_buckets);
+
+ bkey_init(k);
+@@ -729,46 +738,21 @@ static void journal_write_unlock(struct closure *cl)
+ spin_unlock(&c->journal.lock);
+ }
+
+-static void journal_write_unlocked(struct closure *cl)
++
++static void __journal_write_unlocked(struct cache_set *c)
+ __releases(c->journal.lock)
+ {
+- struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+- struct cache *ca = c->cache;
+- struct journal_write *w = c->journal.cur;
+ struct bkey *k = &c->journal.key;
+- unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) *
+- ca->sb.block_size;
+-
++ struct journal_write *w = c->journal.cur;
++ struct closure *cl = &c->journal.io;
++ struct cache *ca = c->cache;
+ struct bio *bio;
+ struct bio_list list;
++ unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) *
++ ca->sb.block_size;
+
+ bio_list_init(&list);
+
+- if (!w->need_write) {
+- closure_return_with_destructor(cl, journal_write_unlock);
+- return;
+- } else if (journal_full(&c->journal)) {
+- journal_reclaim(c);
+- spin_unlock(&c->journal.lock);
+-
+- btree_flush_write(c);
+- continue_at(cl, journal_write, bch_journal_wq);
+- return;
+- }
+-
+- c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca));
+-
+- w->data->btree_level = c->root->level;
+-
+- bkey_copy(&w->data->btree_root, &c->root->key);
+- bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
+-
+- w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
+- w->data->magic = jset_magic(&ca->sb);
+- w->data->version = BCACHE_JSET_VERSION;
+- w->data->last_seq = last_seq(&c->journal);
+- w->data->csum = csum_set(w->data);
+-
+ for (i = 0; i < KEY_PTRS(k); i++) {
+ ca = c->cache;
+ bio = &ca->journal.bio;
+@@ -793,7 +777,6 @@ static void journal_write_unlocked(struct closure *cl)
+
+ ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
+ }
+-
+ /* If KEY_PTRS(k) == 0, this jset gets lost in air */
+ BUG_ON(i == 0);
+
+@@ -805,6 +788,71 @@ static void journal_write_unlocked(struct closure *cl)
+
+ while ((bio = bio_list_pop(&list)))
+ closure_bio_submit(c, bio, cl);
++}
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++
++static void __journal_nvdimm_write_unlocked(struct cache_set *c)
++ __releases(c->journal.lock)
++{
++ struct journal_write *w = c->journal.cur;
++ struct cache *ca = c->cache;
++ unsigned int sectors;
++
++ sectors = set_blocks(w->data, block_bytes(ca)) * ca->sb.block_size;
++ atomic_long_add(sectors, &ca->meta_sectors_written);
++
++ memcpy_flushcache((void *)c->journal.key.ptr[0], w->data, sectors << 9);
++
++ c->journal.key.ptr[0] += sectors << 9;
++ ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
++
++ atomic_dec_bug(&fifo_back(&c->journal.pin));
++ bch_journal_next(&c->journal);
++ journal_reclaim(c);
++
++ spin_unlock(&c->journal.lock);
++}
++
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
++static void journal_write_unlocked(struct closure *cl)
++{
++ struct cache_set *c = container_of(cl, struct cache_set, journal.io);
++ struct cache *ca = c->cache;
++ struct journal_write *w = c->journal.cur;
++
++ if (!w->need_write) {
++ closure_return_with_destructor(cl, journal_write_unlock);
++ return;
++ } else if (journal_full(&c->journal)) {
++ journal_reclaim(c);
++ spin_unlock(&c->journal.lock);
++
++ btree_flush_write(c);
++ continue_at(cl, journal_write, bch_journal_wq);
++ return;
++ }
++
++ c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca));
++
++ w->data->btree_level = c->root->level;
++
++ bkey_copy(&w->data->btree_root, &c->root->key);
++ bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
++
++ w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
++ w->data->magic = jset_magic(&ca->sb);
++ w->data->version = BCACHE_JSET_VERSION;
++ w->data->last_seq = last_seq(&c->journal);
++ w->data->csum = csum_set(w->data);
++
++ if (!bch_has_feature_nvdimm_meta(&ca->sb))
++ __journal_write_unlocked(c);
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ else
++ __journal_nvdimm_write_unlocked(c);
++#endif
+
+ continue_at(cl, journal_write_done, NULL);
+ }
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index a27fa65d8832..45b69ddc9cfa 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -1679,7 +1679,7 @@ void bch_cache_set_release(struct kobject *kobj)
+ static void cache_set_free(struct closure *cl)
+ {
+ struct cache_set *c = container_of(cl, struct cache_set, cl);
+- struct cache *ca;
++ struct cache *ca = c->cache;
+
+ debugfs_remove(c->debug);
+
+@@ -1691,7 +1691,6 @@ static void cache_set_free(struct closure *cl)
+ bch_bset_sort_state_free(&c->sort);
+ free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb)));
+
+- ca = c->cache;
+ if (ca) {
+ ca->set = NULL;
+ c->cache = NULL;
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0011-bcache-read-jset-from-NVDIMM-pages-for-journal-r.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0011-bcache-read-jset-from-NVDIMM-pages-for-journal-r.patch
new file mode 100644
index 0000000..77ca2b5
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/v13-0011-bcache-read-jset-from-NVDIMM-pages-for-journal-r.patch
@@ -0,0 +1,182 @@
+From 2e1f37377d63412b139e8aa55a8731bf95c91767 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 24 Jul 2021 00:54:12 +0800
+Subject: [PATCH v13 11/12] bcache: read jset from NVDIMM pages for journal
+ replay
+
+This patch implements two methods to read jset from media for journal
+replay,
+- __jnl_rd_bkt() for block device
+ This is the legacy method to read jset via block device interface.
+- __jnl_rd_nvm_bkt() for NVDIMM
+  This is the method to read the jset via the NVDIMM memory interface, i.e.
+  memcpy() from NVDIMM pages to DRAM pages.
+
+If BCH_FEATURE_INCOMPAT_NVDIMM_META is set in the incompat feature set,
+while the cache set is running, journal_read_bucket() will read the journal
+content from NVDIMM by __jnl_rd_nvm_bkt(). The linear addresses of the
+NVDIMM pages to read the jset from are stored in sb.d[SB_JOURNAL_BUCKETS],
+which were initialized and maintained in previous runs of the cache set.
+
+One thing to notice is that when bch_journal_read() is called, the linear
+addresses of the NVDIMM pages are not loaded and initialized yet, so it is
+necessary to call __bch_journal_nvdimm_init() before reading the jset
+from NVDIMM pages.
+
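+A minimal sketch of the NVDIMM read path (mirroring __jnl_rd_nvm_bkt() added
+below): it is just a memory copy from the linear address of the NVDIMM pages,
+
+  jset_addr = bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9);
+  memcpy(ca->set->journal.w[0].data, jset_addr, len << 9);
+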
+The code comment added in journal_read_bucket() addresses an issue noticed
+by the kernel test robot and Dan Carpenter. It explains why it is safe to
+only check the !bch_has_feature_nvdimm_meta() condition in the if()
+statement when CONFIG_BCACHE_NVM_PAGES is not configured, to avoid
+confusion from the bogus warning message of the static checking tool.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reported-by: kernel test robot <lkp@intel.com>
+Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/journal.c | 88 ++++++++++++++++++++++++++++++-------
+ 1 file changed, 71 insertions(+), 17 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index 7d5c5ed18890..902992be9191 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -34,18 +34,60 @@ static void journal_read_endio(struct bio *bio)
+ closure_put(cl);
+ }
+
++static struct jset *__jnl_rd_bkt(struct cache *ca, unsigned int bkt_idx,
++ unsigned int len, unsigned int offset,
++ struct closure *cl)
++{
++ sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bkt_idx]);
++ struct bio *bio = &ca->journal.bio;
++ struct jset *data = ca->set->journal.w[0].data;
++
++ bio_reset(bio);
++ bio->bi_iter.bi_sector = bucket + offset;
++ bio_set_dev(bio, ca->bdev);
++ bio->bi_iter.bi_size = len << 9;
++
++ bio->bi_end_io = journal_read_endio;
++ bio->bi_private = cl;
++ bio_set_op_attrs(bio, REQ_OP_READ, 0);
++ bch_bio_map(bio, data);
++
++ closure_bio_submit(ca->set, bio, cl);
++ closure_sync(cl);
++
++ /* Indeed journal.w[0].data */
++ return data;
++}
++
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++
++static struct jset *__jnl_rd_nvm_bkt(struct cache *ca, unsigned int bkt_idx,
++ unsigned int len, unsigned int offset)
++{
++ void *jset_addr;
++ struct jset *data;
++
++ jset_addr = bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9);
++ data = ca->set->journal.w[0].data;
++
++ memcpy(data, jset_addr, len << 9);
++
++ /* Indeed journal.w[0].data */
++ return data;
++}
++
++#endif /* CONFIG_BCACHE_NVM_PAGES */
++
+ static int journal_read_bucket(struct cache *ca, struct list_head *list,
+ unsigned int bucket_index)
+ {
+ struct journal_device *ja = &ca->journal;
+- struct bio *bio = &ja->bio;
+
+ struct journal_replay *i;
+- struct jset *j, *data = ca->set->journal.w[0].data;
++ struct jset *j;
+ struct closure cl;
+ unsigned int len, left, offset = 0;
+ int ret = 0;
+- sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
+
+ closure_init_stack(&cl);
+
+@@ -55,26 +97,27 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
+ reread: left = ca->sb.bucket_size - offset;
+ len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS);
+
+- bio_reset(bio);
+- bio->bi_iter.bi_sector = bucket + offset;
+- bio_set_dev(bio, ca->bdev);
+- bio->bi_iter.bi_size = len << 9;
+-
+- bio->bi_end_io = journal_read_endio;
+- bio->bi_private = &cl;
+- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+- bch_bio_map(bio, data);
+-
+- closure_bio_submit(ca->set, bio, &cl);
+- closure_sync(&cl);
++ if (!bch_has_feature_nvdimm_meta(&ca->sb))
++ j = __jnl_rd_bkt(ca, bucket_index, len, offset, &cl);
++ /*
++ * If CONFIG_BCACHE_NVM_PAGES is not defined, the feature bit
++ * BCH_FEATURE_INCOMPAT_NVDIMM_META won't be in the incompatible
++ * support feature set, and a cache device formatted with feature bit
++ * BCH_FEATURE_INCOMPAT_NVDIMM_META will fail much earlier in
++ * read_super() by bch_has_unknown_incompat_features().
++ * Therefore when CONFIG_BCACHE_NVM_PAGES is not defined, it is
++ * safe to ignore the bch_has_feature_nvdimm_meta() condition.
++ */
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ else
++ j = __jnl_rd_nvm_bkt(ca, bucket_index, len, offset);
++#endif
+
+ /* This function could be simpler now since we no longer write
+ * journal entries that overlap bucket boundaries; this means
+ * the start of a bucket will always have a valid journal entry
+ * if it has any journal entries at all.
+ */
+-
+- j = data;
+ while (len) {
+ struct list_head *where;
+ size_t blocks, bytes = set_bytes(j);
+@@ -170,6 +213,8 @@ reread: left = ca->sb.bucket_size - offset;
+ return ret;
+ }
+
++static int __bch_journal_nvdimm_init(struct cache *ca);
++
+ int bch_journal_read(struct cache_set *c, struct list_head *list)
+ {
+ #define read_bucket(b) \
+@@ -188,6 +233,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
+ unsigned int i, l, r, m;
+ uint64_t seq;
+
++ /*
++ * Linear addresses of NVDIMM pages for journaling are not
++ * initialized yet, do it before reading jsets from NVDIMM pages.
++ */
++ if (bch_has_feature_nvdimm_meta(&ca->sb)) {
++ if (__bch_journal_nvdimm_init(ca) < 0)
++ return -ENXIO;
++ }
++
+ bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
+ pr_debug("%u journal buckets\n", ca->sb.njournal_buckets);
+
+--
+2.31.1
+
diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0012-bcache-add-sysfs-interface-register_nvdimm_meta-.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0012-bcache-add-sysfs-interface-register_nvdimm_meta-.patch
new file mode 100644
index 0000000..b2f0330
--- /dev/null
+++ b/for-next/nvmpg-bcache-journaling-v13/v13-0012-bcache-add-sysfs-interface-register_nvdimm_meta-.patch
@@ -0,0 +1,84 @@
+From e1f37c78f682ca8d7d0dee51ee8a0ee884f92df5 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 24 Jul 2021 00:55:25 +0800
+Subject: [PATCH v13 12/12] bcache: add sysfs interface register_nvdimm_meta to
+ register NVDIMM meta device
+
+This patch adds a sysfs interface register_nvdimm_meta to register an
+NVDIMM meta device. The sysfs interface file only shows up when
+CONFIG_BCACHE_NVM_PAGES=y. Then an NVDIMM namespace formatted by
+bcache-tools can be registered into bcache by e.g.,
+ echo /dev/pmem0 > /sys/fs/bcache/register_nvdimm_meta
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Jianpeng Ma <jianpeng.ma@intel.com>
+Cc: Qiaowei Ren <qiaowei.ren@intel.com>
+---
+ drivers/md/bcache/super.c | 29 +++++++++++++++++++++++++++++
+ 1 file changed, 29 insertions(+)
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 45b69ddc9cfa..2b9cde44879b 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -2405,10 +2405,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
+ struct kobj_attribute *attr,
+ const char *buffer, size_t size);
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++static ssize_t register_nvdimm_meta(struct kobject *k,
++ struct kobj_attribute *attr,
++ const char *buffer, size_t size);
++#endif
+
+ kobj_attribute_write(register, register_bcache);
+ kobj_attribute_write(register_quiet, register_bcache);
+ kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup);
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++kobj_attribute_write(register_nvdimm_meta, register_nvdimm_meta);
++#endif
+
+ static bool bch_is_open_backing(dev_t dev)
+ {
+@@ -2522,6 +2530,24 @@ static void register_device_async(struct async_reg_args *args)
+ queue_delayed_work(system_wq, &args->reg_work, 10);
+ }
+
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++static ssize_t register_nvdimm_meta(struct kobject *k, struct kobj_attribute *attr,
++ const char *buffer, size_t size)
++{
++ ssize_t ret = size;
++
++ struct bch_nvmpg_ns *ns = bch_register_namespace(buffer);
++
++ if (IS_ERR(ns)) {
++ pr_err("register nvdimm namespace %s for meta device failed.\n",
++ buffer);
++ ret = -EINVAL;
++ }
++
++ return ret;
++}
++#endif
++
+ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+ const char *buffer, size_t size)
+ {
+@@ -2864,6 +2890,9 @@ static int __init bcache_init(void)
+ static const struct attribute *files[] = {
+ &ksysfs_register.attr,
+ &ksysfs_register_quiet.attr,
++#if defined(CONFIG_BCACHE_NVM_PAGES)
++ &ksysfs_register_nvdimm_meta.attr,
++#endif
+ &ksysfs_pendings_cleanup.attr,
+ NULL
+ };
+--
+2.31.1
+
diff --git a/for-test/0001-bcache-avoid-unnecessary-soft-lockup-in-kworker-upda.patch b/for-test/0001-bcache-avoid-unnecessary-soft-lockup-in-kworker-upda.patch
new file mode 100644
index 0000000..d2727ea
--- /dev/null
+++ b/for-test/0001-bcache-avoid-unnecessary-soft-lockup-in-kworker-upda.patch
@@ -0,0 +1,166 @@
+From 8ddc4c14ecef71ebc56d86ad0fd6721d348898d0 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 29 Mar 2022 00:08:49 +0800
+Subject: [PATCH] bcache: avoid unnecessary soft lockup in kworker
+ update_writeback_rate()
+
+The kworker routine update_writeback_rate() is scheduled to update the
+writeback rate every 5 seconds by default. Before calling
+__update_writeback_rate() to do the real job, the semaphore
+dc->writeback_lock should be held by the kworker routine.
+
+At the same time, the bcache writeback thread routine bch_writeback_thread()
+also needs to hold dc->writeback_lock before flushing dirty data back
+into the backing device. If the dirty data set is large, it might take a
+very long time for bch_writeback_thread() to scan all dirty buckets and
+release dc->writeback_lock. In such a case update_writeback_rate() can be
+starved for long enough that the kernel reports a soft lockup warning
+started like:
+ watchdog: BUG: soft lockup - CPU#246 stuck for 23s! [kworker/246:31:179713]
+
+Such a soft lockup condition is unnecessary, because after the writeback
+thread finishes its job and releases dc->writeback_lock, the kworker
+update_writeback_rate() may continue to work and everything is fine
+indeed.
+
+This patch avoids the unnecessary soft lockup by the following method,
+- Add new members to struct cached_dev
+  - dc->retry_nr (0 by default)
+  - dc->retry_max (6 by default)
+- In update_writeback_rate(), call down_read_trylock(&dc->writeback_lock)
+  first; if it fails then lock contention has happened. If dc->retry_nr is
+  smaller than dc->retry_max, increase dc->retry_nr by 1, and reschedule
+  the kworker to retry after a longer delay.
+- If lock contention happens and dc->retry_nr is equal to dc->retry_max,
+  do not retry anymore and call down_read(&dc->writeback_lock) to wait for
+  the lock.
+
+By the above method, in the worst case update_writeback_rate() may retry
+for 2+ minutes before blocking on dc->writeback_lock by calling down_read().
+For a 4TB cache device with 1TB dirty data, 90%+ of the unnecessary soft
+lockup warning messages can be avoided.
+
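+As a rough sketch of where the "2+ minutes" above comes from (assuming the
+default 5 second update interval and the default dc->retry_max of 6; the
+reschedule delay is scaled by (1 + retry_nr) as done in this patch):
+
+  #include <stdio.h>
+
+  int main(void)
+  {
+          int update_secs = 5; /* writeback_rate_update_seconds default */
+          int retry_max = 6;   /* dc->retry_max default */
+          int retry_nr, total = 0;
+
+          for (retry_nr = 1; retry_nr <= retry_max; retry_nr++)
+                  total += update_secs * (1 + retry_nr);
+
+          /* prints 135, i.e. about 2 minutes 15 seconds */
+          printf("worst-case retry time: %d seconds\n", total);
+          return 0;
+  }
+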
+When retrying to acquire dc->writeback_lock in update_writeback_rate(), of
+course the writeback rate cannot be updated. This is fair, because when the
+kworker is blocked on the lock contention of dc->writeback_lock, the
+writeback rate cannot be updated either.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/bcache.h | 7 +++++
+ drivers/md/bcache/writeback.c | 49 +++++++++++++++++++++++++++++++----
+ 2 files changed, 51 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
+index 9ed9c955add7..82b86b874294 100644
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -395,6 +395,13 @@ struct cached_dev {
+ atomic_t io_errors;
+ unsigned int error_limit;
+ unsigned int offline_seconds;
++
++ /*
++ * Retry to update writeback_rate if contention happens for
++ * down_read(dc->writeback_lock) in update_writeback_rate()
++ */
++ unsigned int retry_nr;
++ unsigned int retry_max;
+ };
+
+ enum alloc_reserve {
+diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
+index 9ee0005874cd..dbe90b9b2940 100644
+--- a/drivers/md/bcache/writeback.c
++++ b/drivers/md/bcache/writeback.c
+@@ -214,6 +214,7 @@ static void update_writeback_rate(struct work_struct *work)
+ struct cached_dev,
+ writeback_rate_update);
+ struct cache_set *c = dc->disk.c;
++ bool contention = false;
+
+ /*
+ * should check BCACHE_DEV_RATE_DW_RUNNING before calling
+@@ -235,6 +236,7 @@ static void update_writeback_rate(struct work_struct *work)
+ return;
+ }
+
++
+ if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
+ /*
+ * If the whole cache set is idle, set_at_max_writeback_rate()
+@@ -243,13 +245,44 @@ static void update_writeback_rate(struct work_struct *work)
+ * in maximum writeback rate number(s).
+ */
+ if (!set_at_max_writeback_rate(c, dc)) {
+- down_read(&dc->writeback_lock);
+- __update_writeback_rate(dc);
+- update_gc_after_writeback(c);
+- up_read(&dc->writeback_lock);
++ /*
++ * When contention happens on dc->writeback_lock with
++ * the writeback thread, this kworker may be blocked for
++ * a very long time if there is too much dirty data to
++ * write back, and the kernel may complain with a (bogus)
++ * soft lockup warning message. To avoid potential
++ * starving, if down_read_trylock() fails, writeback
++ * rate updating will be skipped for at most dc->retry_max
++ * times, while delaying this worker a bit longer each time.
++ * If dc->retry_max times are tried and the trylock
++ * still fails, then call down_read() to wait for
++ * dc->writeback_lock.
++ */
++ if (!down_read_trylock((&dc->writeback_lock))) {
++ contention = true;
++
++ if (dc->retry_nr < dc->retry_max) {
++ dc->retry_nr++;
++ } else {
++ down_read(&dc->writeback_lock);
++ dc->retry_nr = 0;
++ }
++ }
++
++ if (!dc->retry_nr) {
++ __update_writeback_rate(dc);
++ update_gc_after_writeback(c);
++ up_read(&dc->writeback_lock);
++ }
+ }
+ }
+
++ /*
++ * Reset the retry counter if no lock contention has happened since the
++ * last retry, e.g. the cache is clean or I/O has been idle for a while.
++ */
++ if (!contention && dc->retry_nr)
++ dc->retry_nr = 0;
+
+ /*
+ * CACHE_SET_IO_DISABLE might be set via sysfs interface,
+@@ -257,8 +290,10 @@ static void update_writeback_rate(struct work_struct *work)
+ */
+ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) &&
+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
++ unsigned int scale = 1 + dc->retry_nr;
++
+ schedule_delayed_work(&dc->writeback_rate_update,
+- dc->writeback_rate_update_seconds * HZ);
++ dc->writeback_rate_update_seconds * scale * HZ);
+ }
+
+ /*
+@@ -1032,6 +1067,10 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
+ dc->writeback_rate_fp_term_high = 1000;
+ dc->writeback_rate_i_term_inverse = 10000;
+
++ /* For dc->writeback_lock contention in update_writeback_rate() */
++ dc->retry_nr = 0;
++ dc->retry_max = 6;
++
+ WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
+ INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
+ }
+--
+2.34.1
+
diff --git a/for-test/badblocks/v4/backup/0001-badblocks-add-more-helper-structure-and-routines-in-.patch b/for-test/badblocks/v4/backup/0001-badblocks-add-more-helper-structure-and-routines-in-.patch
new file mode 100644
index 0000000..b1b4bae
--- /dev/null
+++ b/for-test/badblocks/v4/backup/0001-badblocks-add-more-helper-structure-and-routines-in-.patch
@@ -0,0 +1,92 @@
+From db29a2e95f4ad4ec1ba58a71203a60ebd867d8c9 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 1 Mar 2021 10:57:26 +0800
+Subject: [PATCH 1/6] badblocks: add more helper structure and routines in
+ badblocks.h
+
+This patch adds the following helper structure and routines into
+badblocks.h,
+- struct badblocks_context
+  This structure is used in the improved badblocks code for bad table
+  iteration.
+- BB_END()
+  The macro to calculate the end LBA of a bad range record from the bad
+  table.
+- badblocks_full() and badblocks_empty()
+  The inline routines to check whether the bad table is full or empty.
+- set_changed() and clear_changed()
+  The inline routines to set and clear the 'changed' tag in struct
+  badblocks.
+
+This new helper structure and these routines help make the code clearer;
+they will be used in the improved badblocks code in the following
+patches.
+
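+For illustration only (a user-space sketch, not part of this patch; the
+mask values below are assumptions matching the encoding used by BB_MAKE()),
+this is how a bad range is packed into a single u64 and read back:
+
+  #include <stdio.h>
+  #include <stdint.h>
+
+  /* assumed encoding: bits 0-8 = len - 1, bits 9-62 = offset, bit 63 = ack */
+  #define BB_OFFSET_MASK 0x7ffffffffffffe00ULL
+  #define BB_LEN_MASK    0x00000000000001ffULL
+  #define BB_ACK_MASK    0x8000000000000000ULL
+  #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
+  #define BB_LEN(x)    (((x) & BB_LEN_MASK) + 1)
+  #define BB_ACK(x)    (!!((x) & BB_ACK_MASK))
+  #define BB_END(x)    (BB_OFFSET(x) + BB_LEN(x))
+  #define BB_MAKE(a, l, ack) \
+          (((uint64_t)(a) << 9) | ((l) - 1) | ((uint64_t)(!!(ack)) << 63))
+
+  int main(void)
+  {
+          uint64_t rec = BB_MAKE(128, 16, 1); /* 16 bad sectors at LBA 128, acked */
+
+          /* prints: offset 128 len 16 end 144 ack 1 */
+          printf("offset %llu len %llu end %llu ack %d\n",
+                 (unsigned long long)BB_OFFSET(rec),
+                 (unsigned long long)BB_LEN(rec),
+                 (unsigned long long)BB_END(rec),
+                 (int)BB_ACK(rec));
+          return 0;
+  }
+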
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ include/linux/badblocks.h | 32 ++++++++++++++++++++++++++++++++
+ 1 file changed, 32 insertions(+)
+
+diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h
+index 2426276b9bd3..e1a06bacb2a2 100644
+--- a/include/linux/badblocks.h
++++ b/include/linux/badblocks.h
+@@ -15,6 +15,7 @@
+ #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
+ #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
+ #define BB_ACK(x) (!!((x) & BB_ACK_MASK))
++#define BB_END(x) (BB_OFFSET(x) + BB_LEN(x))
+ #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
+
+ /* Bad block numbers are stored sorted in a single page.
+@@ -41,6 +42,14 @@ struct badblocks {
+ sector_t size; /* in sectors */
+ };
+
++struct badblocks_context {
++ sector_t start;
++ sector_t len;
++ int ack;
++ sector_t orig_start;
++ sector_t orig_len;
++};
++
+ int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors);
+ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+@@ -63,4 +72,27 @@ static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb)
+ }
+ badblocks_exit(bb);
+ }
++
++static inline int badblocks_full(struct badblocks *bb)
++{
++ return (bb->count >= MAX_BADBLOCKS);
++}
++
++static inline int badblocks_empty(struct badblocks *bb)
++{
++ return (bb->count == 0);
++}
++
++static inline void set_changed(struct badblocks *bb)
++{
++ if (bb->changed != 1)
++ bb->changed = 1;
++}
++
++static inline void clear_changed(struct badblocks *bb)
++{
++ if (bb->changed != 0)
++ bb->changed = 0;
++}
++
+ #endif
+--
+2.31.1
+
diff --git a/for-test/badblocks/v4/backup/0002-badblocks-add-helper-routines-for-badblock-ranges-ha.patch b/for-test/badblocks/v4/backup/0002-badblocks-add-helper-routines-for-badblock-ranges-ha.patch
new file mode 100644
index 0000000..62198ee
--- /dev/null
+++ b/for-test/badblocks/v4/backup/0002-badblocks-add-helper-routines-for-badblock-ranges-ha.patch
@@ -0,0 +1,456 @@
+From d24ea1527077d06b0b579bbf7d1128d94af15d70 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 1 Mar 2021 17:16:57 +0800
+Subject: [PATCH 2/6] badblocks: add helper routines for badblock ranges
+ handling
+
+This patch adds several helper routines to improve badblock ranges
+handling. These helper routines will be used later in the improved
+version of badblocks_set()/badblocks_clear()/badblocks_check().
+
+- Helpers prev_by_hint() and prev_badblocks() are used to find the bad
+  range in the bad table which the search range starts at or after.
+
+- The following helpers are to decide the relative layout between the
+ manipulating range and existing bad block range from bad table.
+ - can_merge_behind()
+ Return 'true' if the manipulating range can backward merge with the
+ bad block range.
+ - can_merge_front()
+ Return 'true' if the manipulating range can forward merge with the
+ bad block range.
+ - can_combine_front()
+ Return 'true' if two adjacent bad block ranges before the
+ manipulating range can be merged.
+ - overlap_front()
+ Return 'true' if the manipulating range exactly overlaps with the
+ bad block range in front of its range.
+ - overlap_behind()
+ Return 'true' if the manipulating range exactly overlaps with the
+ bad block range behind its range.
+ - can_front_overwrite()
+ Return 'true' if the manipulating range can forward overwrite the
+ bad block range in front of its range.
+
+- The following helpers are to add the manipulating range into the bad
+  block table. A different routine is called depending on the specific
+  relative layout between the manipulating range and the other bad block
+  ranges in the bad block table.
+  - behind_merge()
+    Merge the manipulating range with the bad block range behind its
+    range, and return the merged length in unit of sectors.
+  - front_merge()
+    Merge the manipulating range with the bad block range in front of
+    its range, and return the merged length in unit of sectors.
+ - front_combine()
+ Combine the two adjacent bad block ranges before the manipulating
+ range into a larger one.
+ - front_overwrite()
+ Overwrite partial of whole bad block range which is in front of the
+ manipulating range. The overwrite may split existing bad block range
+ and generate more bad block ranges into the bad block table.
+ - insert_at()
+ Insert the manipulating range at a specific location in the bad
+ block table.
+
+All the above helpers are used in later patches to improve the bad block
+ranges handling for badblocks_set()/badblocks_clear()/badblocks_check().
+
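+For illustration only (a user-space sketch, not part of this patch), the
+core of prev_badblocks() is a binary search for the last already set range
+whose start offset is at or before the manipulating start 's':
+
+  #include <stdio.h>
+
+  /* return the index of the last range with offset[i] <= s, or -1 if none */
+  static int prev_range(const unsigned long *offset, int count, unsigned long s)
+  {
+          int lo = 0, hi = count;
+
+          if (!count || offset[0] > s)
+                  return -1;
+
+          while (hi - lo > 1) {
+                  int mid = (lo + hi) / 2;
+
+                  if (offset[mid] <= s)
+                          lo = mid;
+                  else
+                          hi = mid;
+          }
+
+          return lo;
+  }
+
+  int main(void)
+  {
+          unsigned long offset[] = { 8, 32, 64, 128 };
+
+          /* prints 1: the range starting at 32 is the last one at or before 40 */
+          printf("%d\n", prev_range(offset, 4, 40));
+          return 0;
+  }
+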
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ block/badblocks.c | 374 ++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 374 insertions(+)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index d39056630d9c..e85a7cd23aad 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -16,6 +16,380 @@
+ #include <linux/types.h>
+ #include <linux/slab.h>
+
++/*
++ * Find the range that starts at or before 's' in the bad table. The
++ * search starts from index 'hint' and stops at index 'hint_end' of the
++ * bad table.
++ */
++static int prev_by_hint(struct badblocks *bb, sector_t s, int hint)
++{
++ int hint_end = hint + 2;
++ u64 *p = bb->page;
++ int ret = -1;
++
++ while ((hint < hint_end) && ((hint + 1) <= bb->count) &&
++ (BB_OFFSET(p[hint]) <= s)) {
++ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) {
++ ret = hint;
++ break;
++ }
++ hint++;
++ }
++
++ return ret;
++}
++
++/*
++ * Find the range that starts at or before bad->start. If 'hint' is
++ * provided (hint >= 0) then search the bad table from 'hint' first. It is
++ * very likely that the wanted bad range can be found from the hint index,
++ * so the unnecessary while-loop iteration can be avoided.
++ */
++static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad,
++ int hint)
++{
++ sector_t s = bad->start;
++ int ret = -1;
++ int lo, hi;
++ u64 *p;
++
++ if (!bb->count)
++ goto out;
++
++ if (hint >= 0) {
++ ret = prev_by_hint(bb, s, hint);
++ if (ret >= 0)
++ goto out;
++ }
++
++ lo = 0;
++ hi = bb->count;
++ p = bb->page;
++
++ while (hi - lo > 1) {
++ int mid = (lo + hi)/2;
++ sector_t a = BB_OFFSET(p[mid]);
++
++ if (a <= s)
++ lo = mid;
++ else
++ hi = mid;
++ }
++
++ if (BB_OFFSET(p[lo]) <= s)
++ ret = lo;
++out:
++ return ret;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' can be backward merged
++ * with the bad range (from the bad table) indexed by 'behind'.
++ */
++static bool can_merge_behind(struct badblocks *bb, struct badblocks_context *bad,
++ int behind)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ int ack = bad->ack;
++ u64 *p = bb->page;
++
++ if ((s <= BB_OFFSET(p[behind])) &&
++ ((s + sectors) >= BB_OFFSET(p[behind])) &&
++ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) &&
++ BB_ACK(p[behind]) == ack)
++ return true;
++ return false;
++}
++
++/*
++ * Do backward merge for range indicated by 'bad' and the bad range
++ * (from the bad table) indexed by 'behind'. The return value is merged
++ * sectors from bad->len.
++ */
++static int behind_merge(struct badblocks *bb, struct badblocks_context *bad,
++ int behind)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ int ack = bad->ack;
++ u64 *p = bb->page;
++ int merged = 0;
++
++ WARN_ON(s > BB_OFFSET(p[behind]));
++ WARN_ON((s + sectors) < BB_OFFSET(p[behind]));
++
++ if (s < BB_OFFSET(p[behind])) {
++ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN);
++
++ merged = min_t(sector_t, sectors, BB_OFFSET(p[behind]) - s);
++ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, ack);
++ } else {
++ merged = min_t(sector_t, sectors, BB_LEN(p[behind]));
++ }
++
++ WARN_ON(merged == 0);
++
++ return merged;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' can be forward
++ * merged with the bad range (from the bad table) indexed by 'prev'.
++ */
++static bool can_merge_front(struct badblocks *bb, int prev,
++ struct badblocks_context *bad)
++{
++ sector_t s = bad->start;
++ int ack = bad->ack;
++ u64 *p = bb->page;
++
++ if (BB_ACK(p[prev]) == ack &&
++ (s < BB_END(p[prev]) ||
++ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN))))
++ return true;
++ return false;
++}
++
++/*
++ * Do forward merge for range indicated by 'bad' and the bad range
++ * (from bad table) indexed by 'prev'. The return value is sectors
++ * merged from bad->len.
++ */
++static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ int ack = bad->ack;
++ u64 *p = bb->page;
++ int merged = 0;
++
++ WARN_ON(s > BB_END(p[prev]));
++
++ if (s < BB_END(p[prev])) {
++ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s);
++ } else {
++ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev]));
++ if ((prev + 1) < bb->count &&
++ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) {
++ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]);
++ }
++
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ BB_LEN(p[prev]) + merged, ack);
++ }
++
++ return merged;
++}
++
++/*
++ * 'Combine' is a special case that can_merge_front() is not able to
++ * handle: if a bad range (indexed by 'prev' from the bad table) starts
++ * exactly at bad->start, and the bad range ahead of 'prev' (indexed by
++ * 'prev - 1' from the bad table) ends exactly where 'prev' starts, and
++ * the sum of their lengths does not exceed the BB_MAX_LEN limitation, then
++ * these two bad ranges (from the bad table) can be combined.
++ *
++ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad
++ * table can be combined.
++ */
++static bool can_combine_front(struct badblocks *bb, int prev,
++ struct badblocks_context *bad)
++{
++ u64 *p = bb->page;
++
++ if ((prev > 0) &&
++ (BB_OFFSET(p[prev]) == bad->start) &&
++ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) &&
++ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) &&
++ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev])))
++ return true;
++ return false;
++}
++
++/*
++ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad
++ * table) into one larger bad range, and the new range is indexed by
++ * 'prev - 1'.
++ */
++static void front_combine(struct badblocks *bb, int prev)
++{
++ u64 *p = bb->page;
++
++ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]),
++ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]),
++ BB_ACK(p[prev]));
++ if ((prev + 1) < bb->count)
++ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8);
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' is exactly forward
++ * overlapped with the bad range (from bad table) indexed by 'front'.
++ * Exactly forward overlap means the bad range (from the bad table) indexed
++ * by 'front' does not cover the whole range indicated by 'bad'.
++ */
++static bool overlap_front(struct badblocks *bb, int front,
++ struct badblocks_context *bad)
++{
++ u64 *p = bb->page;
++
++ if (bad->start >= BB_OFFSET(p[front]) &&
++ bad->start < BB_END(p[front]))
++ return true;
++ return false;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' is exactly backward
++ * overlapped with the bad range (from bad table) indexed by 'behind'.
++ */
++static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad,
++ int behind)
++{
++ u64 *p = bb->page;
++
++ if (bad->start < BB_OFFSET(p[behind]) &&
++ (bad->start + bad->len) > BB_OFFSET(p[behind]))
++ return true;
++ return false;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' can overwrite the bad
++ * range (from bad table) indexed by 'prev'.
++ *
++ * The range indicated by 'bad' can overwrite the bad range indexed by
++ * 'prev' when,
++ * 1) The whole range indicated by 'bad' can cover partial or whole bad
++ * range (from bad table) indexed by 'prev'.
++ * 2) The ack value of 'bad' is larger or equal to the ack value of bad
++ * range 'prev'.
++ *
++ * If the overwriting doesn't cover the whole bad range (from bad table)
++ * indexed by 'prev', new range might be split from existing bad range,
++ * 1) The overwrite covers head or tail part of existing bad range, 1
++ * extra bad range will be split and added into the bad table.
++ * 2) The overwrite covers middle of existing bad range, 2 extra bad
++ * ranges will be split (ahead and after the overwritten range) and
++ * added into the bad table.
++ * The number of extra split ranges of the overwriting is stored in
++ * 'extra' and returned for the caller.
++ */
++static bool can_front_overwrite(struct badblocks *bb, int prev,
++ struct badblocks_context *bad, int *extra)
++{
++ u64 *p = bb->page;
++ int len;
++
++ WARN_ON(!overlap_front(bb, prev, bad));
++
++ if (BB_ACK(p[prev]) >= bad->ack)
++ return false;
++
++ if (BB_END(p[prev]) <= (bad->start + bad->len)) {
++ len = BB_END(p[prev]) - bad->start;
++ if (BB_OFFSET(p[prev]) == bad->start)
++ *extra = 0;
++ else
++ *extra = 1;
++
++ bad->len = len;
++ } else {
++ if (BB_OFFSET(p[prev]) == bad->start)
++ *extra = 1;
++ else
++ /*
++ * prev range will be split into two; besides the overwritten
++ * one, an extra slot is needed from the bad table.
++ */
++ *extra = 2;
++ }
++
++ if ((bb->count + (*extra)) >= MAX_BADBLOCKS)
++ return false;
++
++ return true;
++}
++
++/*
++ * Do the overwrite from the range indicated by 'bad' to the bad range
++ * (from bad table) indexed by 'prev'.
++ * The previously called can_front_overwrite() will provide how many
++ * extra bad range(s) might be split and added into the bad table. All
++ * the splitting cases in the bad table will be handled here.
++ */
++static int front_overwrite(struct badblocks *bb, int prev,
++ struct badblocks_context *bad, int extra)
++{
++ u64 *p = bb->page;
++ sector_t orig_end = BB_END(p[prev]);
++ int orig_ack = BB_ACK(p[prev]);
++ int n = extra;
++
++ switch (extra) {
++ case 0:
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]),
++ bad->ack);
++ break;
++ case 1:
++ if (BB_OFFSET(p[prev]) == bad->start) {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ bad->len, bad->ack);
++ memmove(p + prev + 2, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(bad->start + bad->len,
++ orig_end - BB_END(p[prev]),
++ orig_ack);
++ } else {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ bad->start - BB_OFFSET(p[prev]),
++ BB_ACK(p[prev]));
++ memmove(p + prev + 1 + n, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
++ }
++ break;
++ case 2:
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ bad->start - BB_OFFSET(p[prev]),
++ BB_ACK(p[prev]));
++ memmove(p + prev + 1 + n, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
++ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]),
++ orig_end - BB_END(p[prev + 1]),
++ BB_ACK(p[prev]));
++ break;
++ default:
++ break;
++ }
++
++ return bad->len;
++}
++
++/*
++ * Explicitly insert a range indicated by 'bad' to the bad table, where
++ * the location is indexed by 'at'.
++ */
++static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ int ack = bad->ack;
++ u64 *p = bb->page;
++ int len;
++
++ WARN_ON(badblocks_full(bb));
++
++ len = min_t(sector_t, sectors, BB_MAX_LEN);
++ if (at < bb->count)
++ memmove(p + at + 1, p + at, (bb->count - at) * 8);
++ p[at] = BB_MAKE(s, len, ack);
++
++ return len;
++}
++
+ /**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+--
+2.31.1
+
diff --git a/for-test/badblocks/v4/backup/0003-badblocks-improvement-badblocks_set-for-multiple-ran.patch b/for-test/badblocks/v4/backup/0003-badblocks-improvement-badblocks_set-for-multiple-ran.patch
new file mode 100644
index 0000000..31a7639
--- /dev/null
+++ b/for-test/badblocks/v4/backup/0003-badblocks-improvement-badblocks_set-for-multiple-ran.patch
@@ -0,0 +1,662 @@
+From b3bbd59d07b131df82410b615ed13a7c439bbd32 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 1 Mar 2021 18:36:09 +0800
+Subject: [PATCH 3/6] badblocks: improvement badblocks_set() for multiple
+ ranges handling
+
+Recently I received a bug report that current badblocks code does not
+properly handle multiple ranges. For example,
+ badblocks_set(bb, 32, 1, true);
+ badblocks_set(bb, 34, 1, true);
+ badblocks_set(bb, 36, 1, true);
+ badblocks_set(bb, 32, 12, true);
+Then indeed badblocks_show() reports,
+ 32 3
+ 36 1
+But the expected bad blocks table should be,
+ 32 12
+Obviously only the first 2 ranges are merged and badblocks_set() returns
+and ignores the rest of the setting range.
+
+This behavior is improper: if the caller of badblocks_set() wants to set
+a range of blocks in the bad blocks table, all of the blocks in the range
+should be handled even if the previous part encounters a failure.
+
+The desired way for badblocks_set() to set a bad blocks range is,
+- Set as many blocks from the setting range as possible into the bad
+  blocks table.
+- Merge the bad blocks ranges and occupy as few slots as possible in the
+  bad blocks table.
+- Be fast.
+
+Indeed the above proposal is complicated, especially with the following
+restrictions,
+- The setting bad blocks range can be acknowledged or not acknowledged.
+- The bad blocks table size is limited.
+- Memory allocation should be avoided.
+
+The basic idea of the patch is to categorize all possible bad blocks
+range setting combinations into a much smaller set of simplified and less
+special conditions. Inside badblocks_set() there is an implicit loop
+composed of jumps between the labels 're_insert' and 'update_sectors'. No
+matter how large the setting bad blocks range is, in every loop iteration
+just a minimized range from the head is handled by a pre-defined behavior
+for one of the categorized conditions. The logic is simple and the code
+flow is manageable.
+
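+For illustration only, a rough user-space sketch of the "handle one
+minimized head piece per loop" idea (it ignores acknowledgement,
+BB_MAX_LEN and table-full handling, so it is not the kernel algorithm),
+using an already set table similar to the report above:
+
+  #include <stdio.h>
+
+  int main(void)
+  {
+          unsigned long existing_start[] = { 32, 36 }; /* 32 +3, 36 +1 */
+          unsigned long existing_len[]   = { 3, 1 };
+          unsigned long s = 32, sectors = 12;          /* set [32, 44) */
+          int i = 0, n = 2;
+
+          while (sectors) {
+                  unsigned long len;
+
+                  if (i < n && s >= existing_start[i]) {
+                          /* head piece overlaps existing range i */
+                          len = existing_start[i] + existing_len[i] - s;
+                          if (len > sectors)
+                                  len = sectors;
+                          printf("merge  [%lu, %lu)\n", s, s + len);
+                          i++;
+                  } else if (i < n && s + sectors > existing_start[i]) {
+                          /* head piece runs up to the next existing range */
+                          len = existing_start[i] - s;
+                          printf("insert [%lu, %lu)\n", s, s + len);
+                  } else {
+                          /* nothing ahead: take the whole remaining piece */
+                          len = sectors;
+                          printf("insert [%lu, %lu)\n", s, s + len);
+                  }
+                  s += len;
+                  sectors -= len;
+          }
+          return 0;
+  }
+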
+The different relative layouts between the setting range and the existing
+bad block ranges are checked and handled (merge, combine, overwrite,
+insert) by the helpers from the previous patch. This patch makes all the
+helpers work together following the above idea.
+
+This patch only contains the algorithm improvement for badblocks_set().
+The following patches contain the improvements for badblocks_clear() and
+badblocks_check(). But the algorithm in badblocks_set() is fundamental
+and typical; the other improvements in the clear and check routines are
+based on all the helpers and ideas in this patch.
+
+In order to make the change clearer for code review, this patch does not
+directly modify the existing badblocks_set(), and just adds a new routine
+named _badblocks_set(). A later patch will remove the existing
+badblocks_set() code and make it a wrapper of _badblocks_set(). So the
+newly added change won't be mixed with deleted code, and the code review
+can be easier.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ block/badblocks.c | 561 ++++++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 541 insertions(+), 20 deletions(-)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index e85a7cd23aad..95dceed0da3c 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -16,6 +16,322 @@
+ #include <linux/types.h>
+ #include <linux/slab.h>
+
++/*
++ * The purpose of badblocks set/clear is to manage bad blocks ranges which are
++ * identified by LBA addresses.
++ *
++ * When the caller of badblocks_set() wants to set a range of bad blocks, the
++ * setting range can be acked or unacked. And the setting range may merge,
++ * overwrite, or skip the overlapped already set range, depending on how they
++ * are overlapped or adjacent, and the acknowledgment type of the ranges. It can
++ * be more complicated when the setting range covers multiple already set bad
++ * block ranges, with restrictions of the maximum length of each bad range and the bad
++ * table space limitation.
++ *
++ * It is difficult and unnecessary to take care of all the possible situations.
++ * For setting a large range of bad blocks, we can handle it by dividing the
++ * large range into smaller ones when encountering overlap, max range length or
++ * bad table full conditions. Every time only a smaller piece of the bad range
++ * is handled, with a limited number of conditions on how it interacts with
++ * possibly overlapped or adjacent already set bad block ranges. Then the hard,
++ * complicated problem can be much simpler to handle in a proper way.
++ *
++ * When setting a range of bad blocks to the bad table, the simplified situations
++ * to be considered are, (The already set bad blocks ranges are named with
++ * prefix E, and the setting bad blocks range is named with prefix S)
++ *
++ * 1) A setting range is not overlapped or adjacent to any other already set bad
++ * block range.
++ * +--------+
++ * | S |
++ * +--------+
++ * +-------------+ +-------------+
++ * | E1 | | E2 |
++ * +-------------+ +-------------+
++ * For this situation if the bad blocks table is not full, just allocate a
++ * free slot from the bad blocks table to mark the setting range S. The
++ * result is,
++ * +-------------+ +--------+ +-------------+
++ * | E1 | | S | | E2 |
++ * +-------------+ +--------+ +-------------+
++ * 2) A setting range starts exactly at a start LBA of an already set bad blocks
++ * range.
++ * 2.1) The setting range size < already set range size
++ * +--------+
++ * | S |
++ * +--------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 2.1.1) If S and E are both acked or unacked range, the setting range S can
++ * be merged into existing bad range E. The result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 2.1.2) If S is an unacked setting and E is acked, the setting will be denied, and
++ * the result is,
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 2.1.3) If S is an acked setting and E is unacked, range S can overwrite the
++ * head of E. An extra slot from the bad blocks table will be allocated for S,
++ * and the head of E will move to the end of the inserted range S. The result is,
++ * +--------+----+
++ * | S | E |
++ * +--------+----+
++ * 2.2) The setting range size == already set range size
++ * 2.2.1) If S and E are both acked or unacked range, the setting range S can
++ * be merged into existing bad range E. The result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 2.2.2) If S is an unacked setting and E is acked, the setting will be denied, and
++ * the result is,
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 2.2.3) If S is an acked setting and E is unacked, range S can overwrite all
++ * of bad blocks range E. The result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 2.3) The setting range size > already set range size
++ * +-------------------+
++ * | S |
++ * +-------------------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * For such a situation, the setting range S can be treated as two parts: the
++ * first part (S1) has the same size as the already set range E, the second
++ * part (S2) is the rest of the setting range.
++ * +-------------+-----+ +-------------+ +-----+
++ * | S1 | S2 | | S1 | | S2 |
++ * +-------------+-----+ ===> +-------------+ +-----+
++ * +-------------+ +-------------+
++ * | E | | E |
++ * +-------------+ +-------------+
++ * Now we only focus on how to handle the setting range S1 and the already set
++ * range E, which is already explained in 2.2); the rest part S2 will be
++ * handled later in the next loop.
++ * 3) A setting range starts before the start LBA of an already set bad blocks
++ * range.
++ * +-------------+
++ * | S |
++ * +-------------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * For this situation, the setting range S can be divided into two parts, the
++ * first (S1) ends at the start LBA of the already set range E, the second part
++ * (S2) starts exactly at the start LBA of the already set range E.
++ * +----+---------+ +----+ +---------+
++ * | S1 | S2 | | S1 | | S2 |
++ * +----+---------+ ===> +----+ +---------+
++ * +-------------+ +-------------+
++ * | E | | E |
++ * +-------------+ +-------------+
++ * Now only the first part S1 should be handled in this loop, which is in a
++ * similar condition as 1). The rest part S2 has exactly the same start LBA
++ * as the already set range E, and it will be handled in the next loop in one
++ * of the situations in 2).
++ * 4) A setting range starts after the start LBA of an already set bad blocks
++ * range.
++ * 4.1) If the setting range S exactly matches the tail part of already set bad
++ * blocks range E, like the following chart shows,
++ * +---------+
++ * | S |
++ * +---------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 4.1.1) If ranges S and E have the same acknowledge value (both acked or
++ * unacked), they will be merged into one, the result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 4.1.2) If range E is acked and the setting range S is unacked, the setting
++ * request of S will be rejected, the result is,
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may
++ * overwrite the overlapped range of E, the result is,
++ * +---+---------+
++ * | E | S |
++ * +---+---------+
++ * 4.2) If the setting range S stays in middle of an already set range E, like
++ * the following chart shows,
++ * +----+
++ * | S |
++ * +----+
++ * +--------------+
++ * | E |
++ * +--------------+
++ * 4.2.1) If ranges S and E have the same acknowledge value (both acked or
++ * unacked), they will be merged into one, the result is,
++ * +--------------+
++ * | S |
++ * +--------------+
++ * 4.2.2) If range E is acked and the setting range S is unacked, the setting
++ * request of S will be rejected, the result is also,
++ * +--------------+
++ * | E |
++ * +--------------+
++ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will
++ * be inserted into the middle of E and split the previous range E into two
++ * parts (E1 and E2), the result is,
++ * +----+----+----+
++ * | E1 | S | E2 |
++ * +----+----+----+
++ * 4.3) If the setting bad blocks range S is overlapped with an already set bad
++ * blocks range E. The range S starts after the start LBA of range E, and
++ * ends after the end LBA of range E, as the following chart shows,
++ * +-------------------+
++ * | S |
++ * +-------------------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * For this situation the range S can be divided into two parts, the first
++ * part (S1) ends at the end of range E, and the second part (S2) is the rest
++ * of the original S.
++ * +---------+---------+ +---------+ +---------+
++ * | S1 | S2 | | S1 | | S2 |
++ * +---------+---------+ ===> +---------+ +---------+
++ * +-------------+ +-------------+
++ * | E | | E |
++ * +-------------+ +-------------+
++ * Now in this loop the setting range S1 and the already set range E can be
++ * handled as in situation 4.1), and the rest range S2 will be handled in the
++ * next loop and ignored in this loop.
++ * 5) A setting bad blocks range S is adjacent to one or more already set bad
++ * blocks range(s), and they are all acked or unacked ranges.
++ * 5.1) Front merge: If the already set bad blocks range E is before setting
++ * range S and they are adjacent,
++ * +------+
++ * | S |
++ * +------+
++ * +-------+
++ * | E |
++ * +-------+
++ * 5.1.1) When the total size of ranges S and E <= BB_MAX_LEN, and their
++ * acknowledge values are the same, the setting range S can front merge into
++ * range E. The result is,
++ * +--------------+
++ * | S |
++ * +--------------+
++ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting
++ * range S right after already set range E into the bad blocks table. The
++ * result is,
++ * +--------+------+
++ * | E | S |
++ * +--------+------+
++ * 6) Special cases which above conditions cannot handle
++ * 6.1) Multiple already set ranges may merge into fewer ones in a full bad table
++ * +-------------------------------------------------------+
++ * | S |
++ * +-------------------------------------------------------+
++ * |<----- BB_MAX_LEN ----->|
++ * +-----+ +-----+ +-----+
++ * | E1 | | E2 | | E3 |
++ * +-----+ +-----+ +-----+
++ * In the above example, when the bad blocks table is full, inserting the
++ * first part of the setting range S will fail because no more available slot
++ * can be allocated from the bad blocks table. In this situation a proper
++ * setting method should go through all of the setting bad blocks range and
++ * look for chances to merge already set ranges into fewer ones. When there
++ * is an available slot in the bad blocks table, retry to handle as many of
++ * the setting bad blocks ranges as possible.
++ * +------------------------+
++ * | S3 |
++ * +------------------------+
++ * |<----- BB_MAX_LEN ----->|
++ * +-----+-----+-----+---+-----+--+
++ * | S1 | S2 |
++ * +-----+-----+-----+---+-----+--+
++ * The above chart shows that although the first part (S3) cannot be inserted
++ * due to no space in the bad blocks table, the following E1, E2 and E3 ranges
++ * can be merged with the rest of S into fewer ranges S1 and S2. Now there is
++ * 1 free slot in the bad blocks table.
++ * +------------------------+-----+-----+-----+---+-----+--+
++ * | S3 | S1 | S2 |
++ * +------------------------+-----+-----+-----+---+-----+--+
++ * Since the bad blocks table is not full anymore, retry the original
++ * setting range S. Now the setting range S3 can be inserted into the
++ * bad blocks table with the slot freed by the previous multiple-range merge.
++ * 6.2) Front merge after overwrite
++ * In the following example, in bad blocks table, E1 is an acked bad blocks
++ * range and E2 is an unacked bad blocks range, therefore they are not able
++ * to merge into a larger range. The setting bad blocks range S is acked,
++ * therefore part of E2 can be overwritten by S.
++ * +--------+
++ * | S | acknowledged
++ * +--------+ S: 1
++ * +-------+-------------+ E1: 1
++ * | E1 | E2 | E2: 0
++ * +-------+-------------+
++ * With the previous simplified routines, after overwriting part of E2 with S,
++ * the bad blocks table should be (E3 is the remaining part of E2 which is not
++ * overwritten by S),
++ * acknowledged
++ * +-------+--------+----+ S: 1
++ * | E1 | S | E3 | E1: 1
++ * +-------+--------+----+ E3: 0
++ * The above result is correct but not perfect. Ranges E1 and S in the bad
++ * blocks table are both acked; merging them into one larger range may
++ * occupy less bad blocks table space and make badblocks_check() faster.
++ * Therefore in such a situation, after overwriting range S, the previous
++ * range E1 should be checked for a possible front combination. Then the
++ * ideal result can be,
++ * +----------------+----+ acknowledged
++ * | E1 | E3 | E1: 1
++ * +----------------+----+ E3: 0
++ * 6.3) Behind merge: the already set bad blocks range E is behind the setting
++ * range S and they are adjacent. Normally we don't need to care about this
++ * because the front merge handles it while going through range S from head to
++ * tail, except for the tail part of range S. When the setting range S is
++ * fully handled, none of the above simplified routines checks whether the
++ * tail LBA of range S is adjacent to the next already set range, so they are
++ * not merged even if they are mergeable.
++ * +------+
++ * | S |
++ * +------+
++ * +-------+
++ * | E |
++ * +-------+
++ * For the above special situation, when the setting range S is fully handled
++ * and the loop ends, an extra check is necessary for whether the next already
++ * set range E is right after S and mergeable.
++ * 6.3.1) When the total size of ranges E and S <= BB_MAX_LEN, and their
++ * acknowledge values are the same, the setting range S can behind merge into
++ * range E. The result is,
++ * +--------------+
++ * | S |
++ * +--------------+
++ * 6.3.2) Otherwise these two ranges cannot be merged; just insert the setting
++ * range S in front of the already set range E in the bad blocks table. The
++ * result is,
++ * +------+-------+
++ * | S | E |
++ * +------+-------+
++ *
++ * All the above 5 simplified situations and 3 special cases may cover 99%+ of
++ * the bad block range setting conditions. Maybe some rare corner case is not
++ * considered and optimized, but it won't hurt if badblocks_set() fails due
++ * to no space, or if some ranges are not merged to save bad blocks table space.
++ *
++ * Inside badblocks_set() each loop starts by jumping to the re_insert label;
++ * every time in the new loop prev_badblocks() is called to find an already set
++ * range which starts before or at the current setting range. Since the setting
++ * bad blocks range is handled from head to tail, in most cases it is
++ * unnecessary to do the binary search inside prev_badblocks(); it is possible
++ * to provide a hint to prev_badblocks() for a fast path, then the expensive
++ * binary search can be avoided. In my test with the hint to prev_badblocks(),
++ * except for the first loop, all remaining calls to prev_badblocks() can go
++ * into the fast path and return the correct bad blocks table index immediately.
++ */
++
+ /*
+ * Find the range starts at-or-before 's' from bad table. The search
+ * starts from index 'hint' and stops at index 'hint_end' from the bad
+@@ -390,6 +706,231 @@ static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad
+ return len;
+ }
+
++static void badblocks_update_acked(struct badblocks *bb)
++{
++ u64 *p = bb->page;
++ int i;
++ bool unacked = false;
++
++ if (!bb->unacked_exist)
++ return;
++
++ for (i = 0; i < bb->count ; i++) {
++ if (!BB_ACK(p[i])) {
++ unacked = true;
++ break;
++ }
++ }
++
++ if (!unacked)
++ bb->unacked_exist = 0;
++}
++
++/* Do the exact work to set the bad block range into the bad block table */
++static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
++ int acknowledged)
++{
++ u64 *p;
++ struct badblocks_context bad;
++ int prev = -1, hint = -1;
++ int len = 0, added = 0;
++ int retried = 0, space_desired = 0;
++ int rv = 0;
++ unsigned long flags;
++
++ if (bb->shift < 0)
++ /* badblocks are disabled */
++ return 1;
++
++ if (sectors == 0)
++ /* Invalid sectors number */
++ return 1;
++
++ if (bb->shift) {
++ /* round the start down, and the end up */
++ sector_t next = s + sectors;
++
++ rounddown(s, bb->shift);
++ roundup(next, bb->shift);
++ sectors = next - s;
++ }
++
++ write_seqlock_irqsave(&bb->lock, flags);
++
++ bad.orig_start = s;
++ bad.orig_len = sectors;
++ bad.ack = acknowledged;
++ p = bb->page;
++
++re_insert:
++ bad.start = s;
++ bad.len = sectors;
++ len = 0;
++
++ if (badblocks_empty(bb)) {
++ len = insert_at(bb, 0, &bad);
++ bb->count++;
++ added++;
++ goto update_sectors;
++ }
++
++ prev = prev_badblocks(bb, &bad, hint);
++
++ /* start before all badblocks */
++ if (prev < 0) {
++ if (!badblocks_full(bb)) {
++ /* insert on the first */
++ if (bad.len > (BB_OFFSET(p[0]) - bad.start))
++ bad.len = BB_OFFSET(p[0]) - bad.start;
++ len = insert_at(bb, 0, &bad);
++ bb->count++;
++ added++;
++ hint = 0;
++ goto update_sectors;
++ }
++
++		/* No space, try to merge */
++ if (overlap_behind(bb, &bad, 0)) {
++ if (can_merge_behind(bb, &bad, 0)) {
++ len = behind_merge(bb, &bad, 0);
++ added++;
++ } else {
++ len = min_t(sector_t,
++ BB_OFFSET(p[0]) - s, sectors);
++ space_desired = 1;
++ }
++ hint = 0;
++ goto update_sectors;
++ }
++
++ /* no table space and give up */
++ goto out;
++ }
++
++ /* in case p[prev-1] can be merged with p[prev] */
++ if (can_combine_front(bb, prev, &bad)) {
++ front_combine(bb, prev);
++ bb->count--;
++ added++;
++ hint = prev;
++ goto update_sectors;
++ }
++
++ if (overlap_front(bb, prev, &bad)) {
++ if (can_merge_front(bb, prev, &bad)) {
++ len = front_merge(bb, prev, &bad);
++ added++;
++ hint = prev;
++ } else {
++ int extra = 0;
++
++ if (!can_front_overwrite(bb, prev, &bad, &extra)) {
++ len = min_t(sector_t,
++ BB_END(p[prev]) - s, sectors);
++ hint = prev;
++ goto update_sectors;
++ }
++
++ len = front_overwrite(bb, prev, &bad, extra);
++ added++;
++ bb->count += extra;
++ hint = prev;
++
++ if (can_combine_front(bb, prev, &bad)) {
++ front_combine(bb, prev);
++ bb->count--;
++ hint = prev - 1;
++ }
++ }
++ goto update_sectors;
++ }
++
++ if (can_merge_front(bb, prev, &bad)) {
++ len = front_merge(bb, prev, &bad);
++ added++;
++ hint = prev;
++ goto update_sectors;
++ }
++
++ /* if no space in table, still try to merge in the covered range */
++ if (badblocks_full(bb)) {
++ /* skip the cannot-merge range */
++ if (((prev + 1) < bb->count) &&
++ overlap_behind(bb, &bad, prev + 1) &&
++ ((s + sectors) >= BB_END(p[prev + 1]))) {
++ len = BB_END(p[prev + 1]) - s;
++ hint = prev + 1;
++ goto update_sectors;
++ }
++
++ /* no retry any more */
++ len = sectors;
++ space_desired = 1;
++ hint = -1;
++ goto update_sectors;
++ }
++
++ /* cannot merge and there is space in bad table */
++ if ((prev + 1) < bb->count &&
++ overlap_behind(bb, &bad, prev + 1))
++ bad.len = min_t(sector_t,
++ bad.len, BB_OFFSET(p[prev + 1]) - bad.start);
++
++ len = insert_at(bb, prev + 1, &bad);
++ bb->count++;
++ added++;
++ hint = prev + 1;
++
++update_sectors:
++ s += len;
++ sectors -= len;
++
++ if (sectors > 0)
++ goto re_insert;
++
++ WARN_ON(sectors < 0);
++
++ /* Check whether the following already set range can be merged */
++ if ((prev + 1) < bb->count &&
++ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
++ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
++ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
++ BB_ACK(p[prev]));
++
++ if ((prev + 2) < bb->count)
++ memmove(p + prev + 1, p + prev + 2,
++ (bb->count - (prev + 2)) * 8);
++ bb->count--;
++ }
++
++ if (space_desired && !badblocks_full(bb)) {
++ s = bad.orig_start;
++ sectors = bad.orig_len;
++ space_desired = 0;
++ if (retried++ < 3)
++ goto re_insert;
++ }
++
++out:
++ if (added) {
++ set_changed(bb);
++
++ if (!acknowledged)
++ bb->unacked_exist = 1;
++ else
++ badblocks_update_acked(bb);
++ }
++
++ write_sequnlock_irqrestore(&bb->lock, flags);
++
++ if (!added)
++ rv = 1;
++
++ return rv;
++}
++
+ /**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+@@ -499,26 +1040,6 @@ int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ }
+ EXPORT_SYMBOL_GPL(badblocks_check);
+
+-static void badblocks_update_acked(struct badblocks *bb)
+-{
+- u64 *p = bb->page;
+- int i;
+- bool unacked = false;
+-
+- if (!bb->unacked_exist)
+- return;
+-
+- for (i = 0; i < bb->count ; i++) {
+- if (!BB_ACK(p[i])) {
+- unacked = true;
+- break;
+- }
+- }
+-
+- if (!unacked)
+- bb->unacked_exist = 0;
+-}
+-
+ /**
+ * badblocks_set() - Add a range of bad blocks to the table.
+ * @bb: the badblocks structure that holds all badblock information
+--
+2.31.1
+
diff --git a/for-test/badblocks/v4/backup/0004-badblocks-improve-badblocks_clear-for-multiple-range.patch b/for-test/badblocks/v4/backup/0004-badblocks-improve-badblocks_clear-for-multiple-range.patch
new file mode 100644
index 0000000..4cbfd5e
--- /dev/null
+++ b/for-test/badblocks/v4/backup/0004-badblocks-improve-badblocks_clear-for-multiple-range.patch
@@ -0,0 +1,401 @@
+From b75e0792f127a99f068d635421ffac52843b488c Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 1 Mar 2021 22:16:10 +0800
+Subject: [PATCH 4/6] badblocks: improve badblocks_clear() for multiple ranges
+ handling
+
+With the fundamental ideas and helper routines from the badblocks_set()
+improvement, clearing bad blocks for multiple ranges is much simpler.
+
+With a similar idea as the badblocks_set() improvement, this patch
+simplifies bad block range clearing into 5 situations. No matter how
+complicated the clearing condition is, we just look at the head part
+of the clearing range together with the related already set bad block
+range from the bad block table. The rest will be handled in the next
+run of the while-loop.
+
+Based on existing helpers added from badblocks_set(), this patch adds
+two more helpers,
+- front_clear()
+ Clear the bad block range from bad block table which is front
+ overlapped with the clearing range.
+- front_splitting_clear()
+ Handle the condition that the clearing range hits middle of an
+ already set bad block range from bad block table.
+
+Similar to badblocks_set(), the first part of the clearing range is
+handled with the related bad block range which is found by
+prev_badblocks(). In most cases a valid hint is provided to
+prev_badblocks() to avoid unnecessary bad block table iteration.
+
+This patch also explains the detailed algorithm in code comments at the
+beginning of badblocks.c, including which five simplified situations are
+categorized and how all the bad block range clearing conditions are
+handled by these five situations.
+
+Again, in order to make the code review easier and avoid mixing the code
+changes together, this patch does not modify badblocks_clear() and
+implements another routine called _badblocks_clear() for the improvement.
+A later patch will delete the current code of badblocks_clear() and make
+it a wrapper to _badblocks_clear(), so the code change can be much
+clearer for review.
+
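+As a hedged illustration of the splitting case only (the sector numbers
+and the direct calls below are made up for demonstration; in the kernel
+these routines are called by the md and nvdimm drivers):
+
+	/* one acked bad block range covering sectors [100, 200) */
+	badblocks_set(bb, 100, 100, 1);
+	/* clearing [140, 160) hits the middle of that range ... */
+	badblocks_clear(bb, 140, 20);
+	/* ... so front_splitting_clear() leaves two records behind,
+	 * [100, 140) and [160, 200), and bb->count grows by one
+	 */
+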
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ block/badblocks.c | 327 ++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 327 insertions(+)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index 95dceed0da3c..b9a4cd64b840 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -330,6 +330,123 @@
+ * avoided. In my test with the hint to prev_badblocks(), except for the first
+ * loop, all rested calls to prev_badblocks() can go into the fast path and
+ * return correct bad blocks table index immediately.
++ *
++ *
++ * Clearing a bad blocks range from the bad block table has a similar idea as
++ * setting does, but is much simpler. The only thing that needs to be noticed
++ * is that when the clearing range hits the middle of a bad block range, the
++ * existing bad block range will split into two, and one more item should be
++ * added into the bad block table. The simplified situations to be considered
++ * are, (The already set bad blocks ranges in the bad block table are named
++ * with prefix E, and the clearing bad blocks range is named with prefix C)
++ *
++ * 1) A clearing range is not overlapped with any already set range in the bad
++ * block table.
++ * +-----+ | +-----+ | +-----+
++ * | C | | | C | | | C |
++ * +-----+ or +-----+ or +-----+
++ * +---+ | +----+ +----+ | +---+
++ * | E | | | E1 | | E2 | | | E |
++ * +---+ | +----+ +----+ | +---+
++ * For the above situations, no bad block is cleared and no failure
++ * happens, simply return 0.
++ * 2) The clearing range hits the middle of an already set bad blocks range in
++ * the bad block table.
++ * +---+
++ * | C |
++ * +---+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ * In this situation if the bad block table is not full, the range E will be
++ * split into two ranges E1 and E2. The result is,
++ * +------+ +------+
++ * | E1 | | E2 |
++ * +------+ +------+
++ * 3) The clearing range starts exactly at the same LBA as an already set bad
++ * block range from the bad block table.
++ * 3.1) Partially covered at head part
++ * +------------+
++ * | C |
++ * +------------+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ * For this situation, the overlapped already set range will update its
++ * start LBA to the end of C and shrink its length to BB_LEN(E) - BB_LEN(C).
++ * No item is deleted from the bad block table. The result is,
++ * +----+
++ * | E1 |
++ * +----+
++ * 3.2) Exact fully covered
++ * +-----------------+
++ * | C |
++ * +-----------------+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ * For this situation the whole bad blocks range E will be cleared and its
++ * corresponding item is deleted from the bad block table.
++ * 4) The clearing range exactly ends at the same LBA as an already set bad
++ * block range.
++ * +-------+
++ * | C |
++ * +-------+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ * For the above situation, the already set range E is updated to shrink its
++ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C).
++ * The result is,
++ * +---------+
++ * | E |
++ * +---------+
++ * 5) The clearing range is partially overlapped with an already set bad block
++ * range from the bad block table.
++ * 5.1) The already set bad block range is front overlapped with the clearing
++ * range.
++ * +----------+
++ * | C |
++ * +----------+
++ * +------------+
++ * | E |
++ * +------------+
++ * For such a situation, the clearing range C can be treated as two parts. The
++ * first part ends at the start LBA of range E, and the second part starts at
++ * the same LBA as range E.
++ * +----+-----+ +----+ +-----+
++ * | C1 | C2 | | C1 | | C2 |
++ * +----+-----+ ===> +----+ +-----+
++ * +------------+ +------------+
++ * | E | | E |
++ * +------------+ +------------+
++ * Now the first part C1 can be handled as condition 1), and the second part
++ * C2 can be handled as condition 3.1) in the next loop.
++ * 5.2) The already set bad block range is behind overlapped with the clearing
++ * range.
++ * +----------+
++ * | C |
++ * +----------+
++ * +------------+
++ * | E |
++ * +------------+
++ * For such a situation, the clearing range C can be treated as two parts. The
++ * first part C1 ends at the same end LBA as range E, and the second part
++ * starts at the end LBA of range E.
++ * +----+-----+ +----+ +-----+
++ * | C1 | C2 | | C1 | | C2 |
++ * +----+-----+ ===> +----+ +-----+
++ * +------------+ +------------+
++ * | E | | E |
++ * +------------+ +------------+
++ * Now the first part clearing range C1 can be handled as condition 4), and
++ * the second part clearing range C2 can be handled as condition 1) in the
++ * next loop.
++ *
++ * All bad blocks range clearing can be simplified into the above 5 situations
++ * by only handling the head part of the clearing range in each run of the
++ * while-loop. The idea is similar to bad blocks range setting but much
++ * simpler.
+ */
+
+ /*
+@@ -931,6 +1048,216 @@ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ return rv;
+ }
+
++/*
++ * Clear the bad block range from bad block table which is front overlapped
++ * with the clearing range. The return value is how many sectors from an
++ * already set bad block range are cleared. If the whole bad block range is
++ * covered by the clearing range and fully cleared, 'deleted' is set as 1 for
++ * the caller to reduce bb->count.
++ */
++static int front_clear(struct badblocks *bb, int prev,
++ struct badblocks_context *bad, int *deleted)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ u64 *p = bb->page;
++ int cleared = 0;
++
++ *deleted = 0;
++ if (s == BB_OFFSET(p[prev])) {
++ if (BB_LEN(p[prev]) > sectors) {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors,
++ BB_LEN(p[prev]) - sectors,
++ BB_ACK(p[prev]));
++ cleared = sectors;
++ } else {
++ /* BB_LEN(p[prev]) <= sectors */
++ cleared = BB_LEN(p[prev]);
++ if ((prev + 1) < bb->count)
++ memmove(p + prev, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ *deleted = 1;
++ }
++ } else if (s > BB_OFFSET(p[prev])) {
++ if (BB_END(p[prev]) <= (s + sectors)) {
++ cleared = BB_END(p[prev]) - s;
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ s - BB_OFFSET(p[prev]),
++ BB_ACK(p[prev]));
++ } else {
++ /* Splitting is handled in front_splitting_clear() */
++ BUG();
++ }
++ }
++
++ return cleared;
++}
++
++/*
++ * Handle the condition that the clearing range hits the middle of an already
++ * set bad block range from the bad block table. In this case the existing bad
++ * block range is split into two after the middle part is cleared.
++ */
++static int front_splitting_clear(struct badblocks *bb, int prev,
++ struct badblocks_context *bad)
++{
++ u64 *p = bb->page;
++ u64 end = BB_END(p[prev]);
++ int ack = BB_ACK(p[prev]);
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ s - BB_OFFSET(p[prev]),
++ ack);
++ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack);
++ return sectors;
++}
++
++/* Do the exact work to clear bad block range from the bad block table */
++static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
++{
++ struct badblocks_context bad;
++ int prev = -1, hint = -1;
++ int len = 0, cleared = 0;
++ int rv = 0;
++ u64 *p;
++
++ if (bb->shift < 0)
++ /* badblocks are disabled */
++ return 1;
++
++ if (sectors == 0)
++ /* Invalid sectors number */
++ return 1;
++
++ if (bb->shift) {
++ sector_t target;
++
++ /* When clearing we round the start up and the end down.
++ * This should not matter as the shift should align with
++ * the block size and no rounding should ever be needed.
++		 * However it is better to think a block is bad when it
++ * isn't than to think a block is not bad when it is.
++ */
++ target = s + sectors;
++ roundup(s, bb->shift);
++ rounddown(target, bb->shift);
++ sectors = target - s;
++ }
++
++ write_seqlock_irq(&bb->lock);
++
++ bad.orig_start = s;
++ bad.orig_len = sectors;
++ bad.ack = true;
++ p = bb->page;
++
++re_clear:
++ bad.start = s;
++ bad.len = sectors;
++
++ if (badblocks_empty(bb)) {
++ len = sectors;
++ cleared++;
++ goto update_sectors;
++ }
++
++
++ prev = prev_badblocks(bb, &bad, hint);
++
++ /* Start before all badblocks */
++ if (prev < 0) {
++ if (overlap_behind(bb, &bad, 0)) {
++ len = BB_OFFSET(p[0]) - s;
++ hint = prev;
++ } else {
++ len = sectors;
++ }
++ /*
++		 * Both situations clear a non-bad range and
++		 * should be treated as successful
++ */
++ cleared++;
++ goto update_sectors;
++ }
++
++ /* Start after all badblocks */
++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
++ len = sectors;
++ cleared++;
++ goto update_sectors;
++ }
++
++ /* Clear will split a bad record but the table is full */
++ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) &&
++ (BB_END(p[prev]) > (bad.start + sectors))) {
++ len = sectors;
++ goto update_sectors;
++ }
++
++ if (overlap_front(bb, prev, &bad)) {
++ if ((BB_OFFSET(p[prev]) < bad.start) &&
++ (BB_END(p[prev]) > (bad.start + bad.len))) {
++ /* Splitting */
++ if ((bb->count + 1) < MAX_BADBLOCKS) {
++ len = front_splitting_clear(bb, prev, &bad);
++ bb->count += 1;
++ cleared++;
++ } else {
++ /* No space to split, give up */
++ len = sectors;
++ }
++ } else {
++ int deleted = 0;
++
++ len = front_clear(bb, prev, &bad, &deleted);
++ bb->count -= deleted;
++ cleared++;
++ hint = prev;
++ }
++
++ goto update_sectors;
++ }
++
++ /* Not front overlap, but behind overlap */
++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
++ len = BB_OFFSET(p[prev + 1]) - bad.start;
++ hint = prev + 1;
++		/* Clearing a non-bad range should be treated as successful */
++ cleared++;
++ goto update_sectors;
++ }
++
++	/* Does not cover any badblocks range in the table */
++	len = sectors;
++	/* Clearing a non-bad range should be treated as successful */
++ cleared++;
++
++update_sectors:
++ s += len;
++ sectors -= len;
++
++ if (sectors > 0)
++ goto re_clear;
++
++ WARN_ON(sectors < 0);
++
++ if (cleared) {
++ badblocks_update_acked(bb);
++ set_changed(bb);
++ }
++
++ write_sequnlock_irq(&bb->lock);
++
++ if (!cleared)
++ rv = 1;
++
++ return rv;
++}
++
++
+ /**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+--
+2.31.1
+
diff --git a/for-test/badblocks/v4/backup/0005-badblocks-improve-badblocks_check-for-multiple-range.patch b/for-test/badblocks/v4/backup/0005-badblocks-improve-badblocks_check-for-multiple-range.patch
new file mode 100644
index 0000000..6be1249
--- /dev/null
+++ b/for-test/badblocks/v4/backup/0005-badblocks-improve-badblocks_check-for-multiple-range.patch
@@ -0,0 +1,177 @@
+From 09092ea11f2a8d319ac57865031190f153d159ae Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 2 Mar 2021 09:27:06 +0800
+Subject: [PATCH 5/6] badblocks: improve badblocks_check() for multiple ranges
+ handling
+
+This patch rewrites badblocks_check() with a similar coding style as
+_badblocks_set() and _badblocks_clear(). The only difference is that bad
+blocks checking may handle multiple ranges in the bad table now.
+
+If a checking range covers multiple bad block ranges in the bad block
+table, like the following condition (C is the checking range, E1, E2, E3
+are three bad block ranges in the bad block table),
+ +------------------------------------+
+ | C |
+ +------------------------------------+
+ +----+ +----+ +----+
+ | E1 | | E2 | | E3 |
+ +----+ +----+ +----+
+The improved badblocks_check() algorithm will divide the checking range
+C into multiple parts, and handle them in 7 runs of a while-loop,
+ +--+ +----+ +----+ +----+ +----+ +----+ +----+
+ |C1| | C2 | | C3 | | C4 | | C5 | | C6 | | C7 |
+ +--+ +----+ +----+ +----+ +----+ +----+ +----+
+ +----+ +----+ +----+
+ | E1 | | E2 | | E3 |
+ +----+ +----+ +----+
+And the start LBA and length of range E1 will be set as first_bad and
+bad_sectors for the caller.
+
+The return value rule is consistent for multiple ranges. For example if
+there are the following bad block ranges in the bad block table,
+ Index No. Start Len Ack
+ 0 400 20 1
+ 1 500 50 1
+ 2 650 20 0
+the return value, first_bad and bad_sectors returned by calling
+badblocks_check() with different checking ranges can be the following values,
+ Checking Start, Len Return Value first_bad bad_sectors
+ 100, 100 0 N/A N/A
+ 100, 310 1 400 10
+ 100, 440 1 400 10
+ 100, 540 1 400 10
+ 100, 600 -1 400 10
+ 100, 800 -1 400 10
+
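+As a rough caller-side sketch of how the return value is meant to be
+interpreted (the calling context and variable names are made up for
+illustration):
+
+	sector_t first_bad;
+	int bad_sectors;
+
+	switch (badblocks_check(bb, s, sectors, &first_bad, &bad_sectors)) {
+	case 0:		/* no bad block overlaps the checking range */
+		break;
+	case 1:		/* overlapping bad blocks exist, all acknowledged */
+		break;
+	case -1:	/* at least one overlapping unacknowledged bad block */
+		break;
+	}
+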
+In order to make code review easier, this patch names the improved bad
+block range checking routine as _badblocks_check() and does not change
+the existing badblocks_check() code yet. A later patch will delete the
+old code of badblocks_check() and make it a wrapper to call
+_badblocks_check(). Then the newly added code won't be mixed up with the
+old deleted code, and it will be clearer and easier for code review.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ block/badblocks.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 99 insertions(+)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index b9a4cd64b840..5a1ac35b924a 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -1257,6 +1257,105 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+ return rv;
+ }
+
++/* Do the exact work to check bad blocks range from the bad block table */
++static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
++ sector_t *first_bad, int *bad_sectors)
++{
++ int unacked_badblocks, acked_badblocks;
++ int prev = -1, hint = -1, set = 0;
++ struct badblocks_context bad;
++ unsigned int seq;
++ int len, rv;
++ u64 *p;
++
++ WARN_ON(bb->shift < 0 || sectors == 0);
++
++ if (bb->shift > 0) {
++ sector_t target;
++
++ /* round the start down, and the end up */
++ target = s + sectors;
++ rounddown(s, bb->shift);
++ roundup(target, bb->shift);
++ sectors = target - s;
++ }
++
++retry:
++ seq = read_seqbegin(&bb->lock);
++
++ bad.orig_start = s;
++ bad.orig_len = sectors;
++ p = bb->page;
++ unacked_badblocks = 0;
++ acked_badblocks = 0;
++
++re_check:
++ bad.start = s;
++ bad.len = sectors;
++
++ if (badblocks_empty(bb)) {
++ len = sectors;
++ goto update_sectors;
++ }
++
++ prev = prev_badblocks(bb, &bad, hint);
++
++ /* start after all badblocks */
++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
++ len = sectors;
++ goto update_sectors;
++ }
++
++ if (overlap_front(bb, prev, &bad)) {
++ if (BB_ACK(p[prev]))
++ acked_badblocks++;
++ else
++ unacked_badblocks++;
++
++ if (BB_END(p[prev]) >= (s + sectors))
++ len = sectors;
++ else
++ len = BB_END(p[prev]) - s;
++
++ if (set == 0) {
++ *first_bad = BB_OFFSET(p[prev]);
++ *bad_sectors = BB_LEN(p[prev]);
++ set = 1;
++ }
++ goto update_sectors;
++ }
++
++ /* Not front overlap, but behind overlap */
++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
++ len = BB_OFFSET(p[prev + 1]) - bad.start;
++ hint = prev + 1;
++ goto update_sectors;
++ }
++
++	/* Does not cover any badblocks range in the table */
++ len = sectors;
++
++update_sectors:
++ s += len;
++ sectors -= len;
++
++ if (sectors > 0)
++ goto re_check;
++
++ WARN_ON(sectors < 0);
++
++ if (unacked_badblocks > 0)
++ rv = -1;
++ else if (acked_badblocks > 0)
++ rv = 1;
++ else
++ rv = 0;
++
++ if (read_seqretry(&bb->lock, seq))
++ goto retry;
++
++ return rv;
++}
+
+ /**
+ * badblocks_check() - check a given range for bad sectors
+--
+2.31.1
+
diff --git a/for-test/badblocks/v4/backup/0006-badblocks-switch-to-the-improved-badblock-handling-c.patch b/for-test/badblocks/v4/backup/0006-badblocks-switch-to-the-improved-badblock-handling-c.patch
new file mode 100644
index 0000000..6d07398
--- /dev/null
+++ b/for-test/badblocks/v4/backup/0006-badblocks-switch-to-the-improved-badblock-handling-c.patch
@@ -0,0 +1,364 @@
+From f81bac5e10aa50c8245c605c363f7d4de21e318a Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 2 Mar 2021 10:48:43 +0800
+Subject: [PATCH 6/6] badblocks: switch to the improved badblock handling code
+
+This patch removes the old code of badblocks_set(), badblocks_clear()
+and badblocks_check(), and makes them wrappers to call _badblocks_set(),
+_badblocks_clear() and _badblocks_check().
+
+With this change the badblock handling now switches to the improved
+algorithms in _badblocks_set(), _badblocks_clear() and _badblocks_check().
+
+This patch only contains the old code deletion; the newly added code
+for the improved algorithms is in the previous patches.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ block/badblocks.c | 310 +---------------------------------------------
+ 1 file changed, 3 insertions(+), 307 deletions(-)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index 5a1ac35b924a..5ab03cfdc0b7 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -1394,75 +1394,7 @@ static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+ {
+- int hi;
+- int lo;
+- u64 *p = bb->page;
+- int rv;
+- sector_t target = s + sectors;
+- unsigned seq;
+-
+- if (bb->shift > 0) {
+- /* round the start down, and the end up */
+- s >>= bb->shift;
+- target += (1<<bb->shift) - 1;
+- target >>= bb->shift;
+- sectors = target - s;
+- }
+- /* 'target' is now the first block after the bad range */
+-
+-retry:
+- seq = read_seqbegin(&bb->lock);
+- lo = 0;
+- rv = 0;
+- hi = bb->count;
+-
+- /* Binary search between lo and hi for 'target'
+- * i.e. for the last range that starts before 'target'
+- */
+- /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
+- * are known not to be the last range before target.
+- * VARIANT: hi-lo is the number of possible
+- * ranges, and decreases until it reaches 1
+- */
+- while (hi - lo > 1) {
+- int mid = (lo + hi) / 2;
+- sector_t a = BB_OFFSET(p[mid]);
+-
+- if (a < target)
+- /* This could still be the one, earlier ranges
+- * could not.
+- */
+- lo = mid;
+- else
+- /* This and later ranges are definitely out. */
+- hi = mid;
+- }
+- /* 'lo' might be the last that started before target, but 'hi' isn't */
+- if (hi > lo) {
+- /* need to check all range that end after 's' to see if
+- * any are unacknowledged.
+- */
+- while (lo >= 0 &&
+- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+- if (BB_OFFSET(p[lo]) < target) {
+- /* starts before the end, and finishes after
+- * the start, so they must overlap
+- */
+- if (rv != -1 && BB_ACK(p[lo]))
+- rv = 1;
+- else
+- rv = -1;
+- *first_bad = BB_OFFSET(p[lo]);
+- *bad_sectors = BB_LEN(p[lo]);
+- }
+- lo--;
+- }
+- }
+-
+- if (read_seqretry(&bb->lock, seq))
+- goto retry;
+-
+- return rv;
++ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
+ }
+ EXPORT_SYMBOL_GPL(badblocks_check);
+
+@@ -1484,154 +1416,7 @@ EXPORT_SYMBOL_GPL(badblocks_check);
+ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
+ {
+- u64 *p;
+- int lo, hi;
+- int rv = 0;
+- unsigned long flags;
+-
+- if (bb->shift < 0)
+- /* badblocks are disabled */
+- return 1;
+-
+- if (bb->shift) {
+- /* round the start down, and the end up */
+- sector_t next = s + sectors;
+-
+- s >>= bb->shift;
+- next += (1<<bb->shift) - 1;
+- next >>= bb->shift;
+- sectors = next - s;
+- }
+-
+- write_seqlock_irqsave(&bb->lock, flags);
+-
+- p = bb->page;
+- lo = 0;
+- hi = bb->count;
+- /* Find the last range that starts at-or-before 's' */
+- while (hi - lo > 1) {
+- int mid = (lo + hi) / 2;
+- sector_t a = BB_OFFSET(p[mid]);
+-
+- if (a <= s)
+- lo = mid;
+- else
+- hi = mid;
+- }
+- if (hi > lo && BB_OFFSET(p[lo]) > s)
+- hi = lo;
+-
+- if (hi > lo) {
+- /* we found a range that might merge with the start
+- * of our new range
+- */
+- sector_t a = BB_OFFSET(p[lo]);
+- sector_t e = a + BB_LEN(p[lo]);
+- int ack = BB_ACK(p[lo]);
+-
+- if (e >= s) {
+- /* Yes, we can merge with a previous range */
+- if (s == a && s + sectors >= e)
+- /* new range covers old */
+- ack = acknowledged;
+- else
+- ack = ack && acknowledged;
+-
+- if (e < s + sectors)
+- e = s + sectors;
+- if (e - a <= BB_MAX_LEN) {
+- p[lo] = BB_MAKE(a, e-a, ack);
+- s = e;
+- } else {
+- /* does not all fit in one range,
+- * make p[lo] maximal
+- */
+- if (BB_LEN(p[lo]) != BB_MAX_LEN)
+- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
+- s = a + BB_MAX_LEN;
+- }
+- sectors = e - s;
+- }
+- }
+- if (sectors && hi < bb->count) {
+- /* 'hi' points to the first range that starts after 's'.
+- * Maybe we can merge with the start of that range
+- */
+- sector_t a = BB_OFFSET(p[hi]);
+- sector_t e = a + BB_LEN(p[hi]);
+- int ack = BB_ACK(p[hi]);
+-
+- if (a <= s + sectors) {
+- /* merging is possible */
+- if (e <= s + sectors) {
+- /* full overlap */
+- e = s + sectors;
+- ack = acknowledged;
+- } else
+- ack = ack && acknowledged;
+-
+- a = s;
+- if (e - a <= BB_MAX_LEN) {
+- p[hi] = BB_MAKE(a, e-a, ack);
+- s = e;
+- } else {
+- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
+- s = a + BB_MAX_LEN;
+- }
+- sectors = e - s;
+- lo = hi;
+- hi++;
+- }
+- }
+- if (sectors == 0 && hi < bb->count) {
+- /* we might be able to combine lo and hi */
+- /* Note: 's' is at the end of 'lo' */
+- sector_t a = BB_OFFSET(p[hi]);
+- int lolen = BB_LEN(p[lo]);
+- int hilen = BB_LEN(p[hi]);
+- int newlen = lolen + hilen - (s - a);
+-
+- if (s >= a && newlen < BB_MAX_LEN) {
+- /* yes, we can combine them */
+- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
+-
+- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
+- memmove(p + hi, p + hi + 1,
+- (bb->count - hi - 1) * 8);
+- bb->count--;
+- }
+- }
+- while (sectors) {
+- /* didn't merge (it all).
+- * Need to add a range just before 'hi'
+- */
+- if (bb->count >= MAX_BADBLOCKS) {
+- /* No room for more */
+- rv = 1;
+- break;
+- } else {
+- int this_sectors = sectors;
+-
+- memmove(p + hi + 1, p + hi,
+- (bb->count - hi) * 8);
+- bb->count++;
+-
+- if (this_sectors > BB_MAX_LEN)
+- this_sectors = BB_MAX_LEN;
+- p[hi] = BB_MAKE(s, this_sectors, acknowledged);
+- sectors -= this_sectors;
+- s += this_sectors;
+- }
+- }
+-
+- bb->changed = 1;
+- if (!acknowledged)
+- bb->unacked_exist = 1;
+- else
+- badblocks_update_acked(bb);
+- write_sequnlock_irqrestore(&bb->lock, flags);
+-
+- return rv;
++ return _badblocks_set(bb, s, sectors, acknowledged);
+ }
+ EXPORT_SYMBOL_GPL(badblocks_set);
+
+@@ -1651,96 +1436,7 @@ EXPORT_SYMBOL_GPL(badblocks_set);
+ */
+ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+ {
+- u64 *p;
+- int lo, hi;
+- sector_t target = s + sectors;
+- int rv = 0;
+-
+- if (bb->shift > 0) {
+- /* When clearing we round the start up and the end down.
+- * This should not matter as the shift should align with
+- * the block size and no rounding should ever be needed.
+- * However it is better the think a block is bad when it
+- * isn't than to think a block is not bad when it is.
+- */
+- s += (1<<bb->shift) - 1;
+- s >>= bb->shift;
+- target >>= bb->shift;
+- sectors = target - s;
+- }
+-
+- write_seqlock_irq(&bb->lock);
+-
+- p = bb->page;
+- lo = 0;
+- hi = bb->count;
+- /* Find the last range that starts before 'target' */
+- while (hi - lo > 1) {
+- int mid = (lo + hi) / 2;
+- sector_t a = BB_OFFSET(p[mid]);
+-
+- if (a < target)
+- lo = mid;
+- else
+- hi = mid;
+- }
+- if (hi > lo) {
+- /* p[lo] is the last range that could overlap the
+- * current range. Earlier ranges could also overlap,
+- * but only this one can overlap the end of the range.
+- */
+- if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
+- (BB_OFFSET(p[lo]) < target)) {
+- /* Partial overlap, leave the tail of this range */
+- int ack = BB_ACK(p[lo]);
+- sector_t a = BB_OFFSET(p[lo]);
+- sector_t end = a + BB_LEN(p[lo]);
+-
+- if (a < s) {
+- /* we need to split this range */
+- if (bb->count >= MAX_BADBLOCKS) {
+- rv = -ENOSPC;
+- goto out;
+- }
+- memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+- bb->count++;
+- p[lo] = BB_MAKE(a, s-a, ack);
+- lo++;
+- }
+- p[lo] = BB_MAKE(target, end - target, ack);
+- /* there is no longer an overlap */
+- hi = lo;
+- lo--;
+- }
+- while (lo >= 0 &&
+- (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
+- (BB_OFFSET(p[lo]) < target)) {
+- /* This range does overlap */
+- if (BB_OFFSET(p[lo]) < s) {
+- /* Keep the early parts of this range. */
+- int ack = BB_ACK(p[lo]);
+- sector_t start = BB_OFFSET(p[lo]);
+-
+- p[lo] = BB_MAKE(start, s - start, ack);
+- /* now low doesn't overlap, so.. */
+- break;
+- }
+- lo--;
+- }
+- /* 'lo' is strictly before, 'hi' is strictly after,
+- * anything between needs to be discarded
+- */
+- if (hi - lo > 1) {
+- memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
+- bb->count -= (hi - lo - 1);
+- }
+- }
+-
+- badblocks_update_acked(bb);
+- bb->changed = 1;
+-out:
+- write_sequnlock_irq(&bb->lock);
+- return rv;
++ return _badblocks_clear(bb, s, sectors);
+ }
+ EXPORT_SYMBOL_GPL(badblocks_clear);
+
+--
+2.31.1
+
diff --git a/for-test/badblocks/v4/v4-0000-cover-letter.patch b/for-test/badblocks/v4/v4-0000-cover-letter.patch
new file mode 100644
index 0000000..c02f896
--- /dev/null
+++ b/for-test/badblocks/v4/v4-0000-cover-letter.patch
@@ -0,0 +1,70 @@
+From 839dec5ce2a8e6fae537d8eaa5bc4c7ae89e8a49 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 2 Dec 2021 19:05:12 +0800
+Subject: [RESEND PATCH v4 0/6] badblocks improvement for multiple bad block ranges
+
+Hi Dan,
+
+This is the v4 effort to improve the badblocks code APIs to handle
+multiple ranges in the bad block table.
+
+Compared to the v3 series, the v4 series modifications address the code
+review comments from Geliang Tang,
+- Declare local variables in reverse Xmas tree order.
+- Drop orig_start and orig_len from struct badblocks_context.
+- Fix typos in code comments.
+- In badblocks_set(), avoid one unnecessary loop by setting the variable
+  hint to prev (it was prev - 1 in the v3 series).
+
+There is NO in-memory or on-disk format change in the whole series; all
+existing APIs and data structures stay the same. This series only
+improves the code algorithm to handle more corner cases; the interfaces
+are the same and consistent for all existing callers (md raid and nvdimm
+drivers).
+
+The original motivation of the change comes from a requirement of our
+customer: the current badblocks routines don't handle multiple ranges.
+For example, if the bad block setting range covers multiple ranges from
+the bad block table, only the first two bad block ranges are merged and
+the rest are left intact. The expected behavior is that all the covered
+ranges are handled.
+
+All the patches are tested with modified user space code and the code
+logic works as expected. The modified user space testing code is
+provided in the last patch. The testing code is an example of how the
+improved code is tested.
+
+The whole change is divided into 6 patches to make the code review
+clearer and easier. If people prefer, I'd like to post a single large
+patch after the code review is accomplished.
+
+Please review the code and respond. Thank you all in advance.
+
+Coly Li
+
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Richard Fan <richard.fan@suse.com>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+
+Coly Li (6):
+ badblocks: add more helper structure and routines in badblocks.h
+ badblocks: add helper routines for badblock ranges handling
+ badblocks: improvement badblocks_set() for multiple ranges handling
+ badblocks: improve badblocks_clear() for multiple ranges handling
+ badblocks: improve badblocks_check() for multiple ranges handling
+ badblocks: switch to the improved badblock handling code
+Coly Li (1):
+ test: user space code to test badblocks APIs
+
+ block/badblocks.c | 1602 ++++++++++++++++++++++++++++++-------
+ include/linux/badblocks.h | 30 +
+ 2 files changed, 1337 insertions(+), 295 deletions(-)
+
+--
+2.31.1
+
diff --git a/for-test/badblocks/v4/v4-0001-badblocks-add-more-helper-structure-and-routines-.patch b/for-test/badblocks/v4/v4-0001-badblocks-add-more-helper-structure-and-routines-.patch
new file mode 100644
index 0000000..f008556
--- /dev/null
+++ b/for-test/badblocks/v4/v4-0001-badblocks-add-more-helper-structure-and-routines-.patch
@@ -0,0 +1,91 @@
+From 4b3441cc612192914fdf57a8ae3f71479ff3793f Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 2 Dec 2021 15:29:38 +0800
+Subject: [PATCH v4 1/6] badblocks: add more helper structure and routines in
+ badblocks.h
+
+This patch adds the following helper structure and routines into
+badblocks.h,
+- struct badblocks_context
+ This structure is used in improved badblocks code for bad table
+ iteration.
+- BB_END()
+  The macro to calculate the end LBA of a bad range record from the
+  bad table.
+- badblocks_full() and badblocks_empty()
+  The inline routines to check whether the bad table is full or empty.
+- set_changed() and clear_changed()
+  The inline routines to set and clear the 'changed' tag of struct
+  badblocks.
+
+This new helper structure and these routines help to make the code
+clearer; they will be used in the improved badblocks code in the
+following patches.
+
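+As a small illustration of how these macros fit together (a hedged
+example only, not part of the patch):
+
+	/* an acked bad range starting at sector 1024, 8 sectors long */
+	u64 rec = BB_MAKE(1024, 8, 1);
+
+	/* BB_OFFSET(rec) == 1024, BB_LEN(rec) == 8, BB_ACK(rec) == 1,
+	 * and the new BB_END(rec) == 1024 + 8 == 1032, i.e. the first
+	 * sector after the bad range
+	 */
+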
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ include/linux/badblocks.h | 30 ++++++++++++++++++++++++++++++
+ 1 file changed, 30 insertions(+)
+
+diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h
+index 2426276b9bd3..670f2dae692f 100644
+--- a/include/linux/badblocks.h
++++ b/include/linux/badblocks.h
+@@ -15,6 +15,7 @@
+ #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
+ #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
+ #define BB_ACK(x) (!!((x) & BB_ACK_MASK))
++#define BB_END(x) (BB_OFFSET(x) + BB_LEN(x))
+ #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
+
+ /* Bad block numbers are stored sorted in a single page.
+@@ -41,6 +42,12 @@ struct badblocks {
+ sector_t size; /* in sectors */
+ };
+
++struct badblocks_context {
++ sector_t start;
++ sector_t len;
++ int ack;
++};
++
+ int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors);
+ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+@@ -63,4 +70,27 @@ static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb)
+ }
+ badblocks_exit(bb);
+ }
++
++static inline int badblocks_full(struct badblocks *bb)
++{
++ return (bb->count >= MAX_BADBLOCKS);
++}
++
++static inline int badblocks_empty(struct badblocks *bb)
++{
++ return (bb->count == 0);
++}
++
++static inline void set_changed(struct badblocks *bb)
++{
++ if (bb->changed != 1)
++ bb->changed = 1;
++}
++
++static inline void clear_changed(struct badblocks *bb)
++{
++ if (bb->changed != 0)
++ bb->changed = 0;
++}
++
+ #endif
+--
+2.31.1
+
diff --git a/for-test/badblocks/v4/v4-0002-badblocks-add-helper-routines-for-badblock-ranges.patch b/for-test/badblocks/v4/v4-0002-badblocks-add-helper-routines-for-badblock-ranges.patch
new file mode 100644
index 0000000..46116bb
--- /dev/null
+++ b/for-test/badblocks/v4/v4-0002-badblocks-add-helper-routines-for-badblock-ranges.patch
@@ -0,0 +1,457 @@
+From 69aa03e6aa9eb441a3b4bc7c3d017c064d6d821b Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 1 Mar 2021 17:16:57 +0800
+Subject: [PATCH v4 2/6] badblocks: add helper routines for badblock ranges
+ handling
+
+This patch adds several helper routines to improve badblock ranges
+handling. These helper routines will be used later in the improved
+version of badblocks_set()/badblocks_clear()/badblocks_check().
+
+- Helpers prev_by_hint() and prev_badblocks() are used to find the bad
+  range from the bad table which starts at or before the start of the
+  searching range.
+
+- The following helpers are to decide the relative layout between the
+ manipulating range and existing bad block range from bad table.
+ - can_merge_behind()
+ Return 'true' if the manipulating range can backward merge with the
+ bad block range.
+ - can_merge_front()
+ Return 'true' if the manipulating range can forward merge with the
+ bad block range.
+ - can_combine_front()
+ Return 'true' if two adjacent bad block ranges before the
+ manipulating range can be merged.
+ - overlap_front()
+ Return 'true' if the manipulating range exactly overlaps with the
+ bad block range in front of its range.
+ - overlap_behind()
+ Return 'true' if the manipulating range exactly overlaps with the
+ bad block range behind its range.
+ - can_front_overwrite()
+ Return 'true' if the manipulating range can forward overwrite the
+ bad block range in front of its range.
+
+- The following helpers are to add the manipulating range into the bad
+ block table. Different routine is called with the specific relative
+ layout between the manipulating range and other bad block range in the
+ bad block table.
+ - behind_merge()
+ Merge the manipulating range with the bad block range behind its
+ range, and return the number of merged length in unit of sector.
+ - front_merge()
+ Merge the manipulating range with the bad block range in front of
+ its range, and return the number of merged length in unit of sector.
+ - front_combine()
+ Combine the two adjacent bad block ranges before the manipulating
+ range into a larger one.
+ - front_overwrite()
+ Overwrite partial of whole bad block range which is in front of the
+ manipulating range. The overwrite may split existing bad block range
+ and generate more bad block ranges into the bad block table.
+ - insert_at()
+ Insert the manipulating range at a specific location in the bad
+ block table.
+
+All the above helpers are used in later patches to improve the bad block
+range handling for badblocks_set()/badblocks_clear()/badblocks_check();
+a small hedged illustration of the merge checks is sketched below.
+
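+As a small, hedged illustration of the merge checks (the concrete
+numbers are made up and assume the record length stays below
+BB_MAX_LEN):
+
+	/* an acked bad block record covering sectors [100, 200) */
+	u64 rec = BB_MAKE(100, 100, 1);
+
+	/* An acked setting range whose start falls inside the record, or
+	 * exactly at BB_END(rec) == 200, passes can_merge_front() and is
+	 * folded in by front_merge(); an unacked setting range at the same
+	 * position fails the check because the acknowledge values differ.
+	 */
+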
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ block/badblocks.c | 374 ++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 374 insertions(+)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index d39056630d9c..e216c6791b4b 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -16,6 +16,380 @@
+ #include <linux/types.h>
+ #include <linux/slab.h>
+
++/*
++ * Find the range starts at-or-before 's' from bad table. The search
++ * starts from index 'hint' and stops at index 'hint_end' from the bad
++ * table.
++ */
++static int prev_by_hint(struct badblocks *bb, sector_t s, int hint)
++{
++ int hint_end = hint + 2;
++ u64 *p = bb->page;
++ int ret = -1;
++
++ while ((hint < hint_end) && ((hint + 1) <= bb->count) &&
++ (BB_OFFSET(p[hint]) <= s)) {
++ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) {
++ ret = hint;
++ break;
++ }
++ hint++;
++ }
++
++ return ret;
++}
++
++/*
++ * Find the range that starts at-or-before bad->start. If 'hint' is provided
++ * (hint >= 0) then search in the bad table from hint first. It is very
++ * probable that the wanted bad range can be found from the hint index, so
++ * the unnecessary while-loop iteration can be avoided.
++ */
++static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad,
++ int hint)
++{
++ sector_t s = bad->start;
++ int ret = -1;
++ int lo, hi;
++ u64 *p;
++
++ if (!bb->count)
++ goto out;
++
++ if (hint >= 0) {
++ ret = prev_by_hint(bb, s, hint);
++ if (ret >= 0)
++ goto out;
++ }
++
++ lo = 0;
++ hi = bb->count;
++ p = bb->page;
++
++ while (hi - lo > 1) {
++ int mid = (lo + hi)/2;
++ sector_t a = BB_OFFSET(p[mid]);
++
++ if (a <= s)
++ lo = mid;
++ else
++ hi = mid;
++ }
++
++ if (BB_OFFSET(p[lo]) <= s)
++ ret = lo;
++out:
++ return ret;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' can be backward merged
++ * with the bad range (from the bad table) indexed by 'behind'.
++ */
++static bool can_merge_behind(struct badblocks *bb, struct badblocks_context *bad,
++ int behind)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ int ack = bad->ack;
++ u64 *p = bb->page;
++
++ if ((s <= BB_OFFSET(p[behind])) &&
++ ((s + sectors) >= BB_OFFSET(p[behind])) &&
++ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) &&
++ BB_ACK(p[behind]) == ack)
++ return true;
++ return false;
++}
++
++/*
++ * Do backward merge for range indicated by 'bad' and the bad range
++ * (from the bad table) indexed by 'behind'. The return value is merged
++ * sectors from bad->len.
++ */
++static int behind_merge(struct badblocks *bb, struct badblocks_context *bad,
++ int behind)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ int ack = bad->ack;
++ u64 *p = bb->page;
++ int merged = 0;
++
++ WARN_ON(s > BB_OFFSET(p[behind]));
++ WARN_ON((s + sectors) < BB_OFFSET(p[behind]));
++
++ if (s < BB_OFFSET(p[behind])) {
++ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN);
++
++ merged = min_t(sector_t, sectors, BB_OFFSET(p[behind]) - s);
++ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, ack);
++ } else {
++ merged = min_t(sector_t, sectors, BB_LEN(p[behind]));
++ }
++
++ WARN_ON(merged == 0);
++
++ return merged;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' can be forward
++ * merged with the bad range (from the bad table) indexed by 'prev'.
++ */
++static bool can_merge_front(struct badblocks *bb, int prev,
++ struct badblocks_context *bad)
++{
++ sector_t s = bad->start;
++ int ack = bad->ack;
++ u64 *p = bb->page;
++
++ if (BB_ACK(p[prev]) == ack &&
++ (s < BB_END(p[prev]) ||
++ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN))))
++ return true;
++ return false;
++}
++
++/*
++ * Do forward merge for range indicated by 'bad' and the bad range
++ * (from bad table) indexed by 'prev'. The return value is sectors
++ * merged from bad->len.
++ */
++static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ int ack = bad->ack;
++ u64 *p = bb->page;
++ int merged = 0;
++
++ WARN_ON(s > BB_END(p[prev]));
++
++ if (s < BB_END(p[prev])) {
++ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s);
++ } else {
++ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev]));
++ if ((prev + 1) < bb->count &&
++ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) {
++ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]);
++ }
++
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ BB_LEN(p[prev]) + merged, ack);
++ }
++
++ return merged;
++}
++
++/*
++ * 'Combine' is a special case which can_merge_front() is not able to
++ * handle: if a bad range (indexed by 'prev' in the bad table) starts
++ * exactly at bad->start, and the bad range ahead of 'prev' (indexed by
++ * 'prev - 1' in the bad table) ends exactly where 'prev' starts, and
++ * the sum of their lengths does not exceed the BB_MAX_LEN limitation,
++ * then these two bad ranges (from the bad table) can be combined.
++ *
++ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad
++ * table can be combined.
++ */
++static bool can_combine_front(struct badblocks *bb, int prev,
++ struct badblocks_context *bad)
++{
++ u64 *p = bb->page;
++
++ if ((prev > 0) &&
++ (BB_OFFSET(p[prev]) == bad->start) &&
++ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) &&
++ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) &&
++ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev])))
++ return true;
++ return false;
++}
++
++/*
++ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad
++ * table) into one larger bad range, and the new range is indexed by
++ * 'prev - 1'.
++ */
++static void front_combine(struct badblocks *bb, int prev)
++{
++ u64 *p = bb->page;
++
++ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]),
++ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]),
++ BB_ACK(p[prev]));
++ if ((prev + 1) < bb->count)
++ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8);
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' is exactly forward
++ * overlapped with the bad range (from bad table) indexed by 'front'.
++ * Exactly forward overlap means the bad range (from bad table) indexed
++ * by 'front' does not cover the whole range indicated by 'bad'.
++ */
++static bool overlap_front(struct badblocks *bb, int front,
++ struct badblocks_context *bad)
++{
++ u64 *p = bb->page;
++
++ if (bad->start >= BB_OFFSET(p[front]) &&
++ bad->start < BB_END(p[front]))
++ return true;
++ return false;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' is exactly backward
++ * overlapped with the bad range (from bad table) indexed by 'behind'.
++ */
++static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad,
++ int behind)
++{
++ u64 *p = bb->page;
++
++ if (bad->start < BB_OFFSET(p[behind]) &&
++ (bad->start + bad->len) > BB_OFFSET(p[behind]))
++ return true;
++ return false;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' can overwrite the bad
++ * range (from bad table) indexed by 'prev'.
++ *
++ * The range indicated by 'bad' can overwrite the bad range indexed by
++ * 'prev' when,
++ * 1) The whole range indicated by 'bad' covers part or all of the bad
++ *    range (from the bad table) indexed by 'prev'.
++ * 2) The ack value of 'bad' is larger than the ack value of the bad
++ *    range 'prev'.
++ *
++ * If the overwriting doesn't cover the whole bad range (from the bad
++ * table) indexed by 'prev', a new range might be split from the
++ * existing bad range,
++ * 1) The overwrite covers the head or tail part of the existing bad
++ *    range; 1 extra bad range will be split out and added into the bad
++ *    table.
++ * 2) The overwrite covers the middle of the existing bad range; 2 extra
++ *    bad ranges will be split out (ahead of and after the overwritten
++ *    range) and added into the bad table.
++ * The number of extra split ranges is stored in 'extra' and returned to
++ * the caller.
++ */
++static bool can_front_overwrite(struct badblocks *bb, int prev,
++ struct badblocks_context *bad, int *extra)
++{
++ u64 *p = bb->page;
++ int len;
++
++ WARN_ON(!overlap_front(bb, prev, bad));
++
++ if (BB_ACK(p[prev]) >= bad->ack)
++ return false;
++
++ if (BB_END(p[prev]) <= (bad->start + bad->len)) {
++ len = BB_END(p[prev]) - bad->start;
++ if (BB_OFFSET(p[prev]) == bad->start)
++ *extra = 0;
++ else
++ *extra = 1;
++
++ bad->len = len;
++ } else {
++ if (BB_OFFSET(p[prev]) == bad->start)
++ *extra = 1;
++ else
++ /*
++ * prev range will be split into two; besides the overwritten
++ * one, an extra slot is needed from the bad table.
++ */
++ *extra = 2;
++ }
++
++ if ((bb->count + (*extra)) >= MAX_BADBLOCKS)
++ return false;
++
++ return true;
++}
++
++/*
++ * Do the overwrite from the range indicated by 'bad' to the bad range
++ * (from bad table) indexed by 'prev'.
++ * The previously called can_front_overwrite() will provide how many
++ * extra bad range(s) might be split and added into the bad table. All
++ * the splitting cases in the bad table will be handled here.
++ */
++static int front_overwrite(struct badblocks *bb, int prev,
++ struct badblocks_context *bad, int extra)
++{
++ u64 *p = bb->page;
++ sector_t orig_end = BB_END(p[prev]);
++ int orig_ack = BB_ACK(p[prev]);
++ int n = extra;
++
++ switch (extra) {
++ case 0:
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]),
++ bad->ack);
++ break;
++ case 1:
++ if (BB_OFFSET(p[prev]) == bad->start) {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ bad->len, bad->ack);
++ memmove(p + prev + 2, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(bad->start + bad->len,
++ orig_end - BB_END(p[prev]),
++ orig_ack);
++ } else {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ bad->start - BB_OFFSET(p[prev]),
++ BB_ACK(p[prev]));
++ memmove(p + prev + 1 + n, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
++ }
++ break;
++ case 2:
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ bad->start - BB_OFFSET(p[prev]),
++ BB_ACK(p[prev]));
++ memmove(p + prev + 1 + n, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
++ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]),
++ orig_end - BB_END(p[prev + 1]),
++ BB_ACK(p[prev]));
++ break;
++ default:
++ break;
++ }
++
++ return bad->len;
++}
++
++/*
++ * Explicitly insert a range indicated by 'bad' to the bad table, where
++ * the location is indexed by 'at'.
++ */
++static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ int ack = bad->ack;
++ u64 *p = bb->page;
++ int len;
++
++ WARN_ON(badblocks_full(bb));
++
++ len = min_t(sector_t, sectors, BB_MAX_LEN);
++ if (at < bb->count)
++ memmove(p + at + 1, p + at, (bb->count - at) * 8);
++ p[at] = BB_MAKE(s, len, ack);
++
++ return len;
++}
++
+ /**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+--
+2.31.1
+
diff --git a/for-test/badblocks/v4/v4-0003-badblocks-improvement-badblocks_set-for-multiple-.patch b/for-test/badblocks/v4/v4-0003-badblocks-improvement-badblocks_set-for-multiple-.patch
new file mode 100644
index 0000000..cd732d0
--- /dev/null
+++ b/for-test/badblocks/v4/v4-0003-badblocks-improvement-badblocks_set-for-multiple-.patch
@@ -0,0 +1,661 @@
+From c6d337537fae982c4d24ce626436e32a2f71e5f8 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 2 Dec 2021 15:57:50 +0800
+Subject: [PATCH v4 3/6] badblocks: improve badblocks_set() for multiple ranges handling
+
+Recently I received a bug report that current badblocks code does not
+properly handle multiple ranges. For example,
+ badblocks_set(bb, 32, 1, true);
+ badblocks_set(bb, 34, 1, true);
+ badblocks_set(bb, 36, 1, true);
+ badblocks_set(bb, 32, 12, true);
+Then indeed badblocks_show() reports,
+ 32 3
+ 36 1
+But the expected bad blocks table should be,
+ 32 12
+Obviously only the first 2 ranges are merged, then badblocks_set()
+returns and ignores the rest of the setting range.
+
+This behavior is improper: if the caller of badblocks_set() wants to set
+a range of blocks in the bad blocks table, all of the blocks in the
+range should be handled even if a previous part encounters failure.
+
+The desired way for badblocks_set() to set a bad blocks range is,
+- Set as many blocks from the setting range as possible into the bad
+  blocks table.
+- Merge the bad block ranges so that they occupy as few slots as
+  possible in the bad blocks table.
+- Be fast.
+
+Indeed the above proposal is complicated, especially with the following
+restrictions,
+- The setting bad blocks range can be acknowledged or not acknowledged.
+- The bad blocks table size is limited.
+- Memory allocation should be avoided.
+
+The basic idea of the patch is to categorize all possible bad block
+range setting combinations into a much smaller set of simplified
+conditions plus a few special cases. Inside badblocks_set() there is an
+implicit loop composed by jumping between the labels 're_insert' and
+'update_sectors'. No matter how large the setting bad blocks range is,
+in every loop iteration just a minimized range from the head is handled
+by a pre-defined behavior from one of the categorized conditions. The
+logic is simple and the code flow is manageable.
+
+The different relative layouts between the setting range and the
+existing bad block ranges are checked and handled (merge, combine,
+overwrite, insert) by the helpers from the previous patch. This patch
+makes all the helpers work together with the above idea.
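+
+The head-by-head handling can be illustrated with a small userspace
+model (heavily simplified: plain structs instead of packed u64 entries,
+no acknowledge flags and no table-full handling). Fed with the sequence
+from the bug report above, it prints the expected single range:
+
+  #include <stdio.h>
+
+  #define MAX_RANGES 16
+
+  struct range { unsigned long long start, len; };
+
+  static struct range tbl[MAX_RANGES];
+  static int count;
+
+  /* Insert a new range at index 'at', shifting the tail of the table. */
+  static void insert_range(int at, unsigned long long s, unsigned long long len)
+  {
+      for (int i = count; i > at; i--)
+          tbl[i] = tbl[i - 1];
+      tbl[at].start = s;
+      tbl[at].len = len;
+      count++;
+  }
+
+  /* Index of the last range starting at-or-before 's', or -1. */
+  static int prev_range(unsigned long long s)
+  {
+      int ret = -1;
+
+      for (int i = 0; i < count && tbl[i].start <= s; i++)
+          ret = i;
+      return ret;
+  }
+
+  /* Handle only the head piece of [s, s + sectors) in each pass. */
+  static void set_range(unsigned long long s, unsigned long long sectors)
+  {
+      while (sectors > 0) {
+          unsigned long long end = s + sectors, len;
+          int prev = prev_range(s);
+
+          /* never run past the next already set range */
+          if (prev + 1 < count && end > tbl[prev + 1].start)
+              end = tbl[prev + 1].start;
+
+          if (prev >= 0 && s <= tbl[prev].start + tbl[prev].len) {
+              /* front merge: extend tbl[prev] over the head piece */
+              if (end > tbl[prev].start + tbl[prev].len)
+                  tbl[prev].len = end - tbl[prev].start;
+          } else {
+              /* no range in front to merge with: insert the head piece */
+              insert_range(prev + 1, s, end - s);
+              prev++;
+          }
+          /* combine with the next range if the two now touch */
+          if (prev + 1 < count &&
+              tbl[prev].start + tbl[prev].len == tbl[prev + 1].start) {
+              tbl[prev].len += tbl[prev + 1].len;
+              for (int i = prev + 1; i < count - 1; i++)
+                  tbl[i] = tbl[i + 1];
+              count--;
+          }
+          len = end - s;
+          s += len;
+          sectors -= len;
+      }
+  }
+
+  int main(void)
+  {
+      set_range(32, 1);
+      set_range(34, 1);
+      set_range(36, 1);
+      set_range(32, 12);                      /* the report's final call */
+
+      for (int i = 0; i < count; i++)         /* prints "32 12" */
+          printf("%llu %llu\n", tbl[i].start, tbl[i].len);
+      return 0;
+  }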
+
+This patch only contains the algorithm improvement for badblocks_set().
+The following patches contain improvements for badblocks_clear() and
+badblocks_check(). But the algorithm in badblocks_set() is fundamental
+and typical; the other improvements in the clear and check routines are
+based on the helpers and ideas in this patch.
+
+In order to make the change clearer for code review, this patch does
+not directly modify the existing badblocks_set(), and just adds a new
+routine named _badblocks_set(). A later patch will remove the current
+badblocks_set() code and make it a wrapper of _badblocks_set(). So the
+newly added changes won't be mixed with deleted code, and the code
+review can be easier.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ block/badblocks.c | 560 ++++++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 540 insertions(+), 20 deletions(-)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index e216c6791b4b..13eaad18be15 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -16,6 +16,322 @@
+ #include <linux/types.h>
+ #include <linux/slab.h>
+
++/*
++ * The purpose of badblocks set/clear is to manage bad blocks ranges which are
++ * identified by LBA addresses.
++ *
++ * When the caller of badblocks_set() wants to set a range of bad blocks, the
++ * setting range can be acked or unacked. And the setting range may merge,
++ * overwrite, skip the overlapped already set range, depends on who they are
++ * overlapped or adjacent, and the acknowledgment type of the ranges. It can be
++ * more complicated when the setting range covers multiple already set bad block
++ * ranges, with restrictions of maximum length of each bad range and the bad
++ * table space limitation.
++ *
++ * It is difficult and unnecessary to take care of all the possible situations,
++ * for setting a large range of bad blocks, we can handle it by dividing the
++ * large range into smaller ones when encounter overlap, max range length or
++ * bad table full conditions. Every time only a smaller piece of the bad range
++ * is handled with a limited number of conditions how it is interacted with
++ * possible overlapped or adjacent already set bad block ranges. Then the hard
++ * complicated problem can be much simpler to handle in proper way.
++ *
++ * When setting a range of bad blocks to the bad table, the simplified situations
++ * to be considered are, (The already set bad blocks ranges are naming with
++ * prefix E, and the setting bad blocks range is naming with prefix S)
++ *
++ * 1) A setting range is not overlapped or adjacent to any other already set bad
++ * block range.
++ * +--------+
++ * | S |
++ * +--------+
++ * +-------------+ +-------------+
++ * | E1 | | E2 |
++ * +-------------+ +-------------+
++ * For this situation if the bad blocks table is not full, just allocate a
++ * free slot from the bad blocks table to mark the setting range S. The
++ * result is,
++ * +-------------+ +--------+ +-------------+
++ * | E1 | | S | | E2 |
++ * +-------------+ +--------+ +-------------+
++ * 2) A setting range starts exactly at a start LBA of an already set bad blocks
++ * range.
++ * 2.1) The setting range size < already set range size
++ * +--------+
++ * | S |
++ * +--------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 2.1.1) If S and E are both acked or unacked range, the setting range S can
++ * be merged into existing bad range E. The result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and
++ * the result is,
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E.
++ * An extra slot from the bad blocks table will be allocated for S, and head
++ * of E will move to end of the inserted range S. The result is,
++ * +--------+----+
++ * | S | E |
++ * +--------+----+
++ * 2.2) The setting range size == already set range size
++ * 2.2.1) If S and E are both acked or unacked range, the setting range S can
++ * be merged into existing bad range E. The result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and
++ * the result is,
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of
++ *    bad blocks range E. The result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 2.3) The setting range size > already set range size
++ * +-------------------+
++ * | S |
++ * +-------------------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * For such situation, the setting range S can be treated as two parts, the
++ * first part (S1) is as same size as the already set range E, the second
++ * part (S2) is the rest of setting range.
++ * +-------------+-----+ +-------------+ +-----+
++ * | S1 | S2 | | S1 | | S2 |
++ * +-------------+-----+ ===> +-------------+ +-----+
++ * +-------------+ +-------------+
++ * | E | | E |
++ * +-------------+ +-------------+
++ * Now we only focus on how to handle the setting range S1 and already set
++ * range E, which are already explained in 2.2), for the rest S2 it will be
++ * handled later in next loop.
++ * 3) A setting range starts before the start LBA of an already set bad blocks
++ * range.
++ * +-------------+
++ * | S |
++ * +-------------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * For this situation, the setting range S can be divided into two parts, the
++ * first (S1) ends at the start LBA of already set range E, the second part
++ * (S2) starts exactly at a start LBA of the already set range E.
++ * +----+---------+ +----+ +---------+
++ * | S1 | S2 | | S1 | | S2 |
++ * +----+---------+ ===> +----+ +---------+
++ * +-------------+ +-------------+
++ * | E | | E |
++ * +-------------+ +-------------+
++ * Now only the first part S1 should be handled in this loop, which is in
++ * similar condition as 1). The rest part S2 has exact same start LBA address
++ * of the already set range E, they will be handled in next loop in one of
++ * situations in 2).
++ * 4) A setting range starts after the start LBA of an already set bad blocks
++ * range.
++ * 4.1) If the setting range S exactly matches the tail part of already set bad
++ * blocks range E, like the following chart shows,
++ * +---------+
++ * | S |
++ * +---------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked),
++ * they will be merged into one, the result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 4.1.2) If range E is acked and the setting range S is unacked, the setting
++ * request of S will be rejected, the result is,
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may
++ * overwrite the overlapped range of E, the result is,
++ * +---+---------+
++ * | E | S |
++ * +---+---------+
++ * 4.2) If the setting range S stays in middle of an already set range E, like
++ * the following chart shows,
++ * +----+
++ * | S |
++ * +----+
++ * +--------------+
++ * | E |
++ * +--------------+
++ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked),
++ * they will be merged into one, the result is,
++ * +--------------+
++ * | S |
++ * +--------------+
++ * 4.2.2) If range E is acked and the setting range S is unacked, the setting
++ * request of S will be rejected, the result is also,
++ * +--------------+
++ * | E |
++ * +--------------+
++ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will
++ *    be inserted into the middle of E and split the previous range E into two
++ *    parts (E1 and E2). The result is,
++ * +----+----+----+
++ * | E1 | S | E2 |
++ * +----+----+----+
++ * 4.3) If the setting bad blocks range S is overlapped with an already set bad
++ * blocks range E. The range S starts after the start LBA of range E, and
++ * ends after the end LBA of range E, as the following chart shows,
++ * +-------------------+
++ * | S |
++ * +-------------------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * For this situation the range S can be divided into two parts, the first
++ * part (S1) ends at end range E, and the second part (S2) has rest range of
++ * origin S.
++ * +---------+---------+ +---------+ +---------+
++ * | S1 | S2 | | S1 | | S2 |
++ * +---------+---------+ ===> +---------+ +---------+
++ * +-------------+ +-------------+
++ * | E | | E |
++ * +-------------+ +-------------+
++ * Now in this loop the setting range S1 and already set range E can be
++ * handled as the situations 4), the rest range S2 will be handled in next
++ * loop and ignored in this loop.
++ * 5) A setting bad blocks range S is adjacent to one or more already set bad
++ * blocks range(s), and they are all acked or unacked range.
++ * 5.1) Front merge: If the already set bad blocks range E is before setting
++ * range S and they are adjacent,
++ * +------+
++ * | S |
++ * +------+
++ * +-------+
++ * | E |
++ * +-------+
++ * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge
++ * values are same, the setting range S can front merges into range E. The
++ * result is,
++ * +--------------+
++ * | S |
++ * +--------------+
++ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting
++ * range S right after already set range E into the bad blocks table. The
++ * result is,
++ * +--------+------+
++ * | E | S |
++ * +--------+------+
++ * 6) Special cases which above conditions cannot handle
++ * 6.1) Multiple already set ranges may merge into less ones in a full bad table
++ * +-------------------------------------------------------+
++ * | S |
++ * +-------------------------------------------------------+
++ * |<----- BB_MAX_LEN ----->|
++ * +-----+ +-----+ +-----+
++ * | E1 | | E2 | | E3 |
++ * +-----+ +-----+ +-----+
++ *    In the above example, when the bad blocks table is full, inserting the
++ *    first part of the setting range S will fail because no more available
++ *    slots can be allocated from the bad blocks table. In this situation a
++ *    proper setting method should go through the whole setting bad blocks
++ *    range and look for chances to merge already set ranges into fewer ones.
++ *    When there are available slots in the bad blocks table, retry to handle
++ *    as many of the setting bad block ranges as possible.
++ * +------------------------+
++ * | S3 |
++ * +------------------------+
++ * |<----- BB_MAX_LEN ----->|
++ * +-----+-----+-----+---+-----+--+
++ * | S1 | S2 |
++ * +-----+-----+-----+---+-----+--+
++ *    The above chart shows that although the first part (S3) cannot be
++ *    inserted due to no space in the bad blocks table, the following E1, E2
++ *    and E3 ranges can be merged with the rest of S into the smaller ranges
++ *    S1 and S2. Now there is 1 free slot in the bad blocks table.
++ * +------------------------+-----+-----+-----+---+-----+--+
++ * | S3 | S1 | S2 |
++ * +------------------------+-----+-----+-----+---+-----+--+
++ *    Since the bad blocks table is not full anymore, retry the original
++ *    setting range S. Now the setting range S3 can be inserted into the bad
++ *    blocks table with the slot previously freed by merging multiple ranges.
++ * 6.2) Front merge after overwrite
++ * In the following example, in bad blocks table, E1 is an acked bad blocks
++ * range and E2 is an unacked bad blocks range, therefore they are not able
++ * to merge into a larger range. The setting bad blocks range S is acked,
++ * therefore part of E2 can be overwritten by S.
++ * +--------+
++ * | S | acknowledged
++ * +--------+ S: 1
++ * +-------+-------------+ E1: 1
++ * | E1 | E2 | E2: 0
++ * +-------+-------------+
++ * With previous simplified routines, after overwriting part of E2 with S,
++ * the bad blocks table should be (E3 is remaining part of E2 which is not
++ * overwritten by S),
++ * acknowledged
++ * +-------+--------+----+ S: 1
++ * | E1 | S | E3 | E1: 1
++ * +-------+--------+----+ E3: 0
++ *    The above result is correct but not perfect. Ranges E1 and S in the bad
++ *    blocks table are both acked; merging them into one larger range may
++ *    occupy less bad blocks table space and make badblocks_check() faster.
++ *    Therefore in such a situation, after overwriting range S, the previous
++ *    range E1 should be checked for a possible front combination. Then the
++ *    ideal result can be,
++ * +----------------+----+ acknowledged
++ * | E1 | E3 | E1: 1
++ * +----------------+----+ E3: 0
++ * 6.3) Behind merge: the already set bad blocks range E is behind the setting
++ *    range S and they are adjacent. Normally we don't need to care about this
++ *    because the front merge handles it while going through range S from head
++ *    to tail, except for the tail part of range S. When the setting range S is
++ *    fully handled, none of the above simplified routines checks whether the
++ *    tail LBA of range S is adjacent to the next already set range, so they
++ *    are not able to merge them even if they are mergeable.
++ * +------+
++ * | S |
++ * +------+
++ * +-------+
++ * | E |
++ * +-------+
++ *    For the above special situation, when the setting range S is fully
++ *    handled and the loop ends, an extra check is necessary for whether the
++ *    next already set range E is right after S and mergeable.
++ * 6.3.1) When the total size of ranges E and S <= BB_MAX_LEN, and their
++ *    acknowledge values are the same, the setting range S can be behind merged
++ *    into range E. The result is,
++ * +--------------+
++ * | S |
++ * +--------------+
++ * 6.3.2) Otherwise these two ranges cannot merge, just insert the setting range
++ * S in front of the already set range E in the bad blocks table. The result
++ * is,
++ * +------+-------+
++ * | S | E |
++ * +------+-------+
++ *
++ * All the above 5 simplified situations and 3 special cases may cover 99%+ of
++ * the bad block range setting conditions. Maybe some rare corner cases are not
++ * considered or optimized, but it won't hurt if badblocks_set() fails due to
++ * no space, or if some ranges are not merged to save bad blocks table space.
++ *
++ * Inside badblocks_set() each loop iteration starts by jumping to the
++ * re_insert label; every time, prev_badblocks() is called to find an already
++ * set range which starts before or at the current setting range. Since the
++ * setting bad blocks range is handled from head to tail, in most cases it is
++ * unnecessary to do the binary search inside prev_badblocks(); it is possible
++ * to provide a hint to prev_badblocks() for a fast path, so the expensive
++ * binary search can be avoided. In my test with the hint to prev_badblocks(),
++ * except for the first loop iteration, all remaining calls to prev_badblocks()
++ * can go into the fast path and return the correct bad blocks table index
++ * immediately.
++ */
++
+ /*
+ * Find the range starts at-or-before 's' from bad table. The search
+ * starts from index 'hint' and stops at index 'hint_end' from the bad
+@@ -390,6 +706,230 @@ static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad
+ return len;
+ }
+
++static void badblocks_update_acked(struct badblocks *bb)
++{
++ bool unacked = false;
++ u64 *p = bb->page;
++ int i;
++
++ if (!bb->unacked_exist)
++ return;
++
++ for (i = 0; i < bb->count ; i++) {
++ if (!BB_ACK(p[i])) {
++ unacked = true;
++ break;
++ }
++ }
++
++ if (!unacked)
++ bb->unacked_exist = 0;
++}
++
++/* Do exact work to set bad block range into the bad block table */
++static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
++ int acknowledged)
++{
++ int retried = 0, space_desired = 0;
++ int orig_len, len = 0, added = 0;
++ struct badblocks_context bad;
++ int prev = -1, hint = -1;
++ sector_t orig_start;
++ unsigned long flags;
++ int rv = 0;
++ u64 *p;
++
++ if (bb->shift < 0)
++ /* badblocks are disabled */
++ return 1;
++
++ if (sectors == 0)
++ /* Invalid sectors number */
++ return 1;
++
++ if (bb->shift) {
++ /* round the start down, and the end up */
++ sector_t next = s + sectors;
++
++ rounddown(s, bb->shift);
++ roundup(next, bb->shift);
++ sectors = next - s;
++ }
++
++ write_seqlock_irqsave(&bb->lock, flags);
++
++ orig_start = s;
++ orig_len = sectors;
++ bad.ack = acknowledged;
++ p = bb->page;
++
++re_insert:
++ bad.start = s;
++ bad.len = sectors;
++ len = 0;
++
++ if (badblocks_empty(bb)) {
++ len = insert_at(bb, 0, &bad);
++ bb->count++;
++ added++;
++ goto update_sectors;
++ }
++
++ prev = prev_badblocks(bb, &bad, hint);
++
++ /* start before all badblocks */
++ if (prev < 0) {
++ if (!badblocks_full(bb)) {
++ /* insert on the first */
++ if (bad.len > (BB_OFFSET(p[0]) - bad.start))
++ bad.len = BB_OFFSET(p[0]) - bad.start;
++ len = insert_at(bb, 0, &bad);
++ bb->count++;
++ added++;
++ hint = 0;
++ goto update_sectors;
++ }
++
++ /* No space, try to merge */
++ if (overlap_behind(bb, &bad, 0)) {
++ if (can_merge_behind(bb, &bad, 0)) {
++ len = behind_merge(bb, &bad, 0);
++ added++;
++ } else {
++ len = min_t(sector_t,
++ BB_OFFSET(p[0]) - s, sectors);
++ space_desired = 1;
++ }
++ hint = 0;
++ goto update_sectors;
++ }
++
++ /* no table space and give up */
++ goto out;
++ }
++
++ /* in case p[prev-1] can be merged with p[prev] */
++ if (can_combine_front(bb, prev, &bad)) {
++ front_combine(bb, prev);
++ bb->count--;
++ added++;
++ hint = prev;
++ goto update_sectors;
++ }
++
++ if (overlap_front(bb, prev, &bad)) {
++ if (can_merge_front(bb, prev, &bad)) {
++ len = front_merge(bb, prev, &bad);
++ added++;
++ } else {
++ int extra = 0;
++
++ if (!can_front_overwrite(bb, prev, &bad, &extra)) {
++ len = min_t(sector_t,
++ BB_END(p[prev]) - s, sectors);
++ hint = prev;
++ goto update_sectors;
++ }
++
++ len = front_overwrite(bb, prev, &bad, extra);
++ added++;
++ bb->count += extra;
++
++ if (can_combine_front(bb, prev, &bad)) {
++ front_combine(bb, prev);
++ bb->count--;
++ }
++ }
++ hint = prev;
++ goto update_sectors;
++ }
++
++ if (can_merge_front(bb, prev, &bad)) {
++ len = front_merge(bb, prev, &bad);
++ added++;
++ hint = prev;
++ goto update_sectors;
++ }
++
++ /* if no space in table, still try to merge in the covered range */
++ if (badblocks_full(bb)) {
++ /* skip the cannot-merge range */
++ if (((prev + 1) < bb->count) &&
++ overlap_behind(bb, &bad, prev + 1) &&
++ ((s + sectors) >= BB_END(p[prev + 1]))) {
++ len = BB_END(p[prev + 1]) - s;
++ hint = prev + 1;
++ goto update_sectors;
++ }
++
++ /* no retry any more */
++ len = sectors;
++ space_desired = 1;
++ hint = -1;
++ goto update_sectors;
++ }
++
++ /* cannot merge and there is space in bad table */
++ if ((prev + 1) < bb->count &&
++ overlap_behind(bb, &bad, prev + 1))
++ bad.len = min_t(sector_t,
++ bad.len, BB_OFFSET(p[prev + 1]) - bad.start);
++
++ len = insert_at(bb, prev + 1, &bad);
++ bb->count++;
++ added++;
++ hint = prev + 1;
++
++update_sectors:
++ s += len;
++ sectors -= len;
++
++ if (sectors > 0)
++ goto re_insert;
++
++ WARN_ON(sectors < 0);
++
++ /* Check whether the following already set range can be merged */
++ if ((prev + 1) < bb->count &&
++ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
++ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
++ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
++ BB_ACK(p[prev]));
++
++ if ((prev + 2) < bb->count)
++ memmove(p + prev + 1, p + prev + 2,
++ (bb->count - (prev + 2)) * 8);
++ bb->count--;
++ }
++
++ if (space_desired && !badblocks_full(bb)) {
++ s = orig_start;
++ sectors = orig_len;
++ space_desired = 0;
++ if (retried++ < 3)
++ goto re_insert;
++ }
++
++out:
++ if (added) {
++ set_changed(bb);
++
++ if (!acknowledged)
++ bb->unacked_exist = 1;
++ else
++ badblocks_update_acked(bb);
++ }
++
++ write_sequnlock_irqrestore(&bb->lock, flags);
++
++ if (!added)
++ rv = 1;
++
++ return rv;
++}
++
+ /**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+@@ -499,26 +1039,6 @@ int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ }
+ EXPORT_SYMBOL_GPL(badblocks_check);
+
+-static void badblocks_update_acked(struct badblocks *bb)
+-{
+- u64 *p = bb->page;
+- int i;
+- bool unacked = false;
+-
+- if (!bb->unacked_exist)
+- return;
+-
+- for (i = 0; i < bb->count ; i++) {
+- if (!BB_ACK(p[i])) {
+- unacked = true;
+- break;
+- }
+- }
+-
+- if (!unacked)
+- bb->unacked_exist = 0;
+-}
+-
+ /**
+ * badblocks_set() - Add a range of bad blocks to the table.
+ * @bb: the badblocks structure that holds all badblock information
+--
+2.31.1
+
diff --git a/for-test/badblocks/v4/v4-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch b/for-test/badblocks/v4/v4-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch
new file mode 100644
index 0000000..ad5cfc3
--- /dev/null
+++ b/for-test/badblocks/v4/v4-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch
@@ -0,0 +1,399 @@
+From a7120f4e3a771de6f6c682798b0e9ebf3c6fcb49 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 1 Mar 2021 22:16:10 +0800
+Subject: [PATCH v4 4/6] badblocks: improve badblocks_clear() for multiple
+ ranges handling
+
+With the fundamental ideas and helper routines from the badblocks_set()
+improvement, clearing bad blocks for multiple ranges is much simpler.
+
+With a similar idea from the badblocks_set() improvement, this patch
+simplifies bad block range clearing into 5 situations. No matter how
+complicated the clearing condition is, we just look at the head part
+of the clearing range together with its related already set bad block
+range from the bad block table. The rest will be handled in the next
+run of the while-loop.
+
+Based on existing helpers added from badblocks_set(), this patch adds
+two more helpers,
+- front_clear()
+ Clear the bad block range from bad block table which is front
+ overlapped with the clearing range.
+- front_splitting_clear()
+ Handle the condition that the clearing range hits middle of an
+ already set bad block range from bad block table.
+
+Similar to badblocks_set(), the first part of the clearing range is
+handled with the related bad block range, which is found by
+prev_badblocks(). In most cases a valid hint is provided to
+prev_badblocks() to avoid unnecessary bad block table iteration.
+
+This patch also adds detailed algorithm comments at the beginning of
+badblocks.c, explaining which five simplified situations are
+categorized and how all the bad block range clearing conditions are
+handled by these five situations.
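+
+For illustration only (plain structs instead of the kernel's packed u64
+entries), the splitting arithmetic that front_splitting_clear() performs
+boils down to:
+
+  #include <stdio.h>
+
+  struct range { unsigned long long start, len; };
+
+  /* Clear the middle piece [s, s + sectors) out of the existing bad
+   * range 'e'; the two surviving pieces are returned in 'left' and
+   * 'right'. */
+  static void splitting_clear(struct range e, unsigned long long s,
+                              unsigned long long sectors,
+                              struct range *left, struct range *right)
+  {
+      unsigned long long end = e.start + e.len;
+
+      left->start = e.start;
+      left->len = s - e.start;
+      right->start = s + sectors;
+      right->len = end - (s + sectors);
+  }
+
+  int main(void)
+  {
+      struct range e = { 100, 50 }, l, r;
+
+      splitting_clear(e, 120, 10, &l, &r);    /* clear [120, 130) */
+      printf("%llu+%llu %llu+%llu\n", l.start, l.len, r.start, r.len);
+      return 0;                               /* prints "100+20 130+20" */
+  }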
+
+Again, in order to make the code review easier and avoid mixing the code
+changes together, this patch does not modify badblocks_clear() and
+instead implements another routine called _badblocks_clear() for the
+improvement. A later patch will delete the current code of
+badblocks_clear() and make it a wrapper of _badblocks_clear(), so the
+code change can be much clearer for review.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ block/badblocks.c | 325 ++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 325 insertions(+)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index 13eaad18be15..c188b2e98140 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -330,6 +330,123 @@
+ * avoided. In my test with the hint to prev_badblocks(), except for the first
+ * loop, all rested calls to prev_badblocks() can go into the fast path and
+ * return correct bad blocks table index immediately.
++ *
++ *
++ * Clearing a bad blocks range from the bad block table follows a similar idea
++ * as setting does, but is much simpler. The only thing that needs to be
++ * noticed is that when the clearing range hits the middle of a bad block
++ * range, the existing bad block range will split into two, and one more item
++ * should be added into the bad block table. The simplified situations to be
++ * considered are, (The already set bad block ranges in the bad block table are
++ * named with prefix E, and the clearing bad blocks range is named with prefix
++ * C)
++ *
++ * 1) A clearing range is not overlapped with any already set range in the bad
++ *    block table.
++ * +-----+ | +-----+ | +-----+
++ * | C | | | C | | | C |
++ * +-----+ or +-----+ or +-----+
++ * +---+ | +----+ +----+ | +---+
++ * | E | | | E1 | | E2 | | | E |
++ * +---+ | +----+ +----+ | +---+
++ *    For the above situations, there is no bad block to be cleared and no
++ *    failure happens, so simply return 0.
++ * 2) The clearing range hits the middle of an already set bad blocks range in
++ *    the bad block table.
++ * +---+
++ * | C |
++ * +---+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ * In this situation if the bad block table is not full, the range E will be
++ * split into two ranges E1 and E2. The result is,
++ * +------+ +------+
++ * | E1 | | E2 |
++ * +------+ +------+
++ * 3) The clearing range starts exactly at the same LBA as an already set bad
++ *    block range from the bad block table.
++ * 3.1) Partially covered at head part
++ * +------------+
++ * | C |
++ * +------------+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ *    For this situation, the overlapped already set range will update its
++ *    start LBA to the end of C and shrink its length to BB_LEN(E) - BB_LEN(C).
++ *    No item is deleted from the bad block table. The result is,
++ * +----+
++ * | E1 |
++ * +----+
++ * 3.2) Exact fully covered
++ * +-----------------+
++ * | C |
++ * +-----------------+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ *    For this situation the whole bad blocks range E will be cleared and its
++ *    corresponding item is deleted from the bad block table.
++ * 4) The clearing range exactly ends at same LBA as an already set bad block
++ * range.
++ * +-------+
++ * | C |
++ * +-------+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ * For the above situation, the already set range E is updated to shrink its
++ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C).
++ * The result is,
++ * +---------+
++ * | E |
++ * +---------+
++ * 5) The clearing range is partially overlapped with an already set bad block
++ * range from the bad block table.
++ * 5.1) The already set bad block range is front overlapped with the clearing
++ * range.
++ * +----------+
++ * | C |
++ * +----------+
++ * +------------+
++ * | E |
++ * +------------+
++ *    For such a situation, the clearing range C can be treated as two parts.
++ *    The first part ends at the start LBA of range E, and the second part
++ *    starts at the same LBA as range E.
++ * +----+-----+ +----+ +-----+
++ * | C1 | C2 | | C1 | | C2 |
++ * +----+-----+ ===> +----+ +-----+
++ * +------------+ +------------+
++ * | E | | E |
++ * +------------+ +------------+
++ * Now the first part C1 can be handled as condition 1), and the second part C2 can be
++ * handled as condition 3.1) in next loop.
++ * 5.2) The already set bad block range is behind overlapped with the clearing
++ * range.
++ * +----------+
++ * | C |
++ * +----------+
++ * +------------+
++ * | E |
++ * +------------+
++ *    For such a situation, the clearing range C can be treated as two parts.
++ *    The first part C1 ends at the same end LBA as range E, and the second
++ *    part starts at the end LBA of range E.
++ * +----+-----+ +----+ +-----+
++ * | C1 | C2 | | C1 | | C2 |
++ * +----+-----+ ===> +----+ +-----+
++ * +------------+ +------------+
++ * | E | | E |
++ * +------------+ +------------+
++ * Now the first part clearing range C1 can be handled as condition 4), and
++ * the second part clearing range C2 can be handled as condition 1) in next
++ * loop.
++ *
++ * All bad blocks range clearing can be simplified into the above 5 situations
++ * by only handling the head part of the clearing range in each run of the
++ * while-loop. The idea is similar to bad blocks range setting but much
++ * simpler.
+ */
+
+ /*
+@@ -930,6 +1047,214 @@ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ return rv;
+ }
+
++/*
++ * Clear the bad block range from bad block table which is front overlapped
++ * with the clearing range. The return value is how many sectors from an
++ * already set bad block range are cleared. If the whole bad block range is
++ * covered by the clearing range and fully cleared, 'delete' is set as 1 for
++ * the caller to reduce bb->count.
++ */
++static int front_clear(struct badblocks *bb, int prev,
++ struct badblocks_context *bad, int *deleted)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ u64 *p = bb->page;
++ int cleared = 0;
++
++ *deleted = 0;
++ if (s == BB_OFFSET(p[prev])) {
++ if (BB_LEN(p[prev]) > sectors) {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors,
++ BB_LEN(p[prev]) - sectors,
++ BB_ACK(p[prev]));
++ cleared = sectors;
++ } else {
++ /* BB_LEN(p[prev]) <= sectors */
++ cleared = BB_LEN(p[prev]);
++ if ((prev + 1) < bb->count)
++ memmove(p + prev, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ *deleted = 1;
++ }
++ } else if (s > BB_OFFSET(p[prev])) {
++ if (BB_END(p[prev]) <= (s + sectors)) {
++ cleared = BB_END(p[prev]) - s;
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ s - BB_OFFSET(p[prev]),
++ BB_ACK(p[prev]));
++ } else {
++ /* Splitting is handled in front_splitting_clear() */
++ BUG();
++ }
++ }
++
++ return cleared;
++}
++
++/*
++ * Handle the condition that the clearing range hits middle of an already set
++ * bad block range from bad block table. In this condition the existing bad
++ * block range is split into two after the middle part is cleared.
++ */
++static int front_splitting_clear(struct badblocks *bb, int prev,
++ struct badblocks_context *bad)
++{
++ u64 *p = bb->page;
++ u64 end = BB_END(p[prev]);
++ int ack = BB_ACK(p[prev]);
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ s - BB_OFFSET(p[prev]),
++ ack);
++ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack);
++ return sectors;
++}
++
++/* Do the exact work to clear bad block range from the bad block table */
++static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
++{
++ struct badblocks_context bad;
++ int prev = -1, hint = -1;
++ int len = 0, cleared = 0;
++ int rv = 0;
++ u64 *p;
++
++ if (bb->shift < 0)
++ /* badblocks are disabled */
++ return 1;
++
++ if (sectors == 0)
++ /* Invalid sectors number */
++ return 1;
++
++ if (bb->shift) {
++ sector_t target;
++
++ /* When clearing we round the start up and the end down.
++ * This should not matter as the shift should align with
++ * the block size and no rounding should ever be needed.
++ * However it is better the think a block is bad when it
++ * isn't than to think a block is not bad when it is.
++ */
++ target = s + sectors;
++ roundup(s, bb->shift);
++ rounddown(target, bb->shift);
++ sectors = target - s;
++ }
++
++ write_seqlock_irq(&bb->lock);
++
++ bad.ack = true;
++ p = bb->page;
++
++re_clear:
++ bad.start = s;
++ bad.len = sectors;
++
++ if (badblocks_empty(bb)) {
++ len = sectors;
++ cleared++;
++ goto update_sectors;
++ }
++
++
++ prev = prev_badblocks(bb, &bad, hint);
++
++ /* Start before all badblocks */
++ if (prev < 0) {
++ if (overlap_behind(bb, &bad, 0)) {
++ len = BB_OFFSET(p[0]) - s;
++ hint = prev;
++ } else {
++ len = sectors;
++ }
++ /*
++ * Both situations are to clear non-bad range,
++ * should be treated as successful
++ */
++ cleared++;
++ goto update_sectors;
++ }
++
++ /* Start after all badblocks */
++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
++ len = sectors;
++ cleared++;
++ goto update_sectors;
++ }
++
++ /* Clear will split a bad record but the table is full */
++ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) &&
++ (BB_END(p[prev]) > (bad.start + sectors))) {
++ len = sectors;
++ goto update_sectors;
++ }
++
++ if (overlap_front(bb, prev, &bad)) {
++ if ((BB_OFFSET(p[prev]) < bad.start) &&
++ (BB_END(p[prev]) > (bad.start + bad.len))) {
++ /* Splitting */
++ if ((bb->count + 1) < MAX_BADBLOCKS) {
++ len = front_splitting_clear(bb, prev, &bad);
++ bb->count += 1;
++ cleared++;
++ } else {
++ /* No space to split, give up */
++ len = sectors;
++ }
++ } else {
++ int deleted = 0;
++
++ len = front_clear(bb, prev, &bad, &deleted);
++ bb->count -= deleted;
++ cleared++;
++ hint = prev;
++ }
++
++ goto update_sectors;
++ }
++
++ /* Not front overlap, but behind overlap */
++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
++ len = BB_OFFSET(p[prev + 1]) - bad.start;
++ hint = prev + 1;
++ /* Clear non-bad range should be treated as successful */
++ cleared++;
++ goto update_sectors;
++ }
++
++ /* Not cover any badblocks range in the table */
++ len = sectors;
++ /* Clear non-bad range should be treated as successful */
++ cleared++;
++
++update_sectors:
++ s += len;
++ sectors -= len;
++
++ if (sectors > 0)
++ goto re_clear;
++
++ WARN_ON(sectors < 0);
++
++ if (cleared) {
++ badblocks_update_acked(bb);
++ set_changed(bb);
++ }
++
++ write_sequnlock_irq(&bb->lock);
++
++ if (!cleared)
++ rv = 1;
++
++ return rv;
++}
++
++
+ /**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+--
+2.31.1
+
diff --git a/for-test/badblocks/v4/v4-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch b/for-test/badblocks/v4/v4-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch
new file mode 100644
index 0000000..e519560
--- /dev/null
+++ b/for-test/badblocks/v4/v4-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch
@@ -0,0 +1,175 @@
+From 88b4c165ef9827f0febe7a527faea2a0d99feb66 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 2 Dec 2021 16:13:35 +0800
+Subject: [PATCH v4 5/6] badblocks: improve badblocks_check() for multiple
+ ranges handling
+
+This patch rewrites badblocks_check() with a similar coding style as
+_badblocks_set() and _badblocks_clear(). The only difference is that bad
+block checking can now handle multiple ranges in the bad block table.
+
+If a checking range covers multiple bad block ranges in the bad block
+table, like the following condition (C is the checking range, and E1,
+E2 and E3 are three bad block ranges in the bad block table),
+ +--------------------------------------------+
+ |                      C                     |
+ +--------------------------------------------+
+      +----+        +----+        +----+
+      | E1 |        | E2 |        | E3 |
+      +----+        +----+        +----+
+The improved badblocks_check() algorithm will divide checking range C
+into multiple parts, and handle them in 7 runs of a while-loop,
+ +--+ +----+ +----+ +----+ +----+ +----+ +----+
+ |C1| | C2 | | C3 | | C4 | | C5 | | C6 | | C7 |
+ +--+ +----+ +----+ +----+ +----+ +----+ +----+
+      +----+        +----+        +----+
+      | E1 |        | E2 |        | E3 |
+      +----+        +----+        +----+
+And the start LBA and length of range E1 will be set as first_bad and
+bad_sectors for the caller.
+
+The return value rule is consistent for multiple ranges. For example, if
+there are the following bad block ranges in the bad block table,
+  Index No.     Start        Len          Ack
+       0        400          20           1
+       1        500          50           1
+       2        650          20           0
+the return value, first_bad and bad_sectors from calling badblocks_check()
+with different checking ranges can be the following values,
+  Checking Start, Len       Return Value     first_bad    bad_sectors
+       100, 100                  0              N/A           N/A
+       100, 310                  1              400           10
+       100, 440                  1              400           10
+       100, 540                  1              400           10
+       100, 600                 -1              400           10
+       100, 800                 -1              400           10
+
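+The final return value is simply an aggregation over the pieces that
+were found to overlap bad block ranges; a sketch of the rule (matching
+the tail of _badblocks_check() below), not a drop-in replacement:
+
+  #include <stdio.h>
+
+  /* -1 if any overlapped range is unacknowledged, 1 if only
+   * acknowledged ranges are hit, 0 if nothing bad is hit at all. */
+  static int check_result(int unacked_badblocks, int acked_badblocks)
+  {
+      if (unacked_badblocks > 0)
+          return -1;
+      if (acked_badblocks > 0)
+          return 1;
+      return 0;
+  }
+
+  int main(void)
+  {
+      /* e.g. checking "100, 600" in the table above touches the acked
+       * ranges at 400 and 500 plus the unacked one at 650 */
+      printf("%d\n", check_result(1, 2));     /* prints -1 */
+      return 0;
+  }
+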
+In order to make code review easier, this patch names the improved bad
+block range checking routine _badblocks_check() and does not change the
+existing badblocks_check() code yet. A later patch will delete the old
+code of badblocks_check() and make it a wrapper that calls
+_badblocks_check(). Then the newly added code won't be mixed up with the
+old deleted code, and it will be clearer and easier for code review.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ block/badblocks.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 97 insertions(+)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index c188b2e98140..f16c54925275 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -1254,6 +1254,103 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+ return rv;
+ }
+
++/* Do the exact work to check bad blocks range from the bad block table */
++static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
++ sector_t *first_bad, int *bad_sectors)
++{
++ int unacked_badblocks, acked_badblocks;
++ int prev = -1, hint = -1, set = 0;
++ struct badblocks_context bad;
++ unsigned int seq;
++ int len, rv;
++ u64 *p;
++
++ WARN_ON(bb->shift < 0 || sectors == 0);
++
++ if (bb->shift > 0) {
++ sector_t target;
++
++ /* round the start down, and the end up */
++ target = s + sectors;
++ rounddown(s, bb->shift);
++ roundup(target, bb->shift);
++ sectors = target - s;
++ }
++
++retry:
++ seq = read_seqbegin(&bb->lock);
++
++ p = bb->page;
++ unacked_badblocks = 0;
++ acked_badblocks = 0;
++
++re_check:
++ bad.start = s;
++ bad.len = sectors;
++
++ if (badblocks_empty(bb)) {
++ len = sectors;
++ goto update_sectors;
++ }
++
++ prev = prev_badblocks(bb, &bad, hint);
++
++ /* start after all badblocks */
++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
++ len = sectors;
++ goto update_sectors;
++ }
++
++ if (overlap_front(bb, prev, &bad)) {
++ if (BB_ACK(p[prev]))
++ acked_badblocks++;
++ else
++ unacked_badblocks++;
++
++ if (BB_END(p[prev]) >= (s + sectors))
++ len = sectors;
++ else
++ len = BB_END(p[prev]) - s;
++
++ if (set == 0) {
++ *first_bad = BB_OFFSET(p[prev]);
++ *bad_sectors = BB_LEN(p[prev]);
++ set = 1;
++ }
++ goto update_sectors;
++ }
++
++ /* Not front overlap, but behind overlap */
++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
++ len = BB_OFFSET(p[prev + 1]) - bad.start;
++ hint = prev + 1;
++ goto update_sectors;
++ }
++
++ /* not cover any badblocks range in the table */
++ len = sectors;
++
++update_sectors:
++ s += len;
++ sectors -= len;
++
++ if (sectors > 0)
++ goto re_check;
++
++ WARN_ON(sectors < 0);
++
++ if (unacked_badblocks > 0)
++ rv = -1;
++ else if (acked_badblocks > 0)
++ rv = 1;
++ else
++ rv = 0;
++
++ if (read_seqretry(&bb->lock, seq))
++ goto retry;
++
++ return rv;
++}
+
+ /**
+ * badblocks_check() - check a given range for bad sectors
+--
+2.31.1
+
diff --git a/for-test/badblocks/v4/v4-0006-badblocks-switch-to-the-improved-badblock-handlin.patch b/for-test/badblocks/v4/v4-0006-badblocks-switch-to-the-improved-badblock-handlin.patch
new file mode 100644
index 0000000..17b7597
--- /dev/null
+++ b/for-test/badblocks/v4/v4-0006-badblocks-switch-to-the-improved-badblock-handlin.patch
@@ -0,0 +1,365 @@
+From 839dec5ce2a8e6fae537d8eaa5bc4c7ae89e8a49 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 2 Mar 2021 10:48:43 +0800
+Subject: [PATCH v4 6/6] badblocks: switch to the improved badblock handling
+ code
+
+This patch removes the old code of badblocks_set(), badblocks_clear() and
+badblocks_check(), and makes them wrappers that call _badblocks_set(),
+_badblocks_clear() and _badblocks_check().
+
+With this change the badblock handling now switches to the improved
+algorithms in _badblocks_set(), _badblocks_clear() and _badblocks_check().
+
+This patch only contains the old code deletion; the new code for the
+improved algorithms is in the previous patches.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ block/badblocks.c | 310 +---------------------------------------------
+ 1 file changed, 3 insertions(+), 307 deletions(-)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index f16c54925275..4838750811ca 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -1389,75 +1389,7 @@ static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+ {
+- int hi;
+- int lo;
+- u64 *p = bb->page;
+- int rv;
+- sector_t target = s + sectors;
+- unsigned seq;
+-
+- if (bb->shift > 0) {
+- /* round the start down, and the end up */
+- s >>= bb->shift;
+- target += (1<<bb->shift) - 1;
+- target >>= bb->shift;
+- sectors = target - s;
+- }
+- /* 'target' is now the first block after the bad range */
+-
+-retry:
+- seq = read_seqbegin(&bb->lock);
+- lo = 0;
+- rv = 0;
+- hi = bb->count;
+-
+- /* Binary search between lo and hi for 'target'
+- * i.e. for the last range that starts before 'target'
+- */
+- /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
+- * are known not to be the last range before target.
+- * VARIANT: hi-lo is the number of possible
+- * ranges, and decreases until it reaches 1
+- */
+- while (hi - lo > 1) {
+- int mid = (lo + hi) / 2;
+- sector_t a = BB_OFFSET(p[mid]);
+-
+- if (a < target)
+- /* This could still be the one, earlier ranges
+- * could not.
+- */
+- lo = mid;
+- else
+- /* This and later ranges are definitely out. */
+- hi = mid;
+- }
+- /* 'lo' might be the last that started before target, but 'hi' isn't */
+- if (hi > lo) {
+- /* need to check all range that end after 's' to see if
+- * any are unacknowledged.
+- */
+- while (lo >= 0 &&
+- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+- if (BB_OFFSET(p[lo]) < target) {
+- /* starts before the end, and finishes after
+- * the start, so they must overlap
+- */
+- if (rv != -1 && BB_ACK(p[lo]))
+- rv = 1;
+- else
+- rv = -1;
+- *first_bad = BB_OFFSET(p[lo]);
+- *bad_sectors = BB_LEN(p[lo]);
+- }
+- lo--;
+- }
+- }
+-
+- if (read_seqretry(&bb->lock, seq))
+- goto retry;
+-
+- return rv;
++ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
+ }
+ EXPORT_SYMBOL_GPL(badblocks_check);
+
+@@ -1479,154 +1411,7 @@ EXPORT_SYMBOL_GPL(badblocks_check);
+ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
+ {
+- u64 *p;
+- int lo, hi;
+- int rv = 0;
+- unsigned long flags;
+-
+- if (bb->shift < 0)
+- /* badblocks are disabled */
+- return 1;
+-
+- if (bb->shift) {
+- /* round the start down, and the end up */
+- sector_t next = s + sectors;
+-
+- s >>= bb->shift;
+- next += (1<<bb->shift) - 1;
+- next >>= bb->shift;
+- sectors = next - s;
+- }
+-
+- write_seqlock_irqsave(&bb->lock, flags);
+-
+- p = bb->page;
+- lo = 0;
+- hi = bb->count;
+- /* Find the last range that starts at-or-before 's' */
+- while (hi - lo > 1) {
+- int mid = (lo + hi) / 2;
+- sector_t a = BB_OFFSET(p[mid]);
+-
+- if (a <= s)
+- lo = mid;
+- else
+- hi = mid;
+- }
+- if (hi > lo && BB_OFFSET(p[lo]) > s)
+- hi = lo;
+-
+- if (hi > lo) {
+- /* we found a range that might merge with the start
+- * of our new range
+- */
+- sector_t a = BB_OFFSET(p[lo]);
+- sector_t e = a + BB_LEN(p[lo]);
+- int ack = BB_ACK(p[lo]);
+-
+- if (e >= s) {
+- /* Yes, we can merge with a previous range */
+- if (s == a && s + sectors >= e)
+- /* new range covers old */
+- ack = acknowledged;
+- else
+- ack = ack && acknowledged;
+-
+- if (e < s + sectors)
+- e = s + sectors;
+- if (e - a <= BB_MAX_LEN) {
+- p[lo] = BB_MAKE(a, e-a, ack);
+- s = e;
+- } else {
+- /* does not all fit in one range,
+- * make p[lo] maximal
+- */
+- if (BB_LEN(p[lo]) != BB_MAX_LEN)
+- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
+- s = a + BB_MAX_LEN;
+- }
+- sectors = e - s;
+- }
+- }
+- if (sectors && hi < bb->count) {
+- /* 'hi' points to the first range that starts after 's'.
+- * Maybe we can merge with the start of that range
+- */
+- sector_t a = BB_OFFSET(p[hi]);
+- sector_t e = a + BB_LEN(p[hi]);
+- int ack = BB_ACK(p[hi]);
+-
+- if (a <= s + sectors) {
+- /* merging is possible */
+- if (e <= s + sectors) {
+- /* full overlap */
+- e = s + sectors;
+- ack = acknowledged;
+- } else
+- ack = ack && acknowledged;
+-
+- a = s;
+- if (e - a <= BB_MAX_LEN) {
+- p[hi] = BB_MAKE(a, e-a, ack);
+- s = e;
+- } else {
+- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
+- s = a + BB_MAX_LEN;
+- }
+- sectors = e - s;
+- lo = hi;
+- hi++;
+- }
+- }
+- if (sectors == 0 && hi < bb->count) {
+- /* we might be able to combine lo and hi */
+- /* Note: 's' is at the end of 'lo' */
+- sector_t a = BB_OFFSET(p[hi]);
+- int lolen = BB_LEN(p[lo]);
+- int hilen = BB_LEN(p[hi]);
+- int newlen = lolen + hilen - (s - a);
+-
+- if (s >= a && newlen < BB_MAX_LEN) {
+- /* yes, we can combine them */
+- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
+-
+- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
+- memmove(p + hi, p + hi + 1,
+- (bb->count - hi - 1) * 8);
+- bb->count--;
+- }
+- }
+- while (sectors) {
+- /* didn't merge (it all).
+- * Need to add a range just before 'hi'
+- */
+- if (bb->count >= MAX_BADBLOCKS) {
+- /* No room for more */
+- rv = 1;
+- break;
+- } else {
+- int this_sectors = sectors;
+-
+- memmove(p + hi + 1, p + hi,
+- (bb->count - hi) * 8);
+- bb->count++;
+-
+- if (this_sectors > BB_MAX_LEN)
+- this_sectors = BB_MAX_LEN;
+- p[hi] = BB_MAKE(s, this_sectors, acknowledged);
+- sectors -= this_sectors;
+- s += this_sectors;
+- }
+- }
+-
+- bb->changed = 1;
+- if (!acknowledged)
+- bb->unacked_exist = 1;
+- else
+- badblocks_update_acked(bb);
+- write_sequnlock_irqrestore(&bb->lock, flags);
+-
+- return rv;
++ return _badblocks_set(bb, s, sectors, acknowledged);
+ }
+ EXPORT_SYMBOL_GPL(badblocks_set);
+
+@@ -1646,96 +1431,7 @@ EXPORT_SYMBOL_GPL(badblocks_set);
+ */
+ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+ {
+- u64 *p;
+- int lo, hi;
+- sector_t target = s + sectors;
+- int rv = 0;
+-
+- if (bb->shift > 0) {
+- /* When clearing we round the start up and the end down.
+- * This should not matter as the shift should align with
+- * the block size and no rounding should ever be needed.
+- * However it is better the think a block is bad when it
+- * isn't than to think a block is not bad when it is.
+- */
+- s += (1<<bb->shift) - 1;
+- s >>= bb->shift;
+- target >>= bb->shift;
+- sectors = target - s;
+- }
+-
+- write_seqlock_irq(&bb->lock);
+-
+- p = bb->page;
+- lo = 0;
+- hi = bb->count;
+- /* Find the last range that starts before 'target' */
+- while (hi - lo > 1) {
+- int mid = (lo + hi) / 2;
+- sector_t a = BB_OFFSET(p[mid]);
+-
+- if (a < target)
+- lo = mid;
+- else
+- hi = mid;
+- }
+- if (hi > lo) {
+- /* p[lo] is the last range that could overlap the
+- * current range. Earlier ranges could also overlap,
+- * but only this one can overlap the end of the range.
+- */
+- if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
+- (BB_OFFSET(p[lo]) < target)) {
+- /* Partial overlap, leave the tail of this range */
+- int ack = BB_ACK(p[lo]);
+- sector_t a = BB_OFFSET(p[lo]);
+- sector_t end = a + BB_LEN(p[lo]);
+-
+- if (a < s) {
+- /* we need to split this range */
+- if (bb->count >= MAX_BADBLOCKS) {
+- rv = -ENOSPC;
+- goto out;
+- }
+- memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+- bb->count++;
+- p[lo] = BB_MAKE(a, s-a, ack);
+- lo++;
+- }
+- p[lo] = BB_MAKE(target, end - target, ack);
+- /* there is no longer an overlap */
+- hi = lo;
+- lo--;
+- }
+- while (lo >= 0 &&
+- (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
+- (BB_OFFSET(p[lo]) < target)) {
+- /* This range does overlap */
+- if (BB_OFFSET(p[lo]) < s) {
+- /* Keep the early parts of this range. */
+- int ack = BB_ACK(p[lo]);
+- sector_t start = BB_OFFSET(p[lo]);
+-
+- p[lo] = BB_MAKE(start, s - start, ack);
+- /* now low doesn't overlap, so.. */
+- break;
+- }
+- lo--;
+- }
+- /* 'lo' is strictly before, 'hi' is strictly after,
+- * anything between needs to be discarded
+- */
+- if (hi - lo > 1) {
+- memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
+- bb->count -= (hi - lo - 1);
+- }
+- }
+-
+- badblocks_update_acked(bb);
+- bb->changed = 1;
+-out:
+- write_sequnlock_irq(&bb->lock);
+- return rv;
++ return _badblocks_clear(bb, s, sectors);
+ }
+ EXPORT_SYMBOL_GPL(badblocks_clear);
+
+--
+2.31.1
+
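Both of the removed implementations above begin by rounding the request to the bb->shift granularity, and set and clear round in opposite directions. The standalone sketch below (plain user-space C, not kernel code, with made-up sample values) mirrors just that arithmetic: setting rounds the range outward so a partially bad block is still recorded as bad, while clearing rounds inward so a partially cleared block stays recorded as bad; as in the removed code, the results are expressed in units of 1 << shift sectors.

    /*
     * Standalone sketch (not kernel code): the sector rounding applied by
     * badblocks_set() and badblocks_clear() when bb->shift > 0. Sample
     * values are made up.
     */
    #include <stdio.h>

    typedef unsigned long long sector_t;

    static void round_for_set(sector_t *s, int *sectors, int shift)
    {
            sector_t next = *s + *sectors;

            *s >>= shift;                   /* round the start down */
            next += (1ULL << shift) - 1;    /* round the end up */
            next >>= shift;
            *sectors = next - *s;
    }

    static void round_for_clear(sector_t *s, int *sectors, int shift)
    {
            sector_t target = *s + *sectors;

            *s += (1ULL << shift) - 1;      /* round the start up */
            *s >>= shift;
            target >>= shift;               /* round the end down */
            *sectors = target - *s;
    }

    int main(void)
    {
            sector_t s = 10;
            int sectors = 20, shift = 3;    /* 8-sector blocks, example only */

            round_for_set(&s, &sectors, shift);
            printf("set:   start %llu, len %d\n", s, sectors);  /* 1, 3 */

            s = 10; sectors = 20;
            round_for_clear(&s, &sectors, shift);
            printf("clear: start %llu, len %d\n", s, sectors);  /* 2, 1 */

            return 0;
    }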
diff --git a/for-test/badblocks/v2/v2-0007-test-user-space-code-to-test-badblocks-APIs.patch b/for-test/badblocks/v4/v4-0007-test-user-space-code-to-test-badblocks-APIs.patch
index 091d4d3..c354234 100644
--- a/for-test/badblocks/v2/v2-0007-test-user-space-code-to-test-badblocks-APIs.patch
+++ b/for-test/badblocks/v4/v4-0007-test-user-space-code-to-test-badblocks-APIs.patch
@@ -255,19 +255,19 @@ index 0000000..ca52647
+ *
+ * When the caller of badblocks_set() wants to set a range of bad blocks, the
+ * setting range can be acked or unacked. And the setting range may merge,
-+ * overwrite, skip the overlaypped already set range, depends on who they are
++ * overwrite, skip the overlapped already set range, depends on who they are
+ * overlapped or adjacent, and the acknowledgment type of the ranges. It can be
+ * more complicated when the setting range covers multiple already set bad block
-+ * ranges, with restritctions of maximum length of each bad range and the bad
++ * ranges, with restrictions of maximum length of each bad range and the bad
+ * table space limitation.
+ *
-+ * It is difficut and unnecessary to take care of all the possible situations,
++ * It is difficult and unnecessary to take care of all the possible situations,
+ * for setting a large range of bad blocks, we can handle it by dividing the
+ * large range into smaller ones when encounter overlap, max range length or
+ * bad table full conditions. Every time only a smaller piece of the bad range
+ * is handled with a limited number of conditions how it is interacted with
+ * possible overlapped or adjacent already set bad block ranges. Then the hard
-+ * complicated problem can be much simpler to habndle in proper way.
++ * complicated problem can be much simpler to handle in proper way.
+ *
+ * When setting a range of bad blocks to the bad table, the simplified situations
+ * to be considered are, (The already set bad blocks ranges are naming with
@@ -301,12 +301,12 @@ index 0000000..ca52647
+ * +-------------+
+ * | S |
+ * +-------------+
-+ * 2.1.2) If S is uncked setting and E is acked, the setting will be dinied, and
++ * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and
+ * the result is,
+ * +-------------+
+ * | E |
+ * +-------------+
-+ * 2.1.3) If S is acked setting and E is unacked, range S can overwirte on E.
++ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E.
+ * An extra slot from the bad blocks table will be allocated for S, and head
+ * of E will move to end of the inserted range E. The result is,
+ * +--------+----+
@@ -318,12 +318,12 @@ index 0000000..ca52647
+ * +-------------+
+ * | S |
+ * +-------------+
-+ * 2.2.2) If S is uncked setting and E is acked, the setting will be dinied, and
++ * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and
+ * the result is,
+ * +-------------+
+ * | E |
+ * +-------------+
-+ * 2.2.3) If S is acked setting and E is unacked, range S can overwirte all of
++ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of
+ bad blocks range E. The result is,
+ * +-------------+
+ * | S |
@@ -378,7 +378,7 @@ index 0000000..ca52647
+ * +-------------+
+ * | E |
+ * +-------------+
-+ * 4.1.1) If range S and E have same ackknowledg value (both acked or unacked),
++ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked),
+ * they will be merged into one, the result is,
+ * +-------------+
+ * | S |
@@ -401,7 +401,7 @@ index 0000000..ca52647
+ * +--------------+
+ * | E |
+ * +--------------+
-+ * 4.2.1) If range S and E have same ackknowledg value (both acked or unacked),
++ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked),
+ * they will be merged into one, the result is,
+ * +--------------+
+ * | S |
@@ -504,7 +504,7 @@ index 0000000..ca52647
+ * +-------+-------------+ E1: 1
+ * | E1 | E2 | E2: 0
+ * +-------+-------------+
-+ * With previosu simplified routines, after overwiting part of E2 with S,
++ * With previous simplified routines, after overwriting part of E2 with S,
+ * the bad blocks table should be (E3 is remaining part of E2 which is not
+ * overwritten by S),
+ * acknowledged
@@ -514,7 +514,7 @@ index 0000000..ca52647
+ * The above result is correct but not perfect. Range E1 and S in the bad
+ * blocks table are all acked, merging them into a larger one range may
+ * occupy less bad blocks table space and make badblocks_check() faster.
-+ * Therefore in such situation, after overwiting range S, the previous range
++ * Therefore in such situation, after overwriting range S, the previous range
+ * E1 should be checked for possible front combination. Then the ideal
+ * result can be,
+ * +----------------+----+ acknowledged
@@ -533,7 +533,7 @@ index 0000000..ca52647
+ * +-------+
+ * | E |
+ * +-------+
-+ * For the above special stiuation, when the setting range S are all handled
++ * For the above special situation, when the setting range S are all handled
+ * and the loop ends, an extra check is necessary for whether next already
+ * set range E is right after S and mergeable.
+ * 6.2.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge
@@ -543,7 +543,7 @@ index 0000000..ca52647
+ * | S |
+ * +--------------+
+ * 6.2.2) Otherwise these two ranges cannot merge, just insert the setting range
-+ * S infront of the already set range E in the bad blocks table. The result
++ * S in front of the already set range E in the bad blocks table. The result
+ * is,
+ * +------+-------+
+ * | S | E |
@@ -567,9 +567,9 @@ index 0000000..ca52647
+ *
+ * Clearing a bad blocks range from the bad block table has similar idea as
+ * setting does, but much more simpler. The only thing needs to be noticed is
-+ * when the clearning range hits middle of a bad block range, the existing bad
++ * when the clearing range hits middle of a bad block range, the existing bad
+ * block range will split into two, and one more item should be added into the
-+ * bad block table. The simplified situations to beconsidered are, (The already
++ * bad block table. The simplified situations to be considered are, (The already
+ * set bad blocks ranges in bad block table are naming with prefix E, and the
+ * clearing bad blocks range is naming with prefix C)
+ *
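The hunks above fix typos in the comment that describes the clearing situations; the one structural point in that description is that a clear landing in the middle of a set range splits it in two and consumes one extra table slot. The sketch below (plain user-space C with made-up values, using a simple struct rather than the kernel's packed u64 records) illustrates just that split.

    /*
     * Standalone sketch (not kernel code): clearing C from inside an
     * existing range E leaves two ranges E1 and E2, costing one more
     * slot in the bad block table.
     */
    #include <stdio.h>

    struct range {
            unsigned long long start;
            unsigned long long len;
            int ack;
    };

    /* Split 'e' around the cleared span [c_start, c_start + c_len). */
    static void split_clear(struct range e, unsigned long long c_start,
                            unsigned long long c_len,
                            struct range *e1, struct range *e2)
    {
            e1->start = e.start;
            e1->len = c_start - e.start;
            e1->ack = e.ack;

            e2->start = c_start + c_len;
            e2->len = (e.start + e.len) - e2->start;
            e2->ack = e.ack;
    }

    int main(void)
    {
            struct range e = { 100, 50, 1 };        /* made-up E = [100, 150) */
            struct range e1, e2;

            split_clear(e, 120, 10, &e1, &e2);      /* clear C = [120, 130) */

            printf("E1: %llu +%llu ack=%d\n", e1.start, e1.len, e1.ack);
            printf("E2: %llu +%llu ack=%d\n", e2.start, e2.len, e2.ack);
            return 0;
    }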
diff --git a/for-test/badblocks/v5/v5-0000-cover-letter.patch b/for-test/badblocks/v5/v5-0000-cover-letter.patch
new file mode 100644
index 0000000..efd498c
--- /dev/null
+++ b/for-test/badblocks/v5/v5-0000-cover-letter.patch
@@ -0,0 +1,70 @@
+From d1f471dc0f862dfc71d3bbebc60631f83208217f Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Fri, 10 Dec 2021 15:27:33 +0800
+Subject: [PATCH v5 0/7] badblocks improvement for multiple bad block ranges
+
+Hi folks,
+
+This is the v5 effort to improve badblocks code APIs to handle multiple
+ranges in bad block table.
+
+Compared to the previous v4 series, the changes in v5 include,
+- Fix typos in code comments, pointed out by Geliang Tang and
+  Wols Lists.
+- Drop extra local variables in helper routines, as suggested by
+  Geliang Tang.
+- Update the user space testing code to match all the above changes.
+
+There is NO in-memory or on-disk format change in the whole series; all
+existing APIs and data structures are unchanged. This series only
+improves the code algorithm to handle more corner cases, and the
+interfaces stay the same and consistent for all existing callers (md
+raid and nvdimm drivers).
+
+The original motivation of the change comes from a requirement of our
+customer: the current badblocks routines don't handle multiple ranges.
+For example, if the bad block setting range covers multiple ranges from
+the bad block table, only the first two bad block ranges are merged and
+the rest are left intact. The expected behavior is that all the covered
+ranges are handled.
+
+All the patches are tested with modified user space code and the code
+logic works as expected. The modified user space testing code is
+provided in the last patch, as an example of how the improved code is
+tested.
+
+The whole change is divided into 6 patches to make the code review
+clearer and easier. If people prefer, I'd like to post a single large
+patch once the code review is accomplished.
+
+Please review the code and respond. Thank you all in advance.
+
+Coly Li
+
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Richard Fan <richard.fan@suse.com>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+Cc: Wols Lists <antlists@youngman.org.uk>
+---
+
+Coly Li (6):
+ badblocks: add more helper structure and routines in badblocks.h
+ badblocks: add helper routines for badblock ranges handling
+ badblocks: improve badblocks_set() for multiple ranges handling
+ badblocks: improve badblocks_clear() for multiple ranges handling
+ badblocks: improve badblocks_check() for multiple ranges handling
+ badblocks: switch to the improved badblock handling code
+Coly Li (1):
+ test: user space code to test badblocks APIs
+
+ block/badblocks.c | 1604 ++++++++++++++++++++++++++++++-------
+ include/linux/badblocks.h | 30 +
+ 2 files changed, 1339 insertions(+), 295 deletions(-)
+
+--
+2.31.1
+
diff --git a/for-test/badblocks/v5/v5-0001-badblocks-add-more-helper-structure-and-routines-.patch b/for-test/badblocks/v5/v5-0001-badblocks-add-more-helper-structure-and-routines-.patch
new file mode 100644
index 0000000..d66b0c8
--- /dev/null
+++ b/for-test/badblocks/v5/v5-0001-badblocks-add-more-helper-structure-and-routines-.patch
@@ -0,0 +1,91 @@
+From d5352d6d537923232aa274cc753366a7851a1f13 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 2 Dec 2021 15:29:38 +0800
+Subject: [PATCH v5 1/6] badblocks: add more helper structure and routines in
+ badblocks.h
+
+This patch adds the following helper structure and routines into
+badblocks.h,
+- struct badblocks_context
+ This structure is used in improved badblocks code for bad table
+ iteration.
+- BB_END()
+ The macro to calculate end LBA of a bad range record from bad
+ table.
+- badblocks_full() and badblocks_empty()
+ The inline routines to check whether bad table is full or empty.
+- set_changed() and clear_changed()
+ The inline routines to set and clear 'changed' tag from struct
+ badblocks.
+
+The new helper structure and routines help to make the code clearer;
+they will be used by the improved badblocks code in the following
+patches.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ include/linux/badblocks.h | 30 ++++++++++++++++++++++++++++++
+ 1 file changed, 30 insertions(+)
+
+diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h
+index 2426276b9bd3..670f2dae692f 100644
+--- a/include/linux/badblocks.h
++++ b/include/linux/badblocks.h
+@@ -15,6 +15,7 @@
+ #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
+ #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
+ #define BB_ACK(x) (!!((x) & BB_ACK_MASK))
++#define BB_END(x) (BB_OFFSET(x) + BB_LEN(x))
+ #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
+
+ /* Bad block numbers are stored sorted in a single page.
+@@ -41,6 +42,12 @@ struct badblocks {
+ sector_t size; /* in sectors */
+ };
+
++struct badblocks_context {
++ sector_t start;
++ sector_t len;
++ int ack;
++};
++
+ int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors);
+ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+@@ -63,4 +70,27 @@ static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb)
+ }
+ badblocks_exit(bb);
+ }
++
++static inline int badblocks_full(struct badblocks *bb)
++{
++ return (bb->count >= MAX_BADBLOCKS);
++}
++
++static inline int badblocks_empty(struct badblocks *bb)
++{
++ return (bb->count == 0);
++}
++
++static inline void set_changed(struct badblocks *bb)
++{
++ if (bb->changed != 1)
++ bb->changed = 1;
++}
++
++static inline void clear_changed(struct badblocks *bb)
++{
++ if (bb->changed != 0)
++ bb->changed = 0;
++}
++
+ #endif
+--
+2.31.1
+
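The helpers above all operate on the packed 64-bit records stored in the badblocks page. The standalone sketch below re-creates the encode/decode macros from the hunk, including the new BB_END(); the three mask values are assumptions derived from the BB_MAKE() layout (start sector in bits 9-62, length minus one in bits 0-8, ack flag in bit 63) and are not themselves shown in the hunk.

    /* Standalone sketch: encode/decode one bad-range record as badblocks.h does. */
    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t u64;

    /* Assumed masks, derived from the BB_MAKE() layout shown above. */
    #define BB_OFFSET_MASK  (0x7FFFFFFFFFFFFE00ULL) /* bits 9..62: start sector */
    #define BB_LEN_MASK     (0x00000000000001FFULL) /* bits 0..8: length - 1    */
    #define BB_ACK_MASK     (0x8000000000000000ULL) /* bit 63: acknowledged     */

    #define BB_OFFSET(x)    (((x) & BB_OFFSET_MASK) >> 9)
    #define BB_LEN(x)       (((x) & BB_LEN_MASK) + 1)
    #define BB_ACK(x)       (!!((x) & BB_ACK_MASK))
    #define BB_END(x)       (BB_OFFSET(x) + BB_LEN(x))
    #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))

    int main(void)
    {
            /* Made-up range: 8 bad sectors starting at sector 1024, acked. */
            u64 rec = BB_MAKE(1024ULL, 8, 1);

            printf("offset=%llu len=%llu ack=%d end=%llu\n",
                   (unsigned long long)BB_OFFSET(rec),
                   (unsigned long long)BB_LEN(rec),
                   (int)BB_ACK(rec),
                   (unsigned long long)BB_END(rec));
            return 0;
    }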
diff --git a/for-test/badblocks/v5/v5-0002-badblocks-add-helper-routines-for-badblock-ranges.patch b/for-test/badblocks/v5/v5-0002-badblocks-add-helper-routines-for-badblock-ranges.patch
new file mode 100644
index 0000000..fc084aa
--- /dev/null
+++ b/for-test/badblocks/v5/v5-0002-badblocks-add-helper-routines-for-badblock-ranges.patch
@@ -0,0 +1,459 @@
+From 2accaa280961524bc5eea98399906d199eea2568 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 1 Mar 2021 17:16:57 +0800
+Subject: [PATCH v5 2/6] badblocks: add helper routines for badblock ranges
+ handling
+
+This patch adds several helper routines to improve badblock ranges
+handling. These helper routines will be used later in the improved
+version of badblocks_set()/badblocks_clear()/badblocks_check().
+
+- Helpers prev_by_hint() and prev_badblocks() are used to find the bad
+  range in the bad table which the searching range starts at or after.
+
+- The following helpers are to decide the relative layout between the
+ manipulating range and existing bad block range from bad table.
+ - can_merge_behind()
+ Return 'true' if the manipulating range can backward merge with the
+ bad block range.
+ - can_merge_front()
+ Return 'true' if the manipulating range can forward merge with the
+ bad block range.
+ - can_combine_front()
+ Return 'true' if two adjacent bad block ranges before the
+ manipulating range can be merged.
+ - overlap_front()
+ Return 'true' if the manipulating range exactly overlaps with the
+ bad block range in front of its range.
+ - overlap_behind()
+ Return 'true' if the manipulating range exactly overlaps with the
+ bad block range behind its range.
+ - can_front_overwrite()
+ Return 'true' if the manipulating range can forward overwrite the
+ bad block range in front of its range.
+
+- The following helpers are to add the manipulating range into the bad
+ block table. Different routine is called with the specific relative
+ layout between the manipulating range and other bad block range in the
+ bad block table.
+ - behind_merge()
+ Merge the manipulating range with the bad block range behind its
+ range, and return the number of merged length in unit of sector.
+ - front_merge()
+ Merge the manipulating range with the bad block range in front of
+ its range, and return the number of merged length in unit of sector.
+ - front_combine()
+ Combine the two adjacent bad block ranges before the manipulating
+ range into a larger one.
+ - front_overwrite()
+ Overwrite partial of whole bad block range which is in front of the
+ manipulating range. The overwrite may split existing bad block range
+ and generate more bad block ranges into the bad block table.
+ - insert_at()
+ Insert the manipulating range at a specific location in the bad
+ block table.
+
+All the above helpers are used in later patches to improve the bad block
+ranges handling for badblocks_set()/badblocks_clear()/badblocks_check().
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ block/badblocks.c | 376 ++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 376 insertions(+)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index d39056630d9c..30958cc4469f 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -16,6 +16,382 @@
+ #include <linux/types.h>
+ #include <linux/slab.h>
+
++/*
++ * Find the range that starts at-or-before 's' in the bad table. The
++ * search starts from index 'hint' and stops at index 'hint_end' of the
++ * bad table.
++ */
++static int prev_by_hint(struct badblocks *bb, sector_t s, int hint)
++{
++ int hint_end = hint + 2;
++ u64 *p = bb->page;
++ int ret = -1;
++
++ while ((hint < hint_end) && ((hint + 1) <= bb->count) &&
++ (BB_OFFSET(p[hint]) <= s)) {
++ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) {
++ ret = hint;
++ break;
++ }
++ hint++;
++ }
++
++ return ret;
++}
++
++/*
++ * Find the range that starts at-or-before bad->start. If 'hint' is
++ * provided (hint >= 0) then search the bad table from 'hint' first. It
++ * is very likely that the wanted bad range can be found from the hint
++ * index, so the unnecessary while-loop iteration can be avoided.
++ */
++static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad,
++ int hint)
++{
++ sector_t s = bad->start;
++ int ret = -1;
++ int lo, hi;
++ u64 *p;
++
++ if (!bb->count)
++ goto out;
++
++ if (hint >= 0) {
++ ret = prev_by_hint(bb, s, hint);
++ if (ret >= 0)
++ goto out;
++ }
++
++ lo = 0;
++ hi = bb->count;
++ p = bb->page;
++
++ while (hi - lo > 1) {
++ int mid = (lo + hi)/2;
++ sector_t a = BB_OFFSET(p[mid]);
++
++ if (a <= s)
++ lo = mid;
++ else
++ hi = mid;
++ }
++
++ if (BB_OFFSET(p[lo]) <= s)
++ ret = lo;
++out:
++ return ret;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' can be backward merged
++ * with the bad range (from the bad table) indexed by 'behind'.
++ */
++static bool can_merge_behind(struct badblocks *bb, struct badblocks_context *bad,
++ int behind)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ u64 *p = bb->page;
++
++ if ((s <= BB_OFFSET(p[behind])) &&
++ ((s + sectors) >= BB_OFFSET(p[behind])) &&
++ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) &&
++ BB_ACK(p[behind]) == bad->ack)
++ return true;
++ return false;
++}
++
++/*
++ * Do backward merge for range indicated by 'bad' and the bad range
++ * (from the bad table) indexed by 'behind'. The return value is merged
++ * sectors from bad->len.
++ */
++static int behind_merge(struct badblocks *bb, struct badblocks_context *bad,
++ int behind)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ u64 *p = bb->page;
++ int merged = 0;
++
++ WARN_ON(s > BB_OFFSET(p[behind]));
++ WARN_ON((s + sectors) < BB_OFFSET(p[behind]));
++
++ if (s < BB_OFFSET(p[behind])) {
++ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN);
++
++ merged = min_t(sector_t, sectors, BB_OFFSET(p[behind]) - s);
++ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack);
++ } else {
++ merged = min_t(sector_t, sectors, BB_LEN(p[behind]));
++ }
++
++ WARN_ON(merged == 0);
++
++ return merged;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' can be forward
++ * merged with the bad range (from the bad table) indexed by 'prev'.
++ */
++static bool can_merge_front(struct badblocks *bb, int prev,
++ struct badblocks_context *bad)
++{
++ sector_t s = bad->start;
++ u64 *p = bb->page;
++
++ if (BB_ACK(p[prev]) == bad->ack &&
++ (s < BB_END(p[prev]) ||
++ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN))))
++ return true;
++ return false;
++}
++
++/*
++ * Do forward merge for range indicated by 'bad' and the bad range
++ * (from bad table) indexed by 'prev'. The return value is sectors
++ * merged from bad->len.
++ */
++static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ u64 *p = bb->page;
++ int merged = 0;
++
++ WARN_ON(s > BB_END(p[prev]));
++
++ if (s < BB_END(p[prev])) {
++ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s);
++ } else {
++ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev]));
++ if ((prev + 1) < bb->count &&
++ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) {
++ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]);
++ }
++
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ BB_LEN(p[prev]) + merged, bad->ack);
++ }
++
++ return merged;
++}
++
++/*
++ * 'Combine' is a special case which can_merge_front() is not able to
++ * handle: if a bad range (indexed by 'prev' from bad table) starts
++ * exactly at bad->start, and the bad range ahead of 'prev' (indexed by
++ * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and
++ * the sum of their lengths does not exceed BB_MAX_LEN limitation, then
++ * these two bad range (from bad table) can be combined.
++ *
++ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad
++ * table can be combined.
++ */
++static bool can_combine_front(struct badblocks *bb, int prev,
++ struct badblocks_context *bad)
++{
++ u64 *p = bb->page;
++
++ if ((prev > 0) &&
++ (BB_OFFSET(p[prev]) == bad->start) &&
++ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) &&
++ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) &&
++ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev])))
++ return true;
++ return false;
++}
++
++/*
++ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad
++ * table) into one larger bad range, and the new range is indexed by
++ * 'prev - 1'.
++ */
++static void front_combine(struct badblocks *bb, int prev)
++{
++ u64 *p = bb->page;
++
++ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]),
++ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]),
++ BB_ACK(p[prev]));
++ if ((prev + 1) < bb->count)
++ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8);
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' is exactly forward
++ * overlapped with the bad range (from bad table) indexed by 'front'.
++ * Exactly forward overlap means the bad range (from bad table) indexed
++ * by 'front' does not cover the whole range indicated by 'bad'.
++ */
++static bool overlap_front(struct badblocks *bb, int front,
++ struct badblocks_context *bad)
++{
++ u64 *p = bb->page;
++
++ if (bad->start >= BB_OFFSET(p[front]) &&
++ bad->start < BB_END(p[front]))
++ return true;
++ return false;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' is exactly backward
++ * overlapped with the bad range (from bad table) indexed by 'behind'.
++ */
++static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad,
++ int behind)
++{
++ u64 *p = bb->page;
++
++ if (bad->start < BB_OFFSET(p[behind]) &&
++ (bad->start + bad->len) > BB_OFFSET(p[behind]))
++ return true;
++ return false;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' can overwrite the bad
++ * range (from bad table) indexed by 'prev'.
++ *
++ * The range indicated by 'bad' can overwrite the bad range indexed by
++ * 'prev' when,
++ * 1) The whole range indicated by 'bad' can cover partial or whole bad
++ * range (from bad table) indexed by 'prev'.
++ * 2) The ack value of 'bad' is larger or equal to the ack value of bad
++ * range 'prev'.
++ *
++ * If the overwriting doesn't cover the whole bad range (from bad table)
++ * indexed by 'prev', new range might be split from existing bad range,
++ * 1) The overwrite covers head or tail part of existing bad range, 1
++ * extra bad range will be split and added into the bad table.
++ * 2) The overwrite covers middle of existing bad range, 2 extra bad
++ * ranges will be split (ahead and after the overwritten range) and
++ * added into the bad table.
++ * The number of extra split ranges of the overwriting is stored in
++ * 'extra' and returned for the caller.
++ */
++static bool can_front_overwrite(struct badblocks *bb, int prev,
++ struct badblocks_context *bad, int *extra)
++{
++ u64 *p = bb->page;
++ int len;
++
++ WARN_ON(!overlap_front(bb, prev, bad));
++
++ if (BB_ACK(p[prev]) >= bad->ack)
++ return false;
++
++ if (BB_END(p[prev]) <= (bad->start + bad->len)) {
++ len = BB_END(p[prev]) - bad->start;
++ if (BB_OFFSET(p[prev]) == bad->start)
++ *extra = 0;
++ else
++ *extra = 1;
++
++ bad->len = len;
++ } else {
++ if (BB_OFFSET(p[prev]) == bad->start)
++ *extra = 1;
++ else
++ /*
++ * prev range will be split into two, beside the overwritten
++ * one, an extra slot needed from bad table.
++ */
++ *extra = 2;
++ }
++
++ if ((bb->count + (*extra)) >= MAX_BADBLOCKS)
++ return false;
++
++ return true;
++}
++
++/*
++ * Do the overwrite from the range indicated by 'bad' to the bad range
++ * (from bad table) indexed by 'prev'.
++ * The previously called can_front_overwrite() will provide how many
++ * extra bad range(s) might be split and added into the bad table. All
++ * the splitting cases in the bad table will be handled here.
++ */
++static int front_overwrite(struct badblocks *bb, int prev,
++ struct badblocks_context *bad, int extra)
++{
++ u64 *p = bb->page;
++ sector_t orig_end = BB_END(p[prev]);
++ int orig_ack = BB_ACK(p[prev]);
++
++ switch (extra) {
++ case 0:
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]),
++ bad->ack);
++ break;
++ case 1:
++ if (BB_OFFSET(p[prev]) == bad->start) {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ bad->len, bad->ack);
++ memmove(p + prev + 2, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(bad->start + bad->len,
++ orig_end - BB_END(p[prev]),
++ orig_ack);
++ } else {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ bad->start - BB_OFFSET(p[prev]),
++ BB_ACK(p[prev]));
++ /*
++ * prev +2 -> prev + 1 + 1, which is for,
++ * 1) prev + 1: the slot index of the previous one
++ * 2) + 1: one more slot for extra being 1.
++ */
++ memmove(p + prev + 2, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
++ }
++ break;
++ case 2:
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ bad->start - BB_OFFSET(p[prev]),
++ BB_ACK(p[prev]));
++ /*
++ * prev + 3 -> prev + 1 + 2, which is for,
++ * 1) prev + 1: the slot index of the previous one
++ * 2) + 2: two more slots for extra being 2.
++ */
++ memmove(p + prev + 3, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
++ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]),
++ orig_end - BB_END(p[prev + 1]),
++ BB_ACK(p[prev]));
++ break;
++ default:
++ break;
++ }
++
++ return bad->len;
++}
++
++/*
++ * Explicitly insert a range indicated by 'bad' to the bad table, where
++ * the location is indexed by 'at'.
++ */
++static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad)
++{
++ u64 *p = bb->page;
++ int len;
++
++ WARN_ON(badblocks_full(bb));
++
++ len = min_t(sector_t, bad->len, BB_MAX_LEN);
++ if (at < bb->count)
++ memmove(p + at + 1, p + at, (bb->count - at) * 8);
++ p[at] = BB_MAKE(bad->start, len, bad->ack);
++
++ return len;
++}
++
+ /**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+--
+2.31.1
+
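Most of the helpers above are driven by prev_badblocks(), which is essentially a binary search for the last table entry whose start is at-or-before the search position, with prev_by_hint() as a shortcut. The standalone sketch below (plain user-space C with made-up table values, no hint path and no locking) reproduces just that search over a sorted array of range start sectors.

    /*
     * Standalone sketch of the binary search used by prev_badblocks():
     * return the index of the last entry whose start is <= s, or -1 if
     * every entry starts after s. 'starts' must be sorted ascending.
     */
    #include <stdio.h>

    typedef unsigned long long sector_t;

    static int prev_entry(const sector_t *starts, int count, sector_t s)
    {
            int lo = 0, hi = count;

            if (count == 0)
                    return -1;

            /* Same loop shape as prev_badblocks(): shrink [lo, hi) until
             * 'lo' is the only remaining candidate index. */
            while (hi - lo > 1) {
                    int mid = (lo + hi) / 2;

                    if (starts[mid] <= s)
                            lo = mid;
                    else
                            hi = mid;
            }

            return (starts[lo] <= s) ? lo : -1;
    }

    int main(void)
    {
            /* Made-up table of range start sectors. */
            sector_t starts[] = { 8, 64, 200, 512 };
            sector_t probes[] = { 0, 8, 100, 999 };

            for (int i = 0; i < 4; i++)
                    printf("s=%llu -> index %d\n",
                           probes[i], prev_entry(starts, 4, probes[i]));
            return 0;
    }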
diff --git a/for-test/badblocks/v5/v5-0003-badblocks-improve-badblocks_set-for-multiple-rang.patch b/for-test/badblocks/v5/v5-0003-badblocks-improve-badblocks_set-for-multiple-rang.patch
new file mode 100644
index 0000000..d5e7ce8
--- /dev/null
+++ b/for-test/badblocks/v5/v5-0003-badblocks-improve-badblocks_set-for-multiple-rang.patch
@@ -0,0 +1,663 @@
+From cdb864aa796ef2e65a99561b50561c7beec8ab58 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 2 Dec 2021 15:57:50 +0800
+Subject: [PATCH v5 3/6] badblocks: improve badblocks_set() for multiple ranges
+ handling
+
+Recently I received a bug report that current badblocks code does not
+properly handle multiple ranges. For example,
+ badblocks_set(bb, 32, 1, true);
+ badblocks_set(bb, 34, 1, true);
+ badblocks_set(bb, 36, 1, true);
+ badblocks_set(bb, 32, 12, true);
+Then indeed badblocks_show() reports,
+ 32 3
+ 36 1
+But the expected bad blocks table should be,
+ 32 12
+Obviously only the first 2 ranges are merged and badblocks_set() returns
+and ignores the rest of the setting range.
+
+This behavior is improper. If the caller of badblocks_set() wants to set
+a range of blocks in the bad blocks table, all of the blocks in the range
+should be handled even if the previous part encounters a failure.
+
+The desired way to set a bad blocks range by badblocks_set() is,
+- Set as many blocks of the setting range as possible into the bad
+  blocks table.
+- Merge the bad blocks ranges and occupy as few slots as possible in the
+  bad blocks table.
+- Fast.
+
+Indeed the above proposal is complicated, especially with the following
+restrictions,
+- The setting bad blocks range can be acknowledged or not acknowledged.
+- The bad blocks table size is limited.
+- Memory allocation should be avoided.
+
+The basic idea of the patch is to categorize all possible bad blocks
+range setting combinations into a much smaller set of simplified
+conditions with fewer special cases. Inside badblocks_set() there is an
+implicit loop composed of jumps between the labels 're_insert' and
+'update_sectors'. No matter how large the setting bad blocks range is,
+in every loop iteration just a minimized range from the head is handled
+by a pre-defined behavior from one of the categorized conditions. The
+logic is simple and the code flow is manageable.
+
+The different relative layouts between the setting range and existing
+bad block ranges are checked and handled (merge, combine, overwrite,
+insert) by the helpers in the previous patch. This patch makes all the
+helpers work together with the above idea.
+
+This patch only has the algorithm improvement for badblocks_set(). The
+following patches contain the improvements for badblocks_clear() and
+badblocks_check(). But the algorithm in badblocks_set() is fundamental
+and typical; the improvements in the clear and check routines are based
+on all the helpers and ideas in this patch.
+
+In order to make the change clearer for code review, this patch does not
+directly modify the existing badblocks_set(), and just adds a new
+routine named _badblocks_set(). A later patch will remove the current
+badblocks_set() code and make it a wrapper of _badblocks_set(). So the
+newly added change won't be mixed with deleted code, and the code review
+can be easier.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+Cc: Wols Lists <antlists@youngman.org.uk>
+---
+ block/badblocks.c | 560 ++++++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 540 insertions(+), 20 deletions(-)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index 30958cc4469f..f45f82646bb7 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -16,6 +16,322 @@
+ #include <linux/types.h>
+ #include <linux/slab.h>
+
++/*
++ * The purpose of badblocks set/clear is to manage bad blocks ranges which are
++ * identified by LBA addresses.
++ *
++ * When the caller of badblocks_set() wants to set a range of bad blocks, the
++ * setting range can be acked or unacked. And the setting range may merge,
++ * overwrite, skip the overlapped already set range, depends on who they are
++ * overlapped or adjacent, and the acknowledgment type of the ranges. It can be
++ * more complicated when the setting range covers multiple already set bad block
++ * ranges, with restrictions of maximum length of each bad range and the bad
++ * table space limitation.
++ *
++ * It is difficult and unnecessary to take care of all the possible situations.
++ * For setting a large range of bad blocks, we can handle it by dividing the
++ * large range into smaller ones when encountering overlap, max range length or
++ * bad table full conditions. Every time only a smaller piece of the bad range
++ * is handled, with a limited number of conditions on how it interacts with
++ * possible overlapped or adjacent already set bad block ranges. Then the hard
++ * complicated problem can be much simpler to handle in a proper way.
++ *
++ * When setting a range of bad blocks to the bad table, the simplified situations
++ * to be considered are, (The already set bad blocks ranges are named with
++ * prefix E, and the setting bad blocks range is named with prefix S)
++ *
++ * 1) A setting range is not overlapped or adjacent to any other already set bad
++ * block range.
++ * +--------+
++ * | S |
++ * +--------+
++ * +-------------+ +-------------+
++ * | E1 | | E2 |
++ * +-------------+ +-------------+
++ * For this situation if the bad blocks table is not full, just allocate a
++ * free slot from the bad blocks table to mark the setting range S. The
++ * result is,
++ * +-------------+ +--------+ +-------------+
++ * | E1 | | S | | E2 |
++ * +-------------+ +--------+ +-------------+
++ * 2) A setting range starts exactly at a start LBA of an already set bad blocks
++ * range.
++ * 2.1) The setting range size < already set range size
++ * +--------+
++ * | S |
++ * +--------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 2.1.1) If S and E are both acked or unacked range, the setting range S can
++ * be merged into existing bad range E. The result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and
++ * the result is,
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E.
++ * An extra slot from the bad blocks table will be allocated for S, and head
++ * of E will move to end of the inserted range S. The result is,
++ * +--------+----+
++ * | S | E |
++ * +--------+----+
++ * 2.2) The setting range size == already set range size
++ * 2.2.1) If S and E are both acked or unacked range, the setting range S can
++ * be merged into existing bad range E. The result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and
++ * the result is,
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of
++ * bad blocks range E. The result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 2.3) The setting range size > already set range size
++ * +-------------------+
++ * | S |
++ * +-------------------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * For such situation, the setting range S can be treated as two parts, the
++ * first part (S1) is as same size as the already set range E, the second
++ * part (S2) is the rest of setting range.
++ * +-------------+-----+ +-------------+ +-----+
++ * | S1 | S2 | | S1 | | S2 |
++ * +-------------+-----+ ===> +-------------+ +-----+
++ * +-------------+ +-------------+
++ * | E | | E |
++ * +-------------+ +-------------+
++ * Now we only focus on how to handle the setting range S1 and already set
++ * range E, which are already explained in 2.2), for the rest S2 it will be
++ * handled later in next loop.
++ * 3) A setting range starts before the start LBA of an already set bad blocks
++ * range.
++ * +-------------+
++ * | S |
++ * +-------------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * For this situation, the setting range S can be divided into two parts, the
++ * first (S1) ends at the start LBA of already set range E, the second part
++ * (S2) starts exactly at a start LBA of the already set range E.
++ * +----+---------+ +----+ +---------+
++ * | S1 | S2 | | S1 | | S2 |
++ * +----+---------+ ===> +----+ +---------+
++ * +-------------+ +-------------+
++ * | E | | E |
++ * +-------------+ +-------------+
++ * Now only the first part S1 should be handled in this loop, which is in
++ * similar condition as 1). The rest part S2 has exact same start LBA address
++ * of the already set range E, they will be handled in next loop in one of
++ * situations in 2).
++ * 4) A setting range starts after the start LBA of an already set bad blocks
++ * range.
++ * 4.1) If the setting range S exactly matches the tail part of already set bad
++ * blocks range E, like the following chart shows,
++ * +---------+
++ * | S |
++ * +---------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked),
++ * they will be merged into one, the result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 4.1.2) If range E is acked and the setting range S is unacked, the setting
++ * request of S will be rejected, the result is,
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may
++ * overwrite the overlapped range of E, the result is,
++ * +---+---------+
++ * | E | S |
++ * +---+---------+
++ * 4.2) If the setting range S stays in middle of an already set range E, like
++ * the following chart shows,
++ * +----+
++ * | S |
++ * +----+
++ * +--------------+
++ * | E |
++ * +--------------+
++ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked),
++ * they will be merged into one, the result is,
++ * +--------------+
++ * | S |
++ * +--------------+
++ * 4.2.2) If range E is acked and the setting range S is unacked, the setting
++ * request of S will be rejected, the result is also,
++ * +--------------+
++ * | E |
++ * +--------------+
++ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will
++ * be inserted into the middle of E and split the previous range E into two
++ * parts (E1 and E2), the result is,
++ * +----+----+----+
++ * | E1 | S | E2 |
++ * +----+----+----+
++ * 4.3) If the setting bad blocks range S is overlapped with an already set bad
++ * blocks range E. The range S starts after the start LBA of range E, and
++ * ends after the end LBA of range E, as the following chart shows,
++ * +-------------------+
++ * | S |
++ * +-------------------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * For this situation the range S can be divided into two parts, the first
++ * part (S1) ends at end range E, and the second part (S2) has rest range of
++ * origin S.
++ * +---------+---------+ +---------+ +---------+
++ * | S1 | S2 | | S1 | | S2 |
++ * +---------+---------+ ===> +---------+ +---------+
++ * +-------------+ +-------------+
++ * | E | | E |
++ * +-------------+ +-------------+
++ * Now in this loop the setting range S1 and already set range E can be
++ * handled as the situations 4), the rest range S2 will be handled in next
++ * loop and ignored in this loop.
++ * 5) A setting bad blocks range S is adjacent to one or more already set bad
++ * blocks range(s), and they are all acked or unacked range.
++ * 5.1) Front merge: If the already set bad blocks range E is before setting
++ * range S and they are adjacent,
++ * +------+
++ * | S |
++ * +------+
++ * +-------+
++ * | E |
++ * +-------+
++ * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge
++ * values are the same, the setting range S can front merge into range E. The
++ * result is,
++ * +--------------+
++ * | S |
++ * +--------------+
++ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting
++ * range S right after already set range E into the bad blocks table. The
++ * result is,
++ * +--------+------+
++ * | E | S |
++ * +--------+------+
++ * 6) Special cases which above conditions cannot handle
++ * 6.1) Multiple already set ranges may merge into less ones in a full bad table
++ * +-------------------------------------------------------+
++ * | S |
++ * +-------------------------------------------------------+
++ * |<----- BB_MAX_LEN ----->|
++ * +-----+ +-----+ +-----+
++ * | E1 | | E2 | | E3 |
++ * +-----+ +-----+ +-----+
++ * In the above example, when the bad blocks table is full, inserting the
++ * first part of setting range S will fail because no more available slot
++ * can be allocated from the bad blocks table. In this situation a proper
++ * setting method should go through all the setting bad blocks range and
++ * look for chances to merge already set ranges into fewer ones. When there
++ * is available slot from bad blocks table, re-try again to handle more
++ * setting bad blocks ranges as many as possible.
++ * +------------------------+
++ * | S3 |
++ * +------------------------+
++ * |<----- BB_MAX_LEN ----->|
++ * +-----+-----+-----+---+-----+--+
++ * | S1 | S2 |
++ * +-----+-----+-----+---+-----+--+
++ * The above chart shows that although the first part (S3) cannot be inserted
++ * due to no space in the bad blocks table, the following E1, E2 and E3 ranges
++ * can be merged with the rest of S into the fewer ranges S1 and S2. Now
++ * there is 1 free slot in the bad blocks table.
++ * +------------------------+-----+-----+-----+---+-----+--+
++ * | S3 | S1 | S2 |
++ * +------------------------+-----+-----+-----+---+-----+--+
++ * Since the bad blocks table is not full anymore, retry the original
++ * setting range S. Now the setting range S3 can be inserted into the
++ * bad blocks table with the slot previously freed by the multiple-range merge.
++ * 6.2) Front merge after overwrite
++ * In the following example, in bad blocks table, E1 is an acked bad blocks
++ * range and E2 is an unacked bad blocks range, therefore they are not able
++ * to merge into a larger range. The setting bad blocks range S is acked,
++ * therefore part of E2 can be overwritten by S.
++ * +--------+
++ * | S | acknowledged
++ * +--------+ S: 1
++ * +-------+-------------+ E1: 1
++ * | E1 | E2 | E2: 0
++ * +-------+-------------+
++ * With previous simplified routines, after overwriting part of E2 with S,
++ * the bad blocks table should be (E3 is remaining part of E2 which is not
++ * overwritten by S),
++ * acknowledged
++ * +-------+--------+----+ S: 1
++ * | E1 | S | E3 | E1: 1
++ * +-------+--------+----+ E3: 0
++ * The above result is correct but not perfect. Range E1 and S in the bad
++ * blocks table are all acked, merging them into a larger one range may
++ * occupy less bad blocks table space and make badblocks_check() faster.
++ * Therefore in such situation, after overwriting range S, the previous range
++ * E1 should be checked for possible front combination. Then the ideal
++ * result can be,
++ * +----------------+----+ acknowledged
++ * | E1 | E3 | E1: 1
++ * +----------------+----+ E3: 0
++ * 6.3) Behind merge: If the already set bad blocks range E is behind the setting
++ * range S and they are adjacent. Normally we don't need to care about this
++ * because front merge handles this while going through range S from head to
++ * tail, except for the tail part of range S. When the setting range S is
++ * fully handled, none of the above simplified routines checks whether the
++ * tail LBA of range S is adjacent to the next already set range, so they are
++ * not merged even if they are mergeable.
++ * +------+
++ * | S |
++ * +------+
++ * +-------+
++ * | E |
++ * +-------+
++ * For the above special situation, when the setting range S is fully handled
++ * and the loop ends, an extra check is necessary for whether next already
++ * set range E is right after S and mergeable.
++ * 6.3.1) When the total size of range E and S <= BB_MAX_LEN, and their acknowledge
++ * values are the same, the setting range S can behind merge into range E. The
++ * result is,
++ * +--------------+
++ * | S |
++ * +--------------+
++ * 6.3.2) Otherwise these two ranges cannot merge, just insert the setting range
++ * S in front of the already set range E in the bad blocks table. The result
++ * is,
++ * +------+-------+
++ * | S | E |
++ * +------+-------+
++ *
++ * All the above 5 simplified situations and 3 special cases may cover 99%+ of
++ * the bad block range setting conditions. Maybe some rare corner case is not
++ * considered and optimized, but it won't hurt if badblocks_set() fails due
++ * to no space, or if some ranges are not merged to save bad blocks table space.
++ *
++ * Inside badblocks_set() each loop starts by jumping to the re_insert label;
++ * every new loop iteration calls prev_badblocks() to find an already set range
++ * which starts before or at the current setting range. Since the setting bad
++ * blocks range is handled from head to tail, in most cases it is unnecessary
++ * to do the binary search inside prev_badblocks(); it is possible to provide a
++ * hint to prev_badblocks() for a fast path, then the expensive binary search
++ * can be avoided. In my test with the hint to prev_badblocks(), except for the
++ * first loop, all the remaining calls to prev_badblocks() can go into the fast
++ * path and return the correct bad blocks table index immediately.
++ */
++
+ /*
+ * Find the range starts at-or-before 's' from bad table. The search
+ * starts from index 'hint' and stops at index 'hint_end' from the bad
+@@ -392,6 +708,230 @@ static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad
+ return len;
+ }
+
++static void badblocks_update_acked(struct badblocks *bb)
++{
++ bool unacked = false;
++ u64 *p = bb->page;
++ int i;
++
++ if (!bb->unacked_exist)
++ return;
++
++ for (i = 0; i < bb->count ; i++) {
++ if (!BB_ACK(p[i])) {
++ unacked = true;
++ break;
++ }
++ }
++
++ if (!unacked)
++ bb->unacked_exist = 0;
++}
++
++/* Do exact work to set bad block range into the bad block table */
++static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
++ int acknowledged)
++{
++ int retried = 0, space_desired = 0;
++ int orig_len, len = 0, added = 0;
++ struct badblocks_context bad;
++ int prev = -1, hint = -1;
++ sector_t orig_start;
++ unsigned long flags;
++ int rv = 0;
++ u64 *p;
++
++ if (bb->shift < 0)
++ /* badblocks are disabled */
++ return 1;
++
++ if (sectors == 0)
++ /* Invalid sectors number */
++ return 1;
++
++ if (bb->shift) {
++ /* round the start down, and the end up */
++ sector_t next = s + sectors;
++
++ rounddown(s, bb->shift);
++ roundup(next, bb->shift);
++ sectors = next - s;
++ }
++
++ write_seqlock_irqsave(&bb->lock, flags);
++
++ orig_start = s;
++ orig_len = sectors;
++ bad.ack = acknowledged;
++ p = bb->page;
++
++re_insert:
++ bad.start = s;
++ bad.len = sectors;
++ len = 0;
++
++ if (badblocks_empty(bb)) {
++ len = insert_at(bb, 0, &bad);
++ bb->count++;
++ added++;
++ goto update_sectors;
++ }
++
++ prev = prev_badblocks(bb, &bad, hint);
++
++ /* start before all badblocks */
++ if (prev < 0) {
++ if (!badblocks_full(bb)) {
++ /* insert on the first */
++ if (bad.len > (BB_OFFSET(p[0]) - bad.start))
++ bad.len = BB_OFFSET(p[0]) - bad.start;
++ len = insert_at(bb, 0, &bad);
++ bb->count++;
++ added++;
++ hint = 0;
++ goto update_sectors;
++ }
++
++		/* No space, try to merge */
++ if (overlap_behind(bb, &bad, 0)) {
++ if (can_merge_behind(bb, &bad, 0)) {
++ len = behind_merge(bb, &bad, 0);
++ added++;
++ } else {
++ len = min_t(sector_t,
++ BB_OFFSET(p[0]) - s, sectors);
++ space_desired = 1;
++ }
++ hint = 0;
++ goto update_sectors;
++ }
++
++ /* no table space and give up */
++ goto out;
++ }
++
++ /* in case p[prev-1] can be merged with p[prev] */
++ if (can_combine_front(bb, prev, &bad)) {
++ front_combine(bb, prev);
++ bb->count--;
++ added++;
++ hint = prev;
++ goto update_sectors;
++ }
++
++ if (overlap_front(bb, prev, &bad)) {
++ if (can_merge_front(bb, prev, &bad)) {
++ len = front_merge(bb, prev, &bad);
++ added++;
++ } else {
++ int extra = 0;
++
++ if (!can_front_overwrite(bb, prev, &bad, &extra)) {
++ len = min_t(sector_t,
++ BB_END(p[prev]) - s, sectors);
++ hint = prev;
++ goto update_sectors;
++ }
++
++ len = front_overwrite(bb, prev, &bad, extra);
++ added++;
++ bb->count += extra;
++
++ if (can_combine_front(bb, prev, &bad)) {
++ front_combine(bb, prev);
++ bb->count--;
++ }
++ }
++ hint = prev;
++ goto update_sectors;
++ }
++
++ if (can_merge_front(bb, prev, &bad)) {
++ len = front_merge(bb, prev, &bad);
++ added++;
++ hint = prev;
++ goto update_sectors;
++ }
++
++ /* if no space in table, still try to merge in the covered range */
++ if (badblocks_full(bb)) {
++ /* skip the cannot-merge range */
++ if (((prev + 1) < bb->count) &&
++ overlap_behind(bb, &bad, prev + 1) &&
++ ((s + sectors) >= BB_END(p[prev + 1]))) {
++ len = BB_END(p[prev + 1]) - s;
++ hint = prev + 1;
++ goto update_sectors;
++ }
++
++ /* no retry any more */
++ len = sectors;
++ space_desired = 1;
++ hint = -1;
++ goto update_sectors;
++ }
++
++ /* cannot merge and there is space in bad table */
++ if ((prev + 1) < bb->count &&
++ overlap_behind(bb, &bad, prev + 1))
++ bad.len = min_t(sector_t,
++ bad.len, BB_OFFSET(p[prev + 1]) - bad.start);
++
++ len = insert_at(bb, prev + 1, &bad);
++ bb->count++;
++ added++;
++ hint = prev + 1;
++
++update_sectors:
++ s += len;
++ sectors -= len;
++
++ if (sectors > 0)
++ goto re_insert;
++
++ WARN_ON(sectors < 0);
++
++ /* Check whether the following already set range can be merged */
++ if ((prev + 1) < bb->count &&
++ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
++ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
++ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
++ BB_ACK(p[prev]));
++
++ if ((prev + 2) < bb->count)
++ memmove(p + prev + 1, p + prev + 2,
++ (bb->count - (prev + 2)) * 8);
++ bb->count--;
++ }
++
++ if (space_desired && !badblocks_full(bb)) {
++ s = orig_start;
++ sectors = orig_len;
++ space_desired = 0;
++ if (retried++ < 3)
++ goto re_insert;
++ }
++
++out:
++ if (added) {
++ set_changed(bb);
++
++ if (!acknowledged)
++ bb->unacked_exist = 1;
++ else
++ badblocks_update_acked(bb);
++ }
++
++ write_sequnlock_irqrestore(&bb->lock, flags);
++
++ if (!added)
++ rv = 1;
++
++ return rv;
++}
++
+ /**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+@@ -501,26 +1041,6 @@ int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ }
+ EXPORT_SYMBOL_GPL(badblocks_check);
+
+-static void badblocks_update_acked(struct badblocks *bb)
+-{
+- u64 *p = bb->page;
+- int i;
+- bool unacked = false;
+-
+- if (!bb->unacked_exist)
+- return;
+-
+- for (i = 0; i < bb->count ; i++) {
+- if (!BB_ACK(p[i])) {
+- unacked = true;
+- break;
+- }
+- }
+-
+- if (!unacked)
+- bb->unacked_exist = 0;
+-}
+-
+ /**
+ * badblocks_set() - Add a range of bad blocks to the table.
+ * @bb: the badblocks structure that holds all badblock information
+--
+2.31.1
+
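After the re_insert loop in _badblocks_set() above finishes, one extra check merges p[prev] with p[prev + 1] when the two records touch, carry the same acknowledge flag and still fit in one record; this is the behind-merge special case 6.3 from the comment block. The sketch below (plain user-space C with made-up values; BB_MAX_LEN = 512 is an assumption based on the 9-bit length field, and a simple struct stands in for the packed u64 records) shows that final check in isolation.

    /*
     * Standalone sketch (not kernel code) of the final behind-merge check
     * in _badblocks_set(): collapse two neighbouring entries into one when
     * they are adjacent, equally acknowledged and fit in a single record.
     */
    #include <stdio.h>
    #include <stdbool.h>

    #define BB_MAX_LEN 512          /* assumed: max length of one record */

    struct range {
            unsigned long long start;
            unsigned int len;
            int ack;
    };

    static bool try_behind_merge(struct range *prev, const struct range *next)
    {
            if (prev->start + prev->len == next->start &&
                prev->len + next->len <= BB_MAX_LEN &&
                prev->ack == next->ack) {
                    prev->len += next->len; /* 'next' would be memmove()d away */
                    return true;
            }
            return false;
    }

    int main(void)
    {
            /* Made-up neighbouring entries: S = [96, 128), E = [128, 160). */
            struct range s = { 96, 32, 1 };
            struct range e = { 128, 32, 1 };

            if (try_behind_merge(&s, &e))
                    printf("merged: %llu +%u ack=%d\n", s.start, s.len, s.ack);
            else
                    printf("kept separate\n");
            return 0;
    }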
diff --git a/for-test/badblocks/v5/v5-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch b/for-test/badblocks/v5/v5-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch
new file mode 100644
index 0000000..e3c38b8
--- /dev/null
+++ b/for-test/badblocks/v5/v5-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch
@@ -0,0 +1,399 @@
+From ea2a8ebd59b23e8c12febd3bcf5bebea24d63461 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 1 Mar 2021 22:16:10 +0800
+Subject: [PATCH v5 4/6] badblocks: improve badblocks_clear() for multiple
+ ranges handling
+
+With the fundamental ideas and helper routines from badblocks_set()
+improvement, clearing bad block for multiple ranges is much simpler.
+
+With a similar idea from badblocks_set() improvement, this patch
+simplifies bad block range clearing into 5 situations. No matter how
+complicated the clearing condition is, we just look at the head part
+of the clearing range together with the related already set bad block
+range from the bad block table. The rest will be handled in the next
+run of the while-loop.
+
+Based on existing helpers added from badblocks_set(), this patch adds
+two more helpers,
+- front_clear()
+ Clear the bad block range from bad block table which is front
+ overlapped with the clearing range.
+- front_splitting_clear()
+ Handle the condition that the clearing range hits middle of an
+ already set bad block range from bad block table.
+
+Similar to badblocks_set(), the first part of the clearing range is
+handled with the related bad block range found by prev_badblocks(). In most
+cases a valid hint is provided to prev_badblocks() to avoid unnecessary
+bad block table iteration.
+
+This patch also adds detailed algorithm comments at the beginning of
+badblocks.c, explaining which five simplified situations are
+categorized and how all the bad block range clearing conditions are
+handled by these five situations.
+
+Again, in order to make the code review easier and avoid mixing the code
+changes together, this patch does not modify badblocks_clear() but
+implements another routine called _badblocks_clear() for the improvement.
+A later patch will delete the current code of badblocks_clear() and make
+it a wrapper to _badblocks_clear(), so the code change can be much
+clearer for review.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ block/badblocks.c | 325 ++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 325 insertions(+)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index f45f82646bb7..3e1bb593a2bb 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -330,6 +330,123 @@
+ * avoided. In my test with the hint to prev_badblocks(), except for the first
+ * loop, all remaining calls to prev_badblocks() can go into the fast path and
+ * return the correct bad blocks table index immediately.
++ *
++ *
++ * Clearing a bad blocks range from the bad block table has a similar idea to
++ * setting, but is much simpler. The only thing that needs to be noticed is
++ * that when the clearing range hits the middle of a bad block range, the
++ * existing bad block range will split into two, and one more item should be
++ * added into the bad block table. The simplified situations to be considered
++ * are (the already set bad block ranges in the bad block table are named with
++ * prefix E, and the clearing bad block range is named with prefix C),
++ *
++ * 1) A clearing range is not overlapped to any already set ranges in bad block
++ * table.
++ * +-----+ | +-----+ | +-----+
++ * | C | | | C | | | C |
++ * +-----+ or +-----+ or +-----+
++ * +---+ | +----+ +----+ | +---+
++ * | E | | | E1 | | E2 | | | E |
++ * +---+ | +----+ +----+ | +---+
++ * For the above situations, no bad block to be cleared and no failure
++ * happens, simply returns 0.
++ * 2) The clearing range hits the middle of an already set bad block range in
++ * the bad block table.
++ * +---+
++ * | C |
++ * +---+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ * In this situation if the bad block table is not full, the range E will be
++ * split into two ranges E1 and E2. The result is,
++ * +------+ +------+
++ * | E1 | | E2 |
++ * +------+ +------+
++ * 3) The clearing range starts exactly at same LBA as an already set bad block range
++ * from the bad block table.
++ * 3.1) Partially covered at head part
++ * +------------+
++ * | C |
++ * +------------+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ * For this situation, the overlapped already set range will update the
++ * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). No
++ * item deleted from bad block table. The result is,
++ * +----+
++ * | E1 |
++ * +----+
++ * 3.2) Exact fully covered
++ * +-----------------+
++ * | C |
++ * +-----------------+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ * For this situation the whole bad blocks range E will be cleared and its
++ *    corresponding item is deleted from the bad block table.
++ * 4) The clearing range exactly ends at same LBA as an already set bad block
++ * range.
++ * +-------+
++ * | C |
++ * +-------+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ * For the above situation, the already set range E is updated to shrink its
++ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C).
++ * The result is,
++ * +---------+
++ * | E |
++ * +---------+
++ * 5) The clearing range is partially overlapped with an already set bad block
++ * range from the bad block table.
++ * 5.1) The already set bad block range is front overlapped with the clearing
++ * range.
++ * +----------+
++ * | C |
++ * +----------+
++ * +------------+
++ * | E |
++ * +------------+
++ * For such situation, the clearing range C can be treated as two parts. The
++ * first part ends at the start LBA of range E, and the second part starts at
++ * same LBA of range E.
++ * +----+-----+ +----+ +-----+
++ * | C1 | C2 | | C1 | | C2 |
++ * +----+-----+ ===> +----+ +-----+
++ * +------------+ +------------+
++ * | E | | E |
++ * +------------+ +------------+
++ * Now the first part C1 can be handled as condition 1), and the second part C2 can be
++ * handled as condition 3.1) in next loop.
++ * 5.2) The already set bad block range is behind overlapped with the clearing
++ * range.
++ * +----------+
++ * | C |
++ * +----------+
++ * +------------+
++ * | E |
++ * +------------+
++ * For such situation, the clearing range C can be treated as two parts. The
++ * first part C1 ends at same end LBA of range E, and the second part starts
++ * at end LBA of range E.
++ * +----+-----+ +----+ +-----+
++ * | C1 | C2 | | C1 | | C2 |
++ * +----+-----+ ===> +----+ +-----+
++ * +------------+ +------------+
++ * | E | | E |
++ * +------------+ +------------+
++ * Now the first part clearing range C1 can be handled as condition 4), and
++ * the second part clearing range C2 can be handled as condition 1) in next
++ * loop.
++ *
++ * All bad blocks range clearing can be simplified into the above 5 situations
++ * by only handling the head part of the clearing range in each run of the
++ * while-loop. The idea is similar to bad blocks range setting but much
++ * simpler.
+ */
+
+ /*
+@@ -932,6 +1049,214 @@ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ return rv;
+ }
+
++/*
++ * Clear the bad block range from bad block table which is front overlapped
++ * with the clearing range. The return value is how many sectors from an
++ * already set bad block range are cleared. If the whole bad block range is
++ * covered by the clearing range and fully cleared, 'delete' is set as 1 for
++ * the caller to reduce bb->count.
++ */
++static int front_clear(struct badblocks *bb, int prev,
++ struct badblocks_context *bad, int *deleted)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ u64 *p = bb->page;
++ int cleared = 0;
++
++ *deleted = 0;
++ if (s == BB_OFFSET(p[prev])) {
++ if (BB_LEN(p[prev]) > sectors) {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors,
++ BB_LEN(p[prev]) - sectors,
++ BB_ACK(p[prev]));
++ cleared = sectors;
++ } else {
++ /* BB_LEN(p[prev]) <= sectors */
++ cleared = BB_LEN(p[prev]);
++ if ((prev + 1) < bb->count)
++ memmove(p + prev, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ *deleted = 1;
++ }
++ } else if (s > BB_OFFSET(p[prev])) {
++ if (BB_END(p[prev]) <= (s + sectors)) {
++ cleared = BB_END(p[prev]) - s;
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ s - BB_OFFSET(p[prev]),
++ BB_ACK(p[prev]));
++ } else {
++ /* Splitting is handled in front_splitting_clear() */
++ BUG();
++ }
++ }
++
++ return cleared;
++}
++
++/*
++ * Handle the condition that the clearing range hits middle of an already set
++ * bad block range from bad block table. In this condition the existing bad
++ * block range is split into two after the middle part is cleared.
++ */
++static int front_splitting_clear(struct badblocks *bb, int prev,
++ struct badblocks_context *bad)
++{
++ u64 *p = bb->page;
++ u64 end = BB_END(p[prev]);
++ int ack = BB_ACK(p[prev]);
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ s - BB_OFFSET(p[prev]),
++ ack);
++ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack);
++ return sectors;
++}
++
++/* Do the exact work to clear bad block range from the bad block table */
++static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
++{
++ struct badblocks_context bad;
++ int prev = -1, hint = -1;
++ int len = 0, cleared = 0;
++ int rv = 0;
++ u64 *p;
++
++ if (bb->shift < 0)
++ /* badblocks are disabled */
++ return 1;
++
++ if (sectors == 0)
++ /* Invalid sectors number */
++ return 1;
++
++ if (bb->shift) {
++ sector_t target;
++
++ /* When clearing we round the start up and the end down.
++ * This should not matter as the shift should align with
++ * the block size and no rounding should ever be needed.
++ * However it is better the think a block is bad when it
++ * isn't than to think a block is not bad when it is.
++ */
++ target = s + sectors;
++ roundup(s, bb->shift);
++ rounddown(target, bb->shift);
++ sectors = target - s;
++ }
++
++ write_seqlock_irq(&bb->lock);
++
++ bad.ack = true;
++ p = bb->page;
++
++re_clear:
++ bad.start = s;
++ bad.len = sectors;
++
++ if (badblocks_empty(bb)) {
++ len = sectors;
++ cleared++;
++ goto update_sectors;
++ }
++
++
++ prev = prev_badblocks(bb, &bad, hint);
++
++ /* Start before all badblocks */
++ if (prev < 0) {
++ if (overlap_behind(bb, &bad, 0)) {
++ len = BB_OFFSET(p[0]) - s;
++ hint = prev;
++ } else {
++ len = sectors;
++ }
++ /*
++ * Both situations are to clear non-bad range,
++ * should be treated as successful
++ */
++ cleared++;
++ goto update_sectors;
++ }
++
++ /* Start after all badblocks */
++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
++ len = sectors;
++ cleared++;
++ goto update_sectors;
++ }
++
++ /* Clear will split a bad record but the table is full */
++ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) &&
++ (BB_END(p[prev]) > (bad.start + sectors))) {
++ len = sectors;
++ goto update_sectors;
++ }
++
++ if (overlap_front(bb, prev, &bad)) {
++ if ((BB_OFFSET(p[prev]) < bad.start) &&
++ (BB_END(p[prev]) > (bad.start + bad.len))) {
++ /* Splitting */
++ if ((bb->count + 1) < MAX_BADBLOCKS) {
++ len = front_splitting_clear(bb, prev, &bad);
++ bb->count += 1;
++ cleared++;
++ } else {
++ /* No space to split, give up */
++ len = sectors;
++ }
++ } else {
++ int deleted = 0;
++
++ len = front_clear(bb, prev, &bad, &deleted);
++ bb->count -= deleted;
++ cleared++;
++ hint = prev;
++ }
++
++ goto update_sectors;
++ }
++
++ /* Not front overlap, but behind overlap */
++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
++ len = BB_OFFSET(p[prev + 1]) - bad.start;
++ hint = prev + 1;
++ /* Clear non-bad range should be treated as successful */
++ cleared++;
++ goto update_sectors;
++ }
++
++ /* Not cover any badblocks range in the table */
++ len = sectors;
++ /* Clear non-bad range should be treated as successful */
++ cleared++;
++
++update_sectors:
++ s += len;
++ sectors -= len;
++
++ if (sectors > 0)
++ goto re_clear;
++
++ WARN_ON(sectors < 0);
++
++ if (cleared) {
++ badblocks_update_acked(bb);
++ set_changed(bb);
++ }
++
++ write_sequnlock_irq(&bb->lock);
++
++ if (!cleared)
++ rv = 1;
++
++ return rv;
++}
++
++
+ /**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+--
+2.31.1
+
diff --git a/for-test/badblocks/v5/v5-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch b/for-test/badblocks/v5/v5-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch
new file mode 100644
index 0000000..f7ba71a
--- /dev/null
+++ b/for-test/badblocks/v5/v5-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch
@@ -0,0 +1,175 @@
+From 25e6c8d14293c3b45fcf239df7c88e05f1ee70bf Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 2 Dec 2021 16:13:35 +0800
+Subject: [PATCH v5 5/6] badblocks: improve badblocks_check() for multiple
+ ranges handling
+
+This patch rewrites badblocks_check() with a similar coding style to
+_badblocks_set() and _badblocks_clear(). The only difference is that bad
+blocks checking may now handle multiple ranges in the bad table.
+
+If a checking range covers multiple bad block ranges in the bad block
+table, like the following condition (C is the checking range; E1, E2 and
+E3 are three bad block ranges in the bad block table),
+ +------------------------------------+
+ | C |
+ +------------------------------------+
+ +----+ +----+ +----+
+ | E1 | | E2 | | E3 |
+ +----+ +----+ +----+
+The improved badblocks_check() algorithm will divide the checking range C
+into multiple parts, and handle them in 7 runs of a while-loop,
+ +--+ +----+ +----+ +----+ +----+ +----+ +----+
+ |C1| | C2 | | C3 | | C4 | | C5 | | C6 | | C7 |
+ +--+ +----+ +----+ +----+ +----+ +----+ +----+
+ +----+ +----+ +----+
+ | E1 | | E2 | | E3 |
+ +----+ +----+ +----+
+And the start LBA and length of range E1 will be set as first_bad and
+bad_sectors for the caller.
+
+The return value rule is consistent for multiple ranges. For example, if
+there are the following bad block ranges in the bad block table,
+ Index No. Start Len Ack
+ 0 400 20 1
+ 1 500 50 1
+ 2 650 20 0
+the return value, first_bad and bad_sectors returned by calling
+badblocks_check() with different checking ranges can be the following values,
+ Checking Start, Len Return Value first_bad bad_sectors
+ 100, 100 0 N/A N/A
+ 100, 310 1 400 10
+ 100, 440 1 400 10
+ 100, 540 1 400 10
+ 100, 600 -1 400 10
+ 100, 800 -1 400 10
+
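+As a usage sketch (hypothetical caller code, not taken from this series), the
+result codes above could be interpreted roughly as below; report() and the
+hard-coded -1/400/10 triple (the "Checking 100, 600" row of the table) are for
+illustration only.
+
+    #include <stdio.h>
+
+    typedef unsigned long long sector_t;
+
+    /*
+     * Hypothetical caller-side handling of the result codes described above:
+     * 0 = no bad blocks, 1 = only acknowledged bad blocks, -1 = at least one
+     * unacknowledged bad block in the checked range.
+     */
+    static void report(int rv, sector_t first_bad, int bad_sectors)
+    {
+        switch (rv) {
+        case 0:
+            printf("range is clean\n");
+            break;
+        case 1:
+            printf("acked bad blocks, first at %llu (+%d sectors)\n",
+                   first_bad, bad_sectors);
+            break;
+        case -1:
+            printf("unacked bad blocks, first at %llu (+%d sectors)\n",
+                   first_bad, bad_sectors);
+            break;
+        }
+    }
+
+    int main(void)
+    {
+        /* "Checking 100, 600" row above: rv -1, first_bad 400, bad_sectors 10 */
+        report(-1, 400, 10);
+        return 0;
+    }
+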
+In order to make code review easier, this patch names the improved bad
+block range checking routine _badblocks_check() and does not change the
+existing badblocks_check() code yet. A later patch will delete the old code
+of badblocks_check() and make it a wrapper which calls _badblocks_check().
+Then the newly added code won't get mixed up with the old deleted code,
+which makes it clearer and easier for code review.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ block/badblocks.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 97 insertions(+)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index 3e1bb593a2bb..bfade2434c74 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -1256,6 +1256,103 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+ return rv;
+ }
+
++/* Do the exact work to check bad blocks range from the bad block table */
++static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
++ sector_t *first_bad, int *bad_sectors)
++{
++ int unacked_badblocks, acked_badblocks;
++ int prev = -1, hint = -1, set = 0;
++ struct badblocks_context bad;
++ unsigned int seq;
++ int len, rv;
++ u64 *p;
++
++ WARN_ON(bb->shift < 0 || sectors == 0);
++
++ if (bb->shift > 0) {
++ sector_t target;
++
++ /* round the start down, and the end up */
++ target = s + sectors;
++ rounddown(s, bb->shift);
++ roundup(target, bb->shift);
++ sectors = target - s;
++ }
++
++retry:
++ seq = read_seqbegin(&bb->lock);
++
++ p = bb->page;
++ unacked_badblocks = 0;
++ acked_badblocks = 0;
++
++re_check:
++ bad.start = s;
++ bad.len = sectors;
++
++ if (badblocks_empty(bb)) {
++ len = sectors;
++ goto update_sectors;
++ }
++
++ prev = prev_badblocks(bb, &bad, hint);
++
++ /* start after all badblocks */
++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
++ len = sectors;
++ goto update_sectors;
++ }
++
++ if (overlap_front(bb, prev, &bad)) {
++ if (BB_ACK(p[prev]))
++ acked_badblocks++;
++ else
++ unacked_badblocks++;
++
++ if (BB_END(p[prev]) >= (s + sectors))
++ len = sectors;
++ else
++ len = BB_END(p[prev]) - s;
++
++ if (set == 0) {
++ *first_bad = BB_OFFSET(p[prev]);
++ *bad_sectors = BB_LEN(p[prev]);
++ set = 1;
++ }
++ goto update_sectors;
++ }
++
++ /* Not front overlap, but behind overlap */
++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
++ len = BB_OFFSET(p[prev + 1]) - bad.start;
++ hint = prev + 1;
++ goto update_sectors;
++ }
++
++ /* not cover any badblocks range in the table */
++ len = sectors;
++
++update_sectors:
++ s += len;
++ sectors -= len;
++
++ if (sectors > 0)
++ goto re_check;
++
++ WARN_ON(sectors < 0);
++
++ if (unacked_badblocks > 0)
++ rv = -1;
++ else if (acked_badblocks > 0)
++ rv = 1;
++ else
++ rv = 0;
++
++ if (read_seqretry(&bb->lock, seq))
++ goto retry;
++
++ return rv;
++}
+
+ /**
+ * badblocks_check() - check a given range for bad sectors
+--
+2.31.1
+
diff --git a/for-test/badblocks/v5/v5-0006-badblocks-switch-to-the-improved-badblock-handlin.patch b/for-test/badblocks/v5/v5-0006-badblocks-switch-to-the-improved-badblock-handlin.patch
new file mode 100644
index 0000000..837c7fe
--- /dev/null
+++ b/for-test/badblocks/v5/v5-0006-badblocks-switch-to-the-improved-badblock-handlin.patch
@@ -0,0 +1,365 @@
+From d1f471dc0f862dfc71d3bbebc60631f83208217f Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 2 Mar 2021 10:48:43 +0800
+Subject: [PATCH v5 6/6] badblocks: switch to the improved badblock handling
+ code
+
+This patch removes the old code of badblocks_set(), badblocks_clear() and
+badblocks_check(), and makes them wrappers which call _badblocks_set(),
+_badblocks_clear() and _badblocks_check().
+
+With this change the badblocks handling now switches to the improved
+algorithms in _badblocks_set(), _badblocks_clear() and _badblocks_check().
+
+This patch only contains the deletion of the old code; the newly added
+code for the improved algorithms is in the previous patches.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ block/badblocks.c | 310 +---------------------------------------------
+ 1 file changed, 3 insertions(+), 307 deletions(-)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index bfade2434c74..78f2af9295e6 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -1391,75 +1391,7 @@ static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+ {
+- int hi;
+- int lo;
+- u64 *p = bb->page;
+- int rv;
+- sector_t target = s + sectors;
+- unsigned seq;
+-
+- if (bb->shift > 0) {
+- /* round the start down, and the end up */
+- s >>= bb->shift;
+- target += (1<<bb->shift) - 1;
+- target >>= bb->shift;
+- sectors = target - s;
+- }
+- /* 'target' is now the first block after the bad range */
+-
+-retry:
+- seq = read_seqbegin(&bb->lock);
+- lo = 0;
+- rv = 0;
+- hi = bb->count;
+-
+- /* Binary search between lo and hi for 'target'
+- * i.e. for the last range that starts before 'target'
+- */
+- /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
+- * are known not to be the last range before target.
+- * VARIANT: hi-lo is the number of possible
+- * ranges, and decreases until it reaches 1
+- */
+- while (hi - lo > 1) {
+- int mid = (lo + hi) / 2;
+- sector_t a = BB_OFFSET(p[mid]);
+-
+- if (a < target)
+- /* This could still be the one, earlier ranges
+- * could not.
+- */
+- lo = mid;
+- else
+- /* This and later ranges are definitely out. */
+- hi = mid;
+- }
+- /* 'lo' might be the last that started before target, but 'hi' isn't */
+- if (hi > lo) {
+- /* need to check all range that end after 's' to see if
+- * any are unacknowledged.
+- */
+- while (lo >= 0 &&
+- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+- if (BB_OFFSET(p[lo]) < target) {
+- /* starts before the end, and finishes after
+- * the start, so they must overlap
+- */
+- if (rv != -1 && BB_ACK(p[lo]))
+- rv = 1;
+- else
+- rv = -1;
+- *first_bad = BB_OFFSET(p[lo]);
+- *bad_sectors = BB_LEN(p[lo]);
+- }
+- lo--;
+- }
+- }
+-
+- if (read_seqretry(&bb->lock, seq))
+- goto retry;
+-
+- return rv;
++ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
+ }
+ EXPORT_SYMBOL_GPL(badblocks_check);
+
+@@ -1481,154 +1413,7 @@ EXPORT_SYMBOL_GPL(badblocks_check);
+ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
+ {
+- u64 *p;
+- int lo, hi;
+- int rv = 0;
+- unsigned long flags;
+-
+- if (bb->shift < 0)
+- /* badblocks are disabled */
+- return 1;
+-
+- if (bb->shift) {
+- /* round the start down, and the end up */
+- sector_t next = s + sectors;
+-
+- s >>= bb->shift;
+- next += (1<<bb->shift) - 1;
+- next >>= bb->shift;
+- sectors = next - s;
+- }
+-
+- write_seqlock_irqsave(&bb->lock, flags);
+-
+- p = bb->page;
+- lo = 0;
+- hi = bb->count;
+- /* Find the last range that starts at-or-before 's' */
+- while (hi - lo > 1) {
+- int mid = (lo + hi) / 2;
+- sector_t a = BB_OFFSET(p[mid]);
+-
+- if (a <= s)
+- lo = mid;
+- else
+- hi = mid;
+- }
+- if (hi > lo && BB_OFFSET(p[lo]) > s)
+- hi = lo;
+-
+- if (hi > lo) {
+- /* we found a range that might merge with the start
+- * of our new range
+- */
+- sector_t a = BB_OFFSET(p[lo]);
+- sector_t e = a + BB_LEN(p[lo]);
+- int ack = BB_ACK(p[lo]);
+-
+- if (e >= s) {
+- /* Yes, we can merge with a previous range */
+- if (s == a && s + sectors >= e)
+- /* new range covers old */
+- ack = acknowledged;
+- else
+- ack = ack && acknowledged;
+-
+- if (e < s + sectors)
+- e = s + sectors;
+- if (e - a <= BB_MAX_LEN) {
+- p[lo] = BB_MAKE(a, e-a, ack);
+- s = e;
+- } else {
+- /* does not all fit in one range,
+- * make p[lo] maximal
+- */
+- if (BB_LEN(p[lo]) != BB_MAX_LEN)
+- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
+- s = a + BB_MAX_LEN;
+- }
+- sectors = e - s;
+- }
+- }
+- if (sectors && hi < bb->count) {
+- /* 'hi' points to the first range that starts after 's'.
+- * Maybe we can merge with the start of that range
+- */
+- sector_t a = BB_OFFSET(p[hi]);
+- sector_t e = a + BB_LEN(p[hi]);
+- int ack = BB_ACK(p[hi]);
+-
+- if (a <= s + sectors) {
+- /* merging is possible */
+- if (e <= s + sectors) {
+- /* full overlap */
+- e = s + sectors;
+- ack = acknowledged;
+- } else
+- ack = ack && acknowledged;
+-
+- a = s;
+- if (e - a <= BB_MAX_LEN) {
+- p[hi] = BB_MAKE(a, e-a, ack);
+- s = e;
+- } else {
+- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
+- s = a + BB_MAX_LEN;
+- }
+- sectors = e - s;
+- lo = hi;
+- hi++;
+- }
+- }
+- if (sectors == 0 && hi < bb->count) {
+- /* we might be able to combine lo and hi */
+- /* Note: 's' is at the end of 'lo' */
+- sector_t a = BB_OFFSET(p[hi]);
+- int lolen = BB_LEN(p[lo]);
+- int hilen = BB_LEN(p[hi]);
+- int newlen = lolen + hilen - (s - a);
+-
+- if (s >= a && newlen < BB_MAX_LEN) {
+- /* yes, we can combine them */
+- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
+-
+- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
+- memmove(p + hi, p + hi + 1,
+- (bb->count - hi - 1) * 8);
+- bb->count--;
+- }
+- }
+- while (sectors) {
+- /* didn't merge (it all).
+- * Need to add a range just before 'hi'
+- */
+- if (bb->count >= MAX_BADBLOCKS) {
+- /* No room for more */
+- rv = 1;
+- break;
+- } else {
+- int this_sectors = sectors;
+-
+- memmove(p + hi + 1, p + hi,
+- (bb->count - hi) * 8);
+- bb->count++;
+-
+- if (this_sectors > BB_MAX_LEN)
+- this_sectors = BB_MAX_LEN;
+- p[hi] = BB_MAKE(s, this_sectors, acknowledged);
+- sectors -= this_sectors;
+- s += this_sectors;
+- }
+- }
+-
+- bb->changed = 1;
+- if (!acknowledged)
+- bb->unacked_exist = 1;
+- else
+- badblocks_update_acked(bb);
+- write_sequnlock_irqrestore(&bb->lock, flags);
+-
+- return rv;
++ return _badblocks_set(bb, s, sectors, acknowledged);
+ }
+ EXPORT_SYMBOL_GPL(badblocks_set);
+
+@@ -1648,96 +1433,7 @@ EXPORT_SYMBOL_GPL(badblocks_set);
+ */
+ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+ {
+- u64 *p;
+- int lo, hi;
+- sector_t target = s + sectors;
+- int rv = 0;
+-
+- if (bb->shift > 0) {
+- /* When clearing we round the start up and the end down.
+- * This should not matter as the shift should align with
+- * the block size and no rounding should ever be needed.
+- * However it is better the think a block is bad when it
+- * isn't than to think a block is not bad when it is.
+- */
+- s += (1<<bb->shift) - 1;
+- s >>= bb->shift;
+- target >>= bb->shift;
+- sectors = target - s;
+- }
+-
+- write_seqlock_irq(&bb->lock);
+-
+- p = bb->page;
+- lo = 0;
+- hi = bb->count;
+- /* Find the last range that starts before 'target' */
+- while (hi - lo > 1) {
+- int mid = (lo + hi) / 2;
+- sector_t a = BB_OFFSET(p[mid]);
+-
+- if (a < target)
+- lo = mid;
+- else
+- hi = mid;
+- }
+- if (hi > lo) {
+- /* p[lo] is the last range that could overlap the
+- * current range. Earlier ranges could also overlap,
+- * but only this one can overlap the end of the range.
+- */
+- if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
+- (BB_OFFSET(p[lo]) < target)) {
+- /* Partial overlap, leave the tail of this range */
+- int ack = BB_ACK(p[lo]);
+- sector_t a = BB_OFFSET(p[lo]);
+- sector_t end = a + BB_LEN(p[lo]);
+-
+- if (a < s) {
+- /* we need to split this range */
+- if (bb->count >= MAX_BADBLOCKS) {
+- rv = -ENOSPC;
+- goto out;
+- }
+- memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+- bb->count++;
+- p[lo] = BB_MAKE(a, s-a, ack);
+- lo++;
+- }
+- p[lo] = BB_MAKE(target, end - target, ack);
+- /* there is no longer an overlap */
+- hi = lo;
+- lo--;
+- }
+- while (lo >= 0 &&
+- (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
+- (BB_OFFSET(p[lo]) < target)) {
+- /* This range does overlap */
+- if (BB_OFFSET(p[lo]) < s) {
+- /* Keep the early parts of this range. */
+- int ack = BB_ACK(p[lo]);
+- sector_t start = BB_OFFSET(p[lo]);
+-
+- p[lo] = BB_MAKE(start, s - start, ack);
+- /* now low doesn't overlap, so.. */
+- break;
+- }
+- lo--;
+- }
+- /* 'lo' is strictly before, 'hi' is strictly after,
+- * anything between needs to be discarded
+- */
+- if (hi - lo > 1) {
+- memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
+- bb->count -= (hi - lo - 1);
+- }
+- }
+-
+- badblocks_update_acked(bb);
+- bb->changed = 1;
+-out:
+- write_sequnlock_irq(&bb->lock);
+- return rv;
++ return _badblocks_clear(bb, s, sectors);
+ }
+ EXPORT_SYMBOL_GPL(badblocks_clear);
+
+--
+2.31.1
+
diff --git a/for-test/badblocks/v5/v5-0007-test-user-space-code-to-test-badblocks-APIs.patch b/for-test/badblocks/v5/v5-0007-test-user-space-code-to-test-badblocks-APIs.patch
new file mode 100644
index 0000000..790b136
--- /dev/null
+++ b/for-test/badblocks/v5/v5-0007-test-user-space-code-to-test-badblocks-APIs.patch
@@ -0,0 +1,2303 @@
+From 249fc077edbeacb388b7aea11f1f2ce4c0a242c5 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Fri, 10 Dec 2021 14:30:26 +0800
+Subject: [PATCH v5] test: user space code to test badblocks APIs
+
+This is the user space test code to verify the badblocks APIs; it is not
+part of the kernel patches, so there is no need to review this patch.
+
+Except for badblocks_show(), the rest of the code logic for badblocks_set(),
+badblocks_clear() and badblocks_check() is identical to the kernel code.
+
+The basic idea of the testing code follows these steps,
+1) Generate a random bad blocks range (start offset and length), for a
+   random set or clear operation. See write_badblocks_file() for this.
+2) Call the badblocks_set() or badblocks_clear() APIs, and record the state
+   in a log file named with a seq- prefix. See write_badblocks_log() for
+   this.
+3) Write sectors into the dummy disk file for the corresponding bad blocks
+   range. E.g. an unacknowledged bad blocks setting writes value 1, an
+   acknowledged bad blocks setting writes value 2, and a clear writes
+   value 0. See _write_diskfile() for this.
+4) Compare all bad blocks ranges with the dummy disk file; if a sector from
+   the dummy disk file has an unexpected value for the corresponding bad
+   block range, stop the testing loop and ask people to do manual
+   verification from the seq-* log files. verify_badblocks_file() does the
+   verification (the marker-value idea is sketched right after this list).
+
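+The marker-value idea from steps 3) and 4) can be pictured with the toy sketch
+below. mark_range() and verify_range() are simplified, hypothetical stand-ins
+for _write_diskfile() and verify_badblocks_file(); the real implementations are
+in the attached badblocks.c.
+
+    #include <stdio.h>
+    #include <string.h>
+
+    #define DISK_SECTORS 64
+
+    /* one byte per sector: 0 = cleared, 1 = unacked bad, 2 = acked bad */
+    static unsigned char disk[DISK_SECTORS];
+
+    static void mark_range(int start, int len, unsigned char val)
+    {
+        memset(disk + start, val, len);
+    }
+
+    static int verify_range(int start, int len, unsigned char expected)
+    {
+        int i;
+
+        for (i = start; i < start + len; i++)
+            if (disk[i] != expected)
+                return 0;   /* mismatch: stop and check the seq-* logs */
+        return 1;
+    }
+
+    int main(void)
+    {
+        mark_range(10, 8, 2);   /* acked badblocks_set() on sectors 10..17 */
+        mark_range(12, 4, 0);   /* badblocks_clear() on sectors 12..15 */
+
+        printf("10..11 still bad: %d\n", verify_range(10, 2, 2));
+        printf("12..15 cleared:   %d\n", verify_range(12, 4, 0));
+        return 0;
+    }
+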
+With this testing code, most of the simple conditions are verified; only
+the complicated situations require a manual check.
+
+There are 3 parameters that can be modified in this test code (a small
+sketch of how they bound the random ranges follows this list),
+- MAX_BB_TEST_TRIES
+ The number of iterations of the bad blocks set/clear and verification
+loop; the loop may exit earlier if verify_badblocks_file() encounters an
+unexpected sector value and requires a manual check.
+- MAX_SET_SIZE
+ The max size of a random badblocks set range. A larger range may fill
+up all 512 badblock slots earlier.
+- MAX_CLN_SIZE
+ The max size of a random badblocks clear range. A larger range may
+prevent all 512 badblock slots from being fully filled.
+
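+A toy sketch of how these parameters bound the random ranges is shown below;
+the #define values and DISK_SECTORS are made up for illustration, the real
+values live in the attached badblocks.c.
+
+    #include <stdio.h>
+    #include <stdlib.h>
+
+    /* hypothetical values, for illustration only */
+    #define MAX_BB_TEST_TRIES 10000
+    #define MAX_SET_SIZE      64
+    #define MAX_CLN_SIZE      128
+    #define DISK_SECTORS      (1 << 20)
+
+    int main(void)
+    {
+        int i;
+
+        srand(1);
+        for (i = 0; i < MAX_BB_TEST_TRIES; i++) {
+            /* step 1): random ranges, lengths bounded by the two size knobs */
+            unsigned long long set_start = rand() % DISK_SECTORS;
+            int set_len = rand() % MAX_SET_SIZE + 1;
+            unsigned long long cln_start = rand() % DISK_SECTORS;
+            int cln_len = rand() % MAX_CLN_SIZE + 1;
+
+            /* steps 2)-4) would call badblocks_set()/badblocks_clear() and verify */
+            if (i == 0)
+                printf("first try: set %llu+%d, clear %llu+%d\n",
+                       set_start, set_len, cln_start, cln_len);
+        }
+        return 0;
+    }
+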
+Of course the testing code is not perfect; it is a best-effort attempt to
+verify simple conditions of bad blocks setting/clearing with randomly
+generated ranges. For complicated situations, manual checks by people are
+still necessary.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Richard Fan <richard.fan@suse.com>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+---
+ Makefile | 4 +
+ badblocks.c | 2222 +++++++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 2226 insertions(+)
+ create mode 100644 Makefile
+ create mode 100644 badblocks.c
+
+diff --git a/Makefile b/Makefile
+new file mode 100644
+index 0000000..2287363
+--- /dev/null
++++ b/Makefile
+@@ -0,0 +1,4 @@
++badblocks: badblocks.o
++ gcc -o badblocks -g3 -Wall badblocks.c
++clean:
++ rm -f badblocks badblocks.o
+diff --git a/badblocks.c b/badblocks.c
+new file mode 100644
+index 0000000..e5b2cd0
+--- /dev/null
++++ b/badblocks.c
+@@ -0,0 +1,2222 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Bad block management
++ *
++ * - Heavily based on MD badblocks code from Neil Brown
++ *
++ * Copyright (c) 2015, Intel Corporation.
++ *
++ * Improvement for handling multiple ranges by Coly Li <colyli@suse.de>
++ */
++
++#define _GNU_SOURCE /* See feature_test_macros(7) */
++#include <stdlib.h>
++#include <linux/types.h>
++#include <stdio.h>
++#include <errno.h>
++#include <string.h>
++#include <limits.h>
++#include <assert.h>
++#include <unistd.h>
++#include <sys/types.h>
++#include <sys/stat.h>
++#include <fcntl.h>
++
++extern int errno;
++
++#define PAGE_SIZE 4096
++typedef unsigned long long sector_t;
++typedef unsigned long long u64;
++typedef _Bool bool;
++
++#define BB_LEN_MASK (0x00000000000001FFULL)
++#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
++#define BB_ACK_MASK (0x8000000000000000ULL)
++#define BB_MAX_LEN 512
++#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
++#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
++#define BB_END(x) (BB_OFFSET(x) + BB_LEN(x))
++#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
++#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
++
++/* Bad block numbers are stored in a single page.
++ * 64bits is used for each block or extent.
++ * 54 bits are sector number, 9 bits are extent size,
++ * 1 bit is an 'acknowledged' flag.
++ */
++#define MAX_BADBLOCKS (PAGE_SIZE/8)
++#define GFP_KERNEL 0
++#define true 1
++#define false 0
++
++#define WARN_ON(condition) ({ \
++ if (!!(condition)) \
++ printf("warning on %s:%d\n", __func__, __LINE__); \
++})
++
++#define BUG() ({printf("BUG on %s:%d\n", __func__, __LINE__); exit(1);})
++
++struct device {
++ int val;
++};
++
++struct badblocks {
++ struct device *dev;
++ int count; /* count of bad blocks */
++ int unacked_exist; /* there probably are unacknowledged
++ * bad blocks. This is only cleared
++ * when a read discovers none
++ */
++ int shift; /* shift from sectors to block size
++ * a -ve shift means badblocks are
++ * disabled.*/
++ u64 *page; /* badblock list */
++ int changed;
++ unsigned long lock;
++ sector_t sector;
++ sector_t size; /* in sectors */
++};
++
++struct badblocks_context {
++ sector_t start;
++ sector_t len;
++ sector_t orig_start;
++ sector_t orig_len;
++ int ack;
++ int first_prev;
++};
++
++int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
++ sector_t *first_bad, int *bad_sectors);
++int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
++ int acknowledged);
++int badblocks_clear(struct badblocks *bb, sector_t s, int sectors);
++void ack_all_badblocks(struct badblocks *bb);
++ssize_t badblocks_show(struct badblocks *bb, int unack);
++ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
++ int unack);
++int badblocks_init(struct badblocks *bb, int enable);
++void badblocks_exit(struct badblocks *bb);
++
++static inline void* kzalloc(int size, int flag)
++{
++ void * p = malloc(size);
++ memset(p, 0, size);
++ return p;
++}
++
++static inline void kfree(void* page)
++{
++ free(page);
++}
++
++#define roundup(x, y) ( \
++{ \
++ typeof(y) __y = y; \
++ (((x) + (__y - 1)) / __y) * __y; \
++} \
++)
++
++#define rounddown(x, y) ( \
++{ \
++ typeof(x) __x = (x); \
++ __x - (__x % (y)); \
++} \
++)
++
++#define fallthrough do{}while(0)
++
++/**
++ * min - return minimum of two values of the same or compatible types
++ * @x: first value
++ * @y: second value
++ */
++#define min(x, y) ((x) < (y) ? (x) : (y))
++#define min_t(t, x, y) ((x) < (y) ? (x) : (y))
++
++#define write_seqlock_irqsave(_lock, _flags) ((_flags) = *(_lock))
++#define write_sequnlock_irqrestore(_lock, _flags) ((*(_lock)) = (_flags))
++#define write_seqlock_irq(lock) do{}while(0)
++#define write_sequnlock_irq(lock) do{}while(0)
++#define read_seqbegin(lock) 1
++#define read_seqretry(lock, seq) (!!((seq) && 0))
++#define seqlock_init(lock) do{}while(0)
++#define EXPORT_SYMBOL_GPL(sym)
++
++static void *devm_kzalloc(struct device *dev, int size, int flags)
++{
++ void * buf = malloc(size);
++ if (buf)
++ memset(buf, 0, size);
++ return buf;
++}
++
++static void devm_kfree(struct device *dev, void *mem)
++{
++ free(mem);
++}
++
++static inline int badblocks_full(struct badblocks *bb)
++{
++ return (bb->count >= MAX_BADBLOCKS);
++}
++
++static inline int badblocks_empty(struct badblocks *bb)
++{
++ return (bb->count == 0);
++}
++
++static inline void set_changed(struct badblocks *bb)
++{
++ if (bb->changed != 1)
++ bb->changed = 1;
++}
++
++/*
++ * The purpose of badblocks set/clear is to manage bad blocks ranges which are
++ * identified by LBA addresses.
++ *
++ * When the caller of badblocks_set() wants to set a range of bad blocks, the
++ * setting range can be acked or unacked. The setting range may merge with,
++ * overwrite, or skip the overlapped already set range, depending on how they
++ * overlap or are adjacent, and on the acknowledgment type of the ranges. It can
++ * be more complicated when the setting range covers multiple already set bad
++ * block ranges, with the restrictions of the maximum length of each bad range
++ * and the bad table space limitation.
++ *
++ * It is difficult and unnecessary to take care of all the possible situations.
++ * For setting a large range of bad blocks, we can handle it by dividing the
++ * large range into smaller ones when encountering overlap, max range length or
++ * bad table full conditions. Each time only a smaller piece of the bad range
++ * is handled, with a limited number of conditions for how it interacts with
++ * possibly overlapped or adjacent already set bad block ranges. Then the hard,
++ * complicated problem can be handled in a much simpler and proper way.
++ *
++ * When setting a range of bad blocks in the bad table, the simplified
++ * situations to be considered are (the already set bad block ranges are named
++ * with prefix E, and the setting bad block range is named with prefix S),
++ *
++ * 1) A setting range is not overlapped or adjacent to any other already set bad
++ * block range.
++ * +--------+
++ * | S |
++ * +--------+
++ * +-------------+ +-------------+
++ * | E1 | | E2 |
++ * +-------------+ +-------------+
++ * For this situation if the bad blocks table is not full, just allocate a
++ * free slot from the bad blocks table to mark the setting range S. The
++ * result is,
++ * +-------------+ +--------+ +-------------+
++ * | E1 | | S | | E2 |
++ * +-------------+ +--------+ +-------------+
++ * 2) A setting range starts exactly at a start LBA of an already set bad blocks
++ * range.
++ * 2.1) The setting range size < already set range size
++ * +--------+
++ * | S |
++ * +--------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 2.1.1) If S and E are both acked or unacked range, the setting range S can
++ * be merged into existing bad range E. The result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and
++ * the result is,
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E.
++ * An extra slot from the bad blocks table will be allocated for S, and head
++ * of E will move to end of the inserted range S. The result is,
++ * +--------+----+
++ * | S | E |
++ * +--------+----+
++ * 2.2) The setting range size == already set range size
++ * 2.2.1) If S and E are both acked or unacked range, the setting range S can
++ * be merged into existing bad range E. The result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and
++ * the result is,
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of
++ bad blocks range E. The result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 2.3) The setting range size > already set range size
++ * +-------------------+
++ * | S |
++ * +-------------------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * For such situation, the setting range S can be treated as two parts, the
++ *    first part (S1) is the same size as the already set range E, the second
++ * part (S2) is the rest of setting range.
++ * +-------------+-----+ +-------------+ +-----+
++ * | S1 | S2 | | S1 | | S2 |
++ * +-------------+-----+ ===> +-------------+ +-----+
++ * +-------------+ +-------------+
++ * | E | | E |
++ * +-------------+ +-------------+
++ * Now we only focus on how to handle the setting range S1 and already set
++ * range E, which are already explained in 2.2), for the rest S2 it will be
++ * handled later in next loop.
++ * 3) A setting range starts before the start LBA of an already set bad blocks
++ * range.
++ * +-------------+
++ * | S |
++ * +-------------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * For this situation, the setting range S can be divided into two parts, the
++ * first (S1) ends at the start LBA of already set range E, the second part
++ * (S2) starts exactly at a start LBA of the already set range E.
++ * +----+---------+ +----+ +---------+
++ * | S1 | S2 | | S1 | | S2 |
++ * +----+---------+ ===> +----+ +---------+
++ * +-------------+ +-------------+
++ * | E | | E |
++ * +-------------+ +-------------+
++ * Now only the first part S1 should be handled in this loop, which is in
++ * similar condition as 1). The rest part S2 has exact same start LBA address
++ * of the already set range E, they will be handled in next loop in one of
++ * situations in 2).
++ * 4) A setting range starts after the start LBA of an already set bad blocks
++ * range.
++ * 4.1) If the setting range S exactly matches the tail part of already set bad
++ * blocks range E, like the following chart shows,
++ * +---------+
++ * | S |
++ * +---------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked),
++ * they will be merged into one, the result is,
++ * +-------------+
++ * | S |
++ * +-------------+
++ * 4.1.2) If range E is acked and the setting range S is unacked, the setting
++ * request of S will be rejected, the result is,
++ * +-------------+
++ * | E |
++ * +-------------+
++ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may
++ * overwrite the overlapped range of E, the result is,
++ * +---+---------+
++ * | E | S |
++ * +---+---------+
++ * 4.2) If the setting range S stays in middle of an already set range E, like
++ * the following chart shows,
++ * +----+
++ * | S |
++ * +----+
++ * +--------------+
++ * | E |
++ * +--------------+
++ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked),
++ * they will be merged into one, the result is,
++ * +--------------+
++ * | S |
++ * +--------------+
++ * 4.2.2) If range E is acked and the setting range S is unacked, the setting
++ * request of S will be rejected, the result is also,
++ * +--------------+
++ * | E |
++ * +--------------+
++ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will
++ *    be inserted into the middle of E and split the previous range E into two
++ *    parts (E1 and E2), the result is,
++ * +----+----+----+
++ * | E1 | S | E2 |
++ * +----+----+----+
++ * 4.3) If the setting bad blocks range S is overlapped with an already set bad
++ * blocks range E. The range S starts after the start LBA of range E, and
++ * ends after the end LBA of range E, as the following chart shows,
++ * +-------------------+
++ * | S |
++ * +-------------------+
++ * +-------------+
++ * | E |
++ * +-------------+
++ *    For this situation the range S can be divided into two parts, the first
++ *    part (S1) ends at the end of range E, and the second part (S2) is the
++ *    rest of the original range S.
++ * +---------+---------+ +---------+ +---------+
++ * | S1 | S2 | | S1 | | S2 |
++ * +---------+---------+ ===> +---------+ +---------+
++ * +-------------+ +-------------+
++ * | E | | E |
++ * +-------------+ +-------------+
++ * Now in this loop the setting range S1 and already set range E can be
++ * handled as the situations 4), the rest range S2 will be handled in next
++ * loop and ignored in this loop.
++ * 5) A setting bad blocks range S is adjacent to one or more already set bad
++ * blocks range(s), and they are all acked or unacked range.
++ * 5.1) Front merge: If the already set bad blocks range E is before setting
++ * range S and they are adjacent,
++ * +------+
++ * | S |
++ * +------+
++ * +-------+
++ * | E |
++ * +-------+
++ * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge
++ *    values are the same, the setting range S can front merge into range E. The
++ * result is,
++ * +--------------+
++ * | S |
++ * +--------------+
++ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting
++ * range S right after already set range E into the bad blocks table. The
++ * result is,
++ * +--------+------+
++ * | E | S |
++ * +--------+------+
++ * 6) Special cases which above conditions cannot handle
++ * 6.1) Multiple already set ranges may merge into less ones in a full bad table
++ * +-------------------------------------------------------+
++ * | S |
++ * +-------------------------------------------------------+
++ * |<----- BB_MAX_LEN ----->|
++ * +-----+ +-----+ +-----+
++ * | E1 | | E2 | | E3 |
++ * +-----+ +-----+ +-----+
++ *    In the above example, when the bad blocks table is full, inserting the
++ *    first part of setting range S will fail because no more available slots
++ *    can be allocated from the bad blocks table. In this situation a proper
++ *    setting method should go through all the setting bad blocks range and
++ *    look for chances to merge already set ranges into fewer ones. When a
++ *    slot becomes available in the bad blocks table, retry to handle as many
++ *    of the setting bad blocks ranges as possible.
++ * +------------------------+
++ * | S3 |
++ * +------------------------+
++ * |<----- BB_MAX_LEN ----->|
++ * +-----+-----+-----+---+-----+--+
++ * | S1 | S2 |
++ * +-----+-----+-----+---+-----+--+
++ *    The above chart shows that although the first part (S3) cannot be inserted
++ *    due to no space in the bad blocks table, the following E1, E2 and E3
++ *    ranges can be merged with the rest of S into fewer ranges S1 and S2. Now
++ *    there is 1 free slot in the bad blocks table.
++ * +------------------------+-----+-----+-----+---+-----+--+
++ * | S3 | S1 | S2 |
++ * +------------------------+-----+-----+-----+---+-----+--+
++ *    Since the bad blocks table is not full anymore, retry the original
++ *    setting range S. Now the setting range S3 can be inserted into the bad
++ *    blocks table, using the slot previously freed by the multiple-range merge.
++ * 6.2) Front merge after overwrite
++ * In the following example, in bad blocks table, E1 is an acked bad blocks
++ * range and E2 is an unacked bad blocks range, therefore they are not able
++ * to merge into a larger range. The setting bad blocks range S is acked,
++ * therefore part of E2 can be overwritten by S.
++ * +--------+
++ * | S | acknowledged
++ * +--------+ S: 1
++ * +-------+-------------+ E1: 1
++ * | E1 | E2 | E2: 0
++ * +-------+-------------+
++ * With previous simplified routines, after overwriting part of E2 with S,
++ * the bad blocks table should be (E3 is remaining part of E2 which is not
++ * overwritten by S),
++ * acknowledged
++ * +-------+--------+----+ S: 1
++ * | E1 | S | E3 | E1: 1
++ * +-------+--------+----+ E3: 0
++ *    The above result is correct but not perfect. Ranges E1 and S in the bad
++ *    blocks table are both acked; merging them into one larger range may
++ *    occupy less bad blocks table space and make badblocks_check() faster.
++ *    Therefore in such a situation, after overwriting range S, the previous
++ *    range E1 should be checked for a possible front combination. Then the
++ *    ideal result can be,
++ * +----------------+----+ acknowledged
++ * | E1 | E3 | E1: 1
++ * +----------------+----+ E3: 0
++ * 6.3) Behind merge: the already set bad blocks range E is behind the setting
++ *    range S and they are adjacent. Normally we don't need to care about this
++ *    because the front merge handles it while going through range S from head
++ *    to tail, except for the tail part of range S. When the setting range S is
++ *    fully handled, none of the above simplified routines checks whether the
++ *    tail LBA of range S is adjacent to the next already set range, so they
++ *    are not merged even if they are mergeable.
++ * +------+
++ * | S |
++ * +------+
++ * +-------+
++ * | E |
++ * +-------+
++ *    For the above special situation, when the setting range S is fully handled
++ *    and the loop ends, an extra check is necessary for whether the next
++ *    already set range E is right after S and mergeable.
++ * 6.3.1) When the total size of range E and S <= BB_MAX_LEN, and their acknowledge
++ *    values are the same, the setting range S can behind merge into range E. The
++ * result is,
++ * +--------------+
++ * | S |
++ * +--------------+
++ * 6.3.2) Otherwise these two ranges cannot merge, just insert the setting range
++ * S in front of the already set range E in the bad blocks table. The result
++ * is,
++ * +------+-------+
++ * | S | E |
++ * +------+-------+
++ *
++ * All the above 5 simplified situations and 3 special cases may cover 99%+ of
++ * the bad block range setting conditions. Maybe some rare corner case is not
++ * considered or optimized, but it won't hurt if badblocks_set() fails due to
++ * no space, or if some ranges are not merged to save bad blocks table space.
++ *
++ * Inside badblocks_set() each loop starts by jumping to the re_insert label;
++ * every time in a new loop prev_badblocks() is called to find an already set
++ * range which starts before or at the current setting range. Since the setting
++ * bad blocks range is handled from head to tail, in most cases it is
++ * unnecessary to do the binary search inside prev_badblocks(); it is possible
++ * to provide a hint to prev_badblocks() for a fast path, then the expensive
++ * binary search can be avoided. In my test with the hint to prev_badblocks(),
++ * except for the first loop, all remaining calls to prev_badblocks() can go
++ * into the fast path and return the correct bad blocks table index immediately.
++ *
++ *
++ * Clearing a bad blocks range from the bad block table has a similar idea to
++ * setting, but is much simpler. The only thing that needs to be noticed is
++ * that when the clearing range hits the middle of a bad block range, the
++ * existing bad block range will split into two, and one more item should be
++ * added into the bad block table. The simplified situations to be considered
++ * are (the already set bad block ranges in the bad block table are named with
++ * prefix E, and the clearing bad block range is named with prefix C),
++ *
++ * 1) A clearing range is not overlapped to any already set ranges in bad block
++ * table.
++ * +-----+ | +-----+ | +-----+
++ * | C | | | C | | | C |
++ * +-----+ or +-----+ or +-----+
++ * +---+ | +----+ +----+ | +---+
++ * | E | | | E1 | | E2 | | | E |
++ * +---+ | +----+ +----+ | +---+
++ * For the above situations, no bad block to be cleared and no failure
++ * happens, simply returns 0.
++ * 2) The clearing range hits the middle of an already set bad block range in
++ * the bad block table.
++ * +---+
++ * | C |
++ * +---+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ * In this situation if the bad block table is not full, the range E will be
++ * split into two ranges E1 and E2. The result is,
++ * +------+ +------+
++ * | E1 | | E2 |
++ * +------+ +------+
++ * 3) The clearing range starts exactly at same LBA as an already set bad block range
++ * from the bad block table.
++ * 3.1) Partially covered at head part
++ * +------------+
++ * | C |
++ * +------------+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ * For this situation, the overlapped already set range will update the
++ * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). No
++ * item deleted from bad block table. The result is,
++ * +----+
++ * | E1 |
++ * +----+
++ * 3.2) Exact fully covered
++ * +-----------------+
++ * | C |
++ * +-----------------+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ * For this situation the whole bad blocks range E will be cleared and its
++ *    corresponding item is deleted from the bad block table.
++ * 4) The clearing range exactly ends at same LBA as an already set bad block
++ * range.
++ * +-------+
++ * | C |
++ * +-------+
++ * +-----------------+
++ * | E |
++ * +-----------------+
++ * For the above situation, the already set range E is updated to shrink its
++ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C).
++ * The result is,
++ * +---------+
++ * | E |
++ * +---------+
++ * 5) The clearing range is partially overlapped with an already set bad block
++ * range from the bad block table.
++ * 5.1) The already set bad block range is front overlapped with the clearing
++ * range.
++ * +----------+
++ * | C |
++ * +----------+
++ * +------------+
++ * | E |
++ * +------------+
++ * For such situation, the clearing range C can be treated as two parts. The
++ * first part ends at the start LBA of range E, and the second part starts at
++ * same LBA of range E.
++ * +----+-----+ +----+ +-----+
++ * | C1 | C2 | | C1 | | C2 |
++ * +----+-----+ ===> +----+ +-----+
++ * +------------+ +------------+
++ * | E | | E |
++ * +------------+ +------------+
++ * Now the first part C1 can be handled as condition 1), and the second part C2 can be
++ * handled as condition 3.1) in next loop.
++ * 5.2) The already set bad block range is behind overlapped with the clearing
++ * range.
++ * +----------+
++ * | C |
++ * +----------+
++ * +------------+
++ * | E |
++ * +------------+
++ * For such situation, the clearing range C can be treated as two parts. The
++ * first part C1 ends at same end LBA of range E, and the second part starts
++ * at end LBA of range E.
++ * +----+-----+ +----+ +-----+
++ * | C1 | C2 | | C1 | | C2 |
++ * +----+-----+ ===> +----+ +-----+
++ * +------------+ +------------+
++ * | E | | E |
++ * +------------+ +------------+
++ * Now the first part clearing range C1 can be handled as condition 4), and
++ * the second part clearing range C2 can be handled as condition 1) in next
++ * loop.
++ *
++ * All bad blocks range clearing can be simplified into the above 5 situations
++ * by only handling the head part of the clearing range in each run of the
++ * while-loop. The idea is similar to bad blocks range setting but much
++ * simpler.
++ */
++
++/*
++ * Find the range starts at-or-before 's' from bad table. The search
++ * starts from index 'hint' and stops at index 'hint_end' from the bad
++ * table.
++ */
++static int prev_by_hint(struct badblocks *bb, sector_t s, int hint)
++{
++ int hint_end = hint + 2;
++ u64 *p = bb->page;
++ int ret = -1;
++
++ while ((hint < hint_end) && ((hint + 1) <= bb->count) &&
++ (BB_OFFSET(p[hint]) <= s)) {
++ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) {
++ ret = hint;
++ break;
++ }
++ hint++;
++ }
++
++ return ret;
++}
++
++/*
++ * Find the range starts at-or-before bad->start. If 'hint' is provided
++ * (hint >= 0) then search in the bad table from hint firstly. It is
++ * very probably the wanted bad range can be found from the hint index,
++ * then the unnecessary while-loop iteration can be avoided.
++ */
++static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad,
++ int hint)
++{
++ sector_t s = bad->start;
++ int ret = -1;
++ int lo, hi;
++ u64 *p;
++
++ if (!bb->count)
++ goto out;
++
++ if (hint >= 0) {
++ ret = prev_by_hint(bb, s, hint);
++ if (ret >= 0)
++ goto out;
++ }
++
++ lo = 0;
++ hi = bb->count;
++ p = bb->page;
++
++ while (hi - lo > 1) {
++ int mid = (lo + hi)/2;
++ sector_t a = BB_OFFSET(p[mid]);
++
++ if (a <= s)
++ lo = mid;
++ else
++ hi = mid;
++ }
++
++ if (BB_OFFSET(p[lo]) <= s)
++ ret = lo;
++out:
++ return ret;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' can be backward merged
++ * with the bad range (from the bad table) index by 'behind'.
++ */
++static bool can_merge_behind(struct badblocks *bb, struct badblocks_context *bad,
++ int behind)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ u64 *p = bb->page;
++
++ if ((s <= BB_OFFSET(p[behind])) &&
++ ((s + sectors) >= BB_OFFSET(p[behind])) &&
++ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) &&
++ BB_ACK(p[behind]) == bad->ack)
++ return true;
++ return false;
++}
++
++/*
++ * Do backward merge for range indicated by 'bad' and the bad range
++ * (from the bad table) indexed by 'behind'. The return value is merged
++ * sectors from bad->len.
++ */
++static int behind_merge(struct badblocks *bb, struct badblocks_context *bad,
++ int behind)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ u64 *p = bb->page;
++ int merged = 0;
++
++ WARN_ON(s > BB_OFFSET(p[behind]));
++ WARN_ON((s + sectors) < BB_OFFSET(p[behind]));
++
++ if (s < BB_OFFSET(p[behind])) {
++ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN);
++
++ merged = min_t(sector_t, sectors, BB_OFFSET(p[behind]) - s);
++ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack);
++ } else {
++ merged = min_t(sector_t, sectors, BB_LEN(p[behind]));
++ }
++
++ WARN_ON(merged == 0);
++
++ return merged;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' can be forward
++ * merged with the bad range (from the bad table) indexed by 'prev'.
++ */
++static bool can_merge_front(struct badblocks *bb, int prev,
++ struct badblocks_context *bad)
++{
++ sector_t s = bad->start;
++ u64 *p = bb->page;
++
++ if (BB_ACK(p[prev]) == bad->ack &&
++ (s < BB_END(p[prev]) ||
++ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN))))
++ return true;
++ return false;
++}
++
++/*
++ * Do forward merge for range indicated by 'bad' and the bad range
++ * (from bad table) indexed by 'prev'. The return value is sectors
++ * merged from bad->len.
++ */
++static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ u64 *p = bb->page;
++ int merged = 0;
++
++ WARN_ON(s > BB_END(p[prev]));
++
++ if (s < BB_END(p[prev])) {
++ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s);
++ } else {
++ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev]));
++ if ((prev + 1) < bb->count &&
++ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) {
++ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]);
++ }
++
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ BB_LEN(p[prev]) + merged, bad->ack);
++ }
++
++ return merged;
++}
++
++/*
++ * 'Combine' is a special case which can_merge_front() is not able to
++ * handle: If a bad range (indexed by 'prev' from bad table) exactly
++ * starts as bad->start, and the bad range ahead of 'prev' (indexed by
++ * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and
++ * the sum of their lengths does not exceed BB_MAX_LEN limitation, then
++ * these two bad range (from bad table) can be combined.
++ *
++ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad
++ * table can be combined.
++ */
++static bool can_combine_front(struct badblocks *bb, int prev,
++ struct badblocks_context *bad)
++{
++ u64 *p = bb->page;
++
++ if ((prev > 0) &&
++ (BB_OFFSET(p[prev]) == bad->start) &&
++ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) &&
++ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) &&
++ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev])))
++ return true;
++ return false;
++}
++
++/*
++ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad
++ * table) into one larger bad range, and the new range is indexed by
++ * 'prev - 1'.
++ */
++static void front_combine(struct badblocks *bb, int prev)
++{
++ u64 *p = bb->page;
++
++ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]),
++ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]),
++ BB_ACK(p[prev]));
++ if ((prev + 1) < bb->count)
++ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8);
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' is exactly forward
++ * overlapped with the bad range (from bad table) indexed by 'front'.
++ * Exactly forward overlap means the bad range (from bad table) indexed
++ * by 'prev' does not cover the whole range indicated by 'bad'.
++ */
++static bool overlap_front(struct badblocks *bb, int front,
++ struct badblocks_context *bad)
++{
++ u64 *p = bb->page;
++
++ if (bad->start >= BB_OFFSET(p[front]) &&
++ bad->start < BB_END(p[front]))
++ return true;
++ return false;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' is exactly backward
++ * overlapped with the bad range (from bad table) indexed by 'behind'.
++ */
++static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad,
++ int behind)
++{
++ u64 *p = bb->page;
++
++ if (bad->start < BB_OFFSET(p[behind]) &&
++ (bad->start + bad->len) > BB_OFFSET(p[behind]))
++ return true;
++ return false;
++}
++
++/*
++ * Return 'true' if the range indicated by 'bad' can overwrite the bad
++ * range (from bad table) indexed by 'prev'.
++ *
++ * The range indicated by 'bad' can overwrite the bad range indexed by
++ * 'prev' when,
++ * 1) The whole range indicated by 'bad' can cover partial or whole bad
++ * range (from bad table) indexed by 'prev'.
++ * 2) The ack value of 'bad' is larger than the ack value of the bad
++ * range 'prev'.
++ *
++ * If the overwriting doesn't cover the whole bad range (from bad table)
++ * indexed by 'prev', new range(s) might be split from the existing bad
++ * range,
++ * 1) If the overwrite covers the head or tail of the existing bad range,
++ * 1 extra bad range will be split and added into the bad table.
++ * 2) If the overwrite covers the middle of the existing bad range, 2
++ * extra bad ranges will be split (ahead of and after the overwritten
++ * range) and added into the bad table.
++ * The number of extra split ranges of the overwriting is stored in
++ * 'extra' and returned for the caller.
++ */
++static bool can_front_overwrite(struct badblocks *bb, int prev,
++ struct badblocks_context *bad, int *extra)
++{
++ u64 *p = bb->page;
++ int len;
++
++ WARN_ON(!overlap_front(bb, prev, bad));
++
++ if (BB_ACK(p[prev]) >= bad->ack)
++ return false;
++
++ if (BB_END(p[prev]) <= (bad->start + bad->len)) {
++ len = BB_END(p[prev]) - bad->start;
++ if (BB_OFFSET(p[prev]) == bad->start)
++ *extra = 0;
++ else
++ *extra = 1;
++
++ bad->len = len;
++ } else {
++ if (BB_OFFSET(p[prev]) == bad->start)
++ *extra = 1;
++ else
++ /*
++ * The 'prev' range will be split into two; besides the
++ * overwritten one, an extra slot is needed from the bad table.
++ */
++ *extra = 2;
++ }
++
++ if ((bb->count + (*extra)) >= MAX_BADBLOCKS)
++ return false;
++
++ return true;
++}
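++
++/*
++ * Illustrative cases for '*extra' (hypothetical numbers): with an
++ * unacknowledged p[prev] covering [100, 200), an acknowledged 'bad'
++ * of [100, 300) gives *extra == 0 (bad->len is trimmed to 100),
++ * [150, 300) gives *extra == 1 (the head [100, 150) keeps its own
++ * slot), and [150, 180) gives *extra == 2 (both [100, 150) and
++ * [180, 200) must be kept as separate slots).
++ */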
++
++/*
++ * Do the overwrite from the range indicated by 'bad' to the bad range
++ * (from bad table) indexed by 'prev'.
++ * The previously called can_front_overwrite() will provide how many
++ * extra bad range(s) might be split and added into the bad table. All
++ * the splitting cases in the bad table will be handled here.
++ */
++static int front_overwrite(struct badblocks *bb, int prev,
++ struct badblocks_context *bad, int extra)
++{
++ u64 *p = bb->page;
++ sector_t orig_end = BB_END(p[prev]);
++ int orig_ack = BB_ACK(p[prev]);
++
++ switch (extra) {
++ case 0:
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]),
++ bad->ack);
++ break;
++ case 1:
++ if (BB_OFFSET(p[prev]) == bad->start) {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ bad->len, bad->ack);
++ memmove(p + prev + 2, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(bad->start + bad->len,
++ orig_end - BB_END(p[prev]),
++ orig_ack);
++ } else {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ bad->start - BB_OFFSET(p[prev]),
++ BB_ACK(p[prev]));
++ /*
++ * prev + 2 -> prev + 1 + 1, which is for,
++ * 1) prev + 1: the slot index of the previous one
++ * 2) + 1: one more slot for extra being 1.
++ */
++ memmove(p + prev + 2, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
++ }
++ break;
++ case 2:
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ bad->start - BB_OFFSET(p[prev]),
++ BB_ACK(p[prev]));
++ /*
++ * prev + 3 -> prev + 1 + 2, which is for,
++ * 1) prev + 1: the slot index of the previous one
++ * 2) + 2: two more slots for extra being 2.
++ */
++ memmove(p + prev + 3, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
++ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]),
++ orig_end - BB_END(p[prev + 1]),
++ BB_ACK(p[prev]));
++ break;
++ default:
++ break;
++ }
++
++ return bad->len;
++}
++
++/*
++ * Explicitly insert a range indicated by 'bad' to the bad table, where
++ * the location is indexed by 'at'.
++ */
++static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad)
++{
++ u64 *p = bb->page;
++ int len;
++
++ WARN_ON(badblocks_full(bb));
++
++ len = min_t(sector_t, bad->len, BB_MAX_LEN);
++ if (at < bb->count)
++ memmove(p + at + 1, p + at, (bb->count - at) * 8);
++ p[at] = BB_MAKE(bad->start, len, bad->ack);
++
++ return len;
++}
++
++static void badblocks_update_acked(struct badblocks *bb)
++{
++ bool unacked = false;
++ u64 *p = bb->page;
++ int i;
++
++ if (!bb->unacked_exist)
++ return;
++
++ for (i = 0; i < bb->count ; i++) {
++ if (!BB_ACK(p[i])) {
++ unacked = true;
++ break;
++ }
++ }
++
++ if (!unacked)
++ bb->unacked_exist = 0;
++}
++
++/* Do the exact work to set a bad block range into the bad block table */
++static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
++ int acknowledged)
++{
++ int retried = 0, space_desired = 0;
++ int orig_len, len = 0, added = 0;
++ struct badblocks_context bad;
++ int prev = -1, hint = -1;
++ sector_t orig_start;
++ unsigned long flags;
++ int rv = 0;
++ u64 *p;
++
++ if (bb->shift < 0)
++ /* badblocks are disabled */
++ return 1;
++
++ if (sectors == 0)
++ /* Invalid sectors number */
++ return 1;
++
++ if (bb->shift) {
++ /* round the start down, and the end up */
++ sector_t next = s + sectors;
++
++ rounddown(s, bb->shift);
++ roundup(next, bb->shift);
++ sectors = next - s;
++ }
++
++ write_seqlock_irqsave(&bb->lock, flags);
++
++ orig_start = s;
++ orig_len = sectors;
++ bad.ack = acknowledged;
++ p = bb->page;
++
++re_insert:
++ bad.start = s;
++ bad.len = sectors;
++ len = 0;
++
++ if (badblocks_empty(bb)) {
++ len = insert_at(bb, 0, &bad);
++ bb->count++;
++ added++;
++ goto update_sectors;
++ }
++
++ prev = prev_badblocks(bb, &bad, hint);
++
++ /* start before all badblocks */
++ if (prev < 0) {
++ if (!badblocks_full(bb)) {
++ /* insert on the first */
++ if (bad.len > (BB_OFFSET(p[0]) - bad.start))
++ bad.len = BB_OFFSET(p[0]) - bad.start;
++ len = insert_at(bb, 0, &bad);
++ bb->count++;
++ added++;
++ hint = 0;
++ goto update_sectors;
++ }
++
++ /* No space, try to merge */
++ if (overlap_behind(bb, &bad, 0)) {
++ if (can_merge_behind(bb, &bad, 0)) {
++ len = behind_merge(bb, &bad, 0);
++ added++;
++ } else {
++ len = min_t(sector_t,
++ BB_OFFSET(p[0]) - s, sectors);
++ space_desired = 1;
++ }
++ hint = 0;
++ goto update_sectors;
++ }
++
++ /* no table space and give up */
++ goto out;
++ }
++
++ /* in case p[prev-1] can be merged with p[prev] */
++ if (can_combine_front(bb, prev, &bad)) {
++ front_combine(bb, prev);
++ bb->count--;
++ added++;
++ hint = prev;
++ goto update_sectors;
++ }
++
++ if (overlap_front(bb, prev, &bad)) {
++ if (can_merge_front(bb, prev, &bad)) {
++ len = front_merge(bb, prev, &bad);
++ added++;
++ } else {
++ int extra = 0;
++
++ if (!can_front_overwrite(bb, prev, &bad, &extra)) {
++ len = min_t(sector_t,
++ BB_END(p[prev]) - s, sectors);
++ hint = prev;
++ goto update_sectors;
++ }
++
++ len = front_overwrite(bb, prev, &bad, extra);
++ added++;
++ bb->count += extra;
++
++ if (can_combine_front(bb, prev, &bad)) {
++ front_combine(bb, prev);
++ bb->count--;
++ }
++ }
++ hint = prev;
++ goto update_sectors;
++ }
++
++ if (can_merge_front(bb, prev, &bad)) {
++ len = front_merge(bb, prev, &bad);
++ added++;
++ hint = prev;
++ goto update_sectors;
++ }
++
++ /* if no space in table, still try to merge in the covered range */
++ if (badblocks_full(bb)) {
++ /* skip the cannot-merge range */
++ if (((prev + 1) < bb->count) &&
++ overlap_behind(bb, &bad, prev + 1) &&
++ ((s + sectors) >= BB_END(p[prev + 1]))) {
++ len = BB_END(p[prev + 1]) - s;
++ hint = prev + 1;
++ goto update_sectors;
++ }
++
++ /* no retry any more */
++ len = sectors;
++ space_desired = 1;
++ hint = -1;
++ goto update_sectors;
++ }
++
++ /* cannot merge and there is space in bad table */
++ if ((prev + 1) < bb->count &&
++ overlap_behind(bb, &bad, prev + 1))
++ bad.len = min_t(sector_t,
++ bad.len, BB_OFFSET(p[prev + 1]) - bad.start);
++
++ len = insert_at(bb, prev + 1, &bad);
++ bb->count++;
++ added++;
++ hint = prev + 1;
++
++update_sectors:
++ s += len;
++ sectors -= len;
++
++ if (sectors > 0)
++ goto re_insert;
++
++ WARN_ON(sectors < 0);
++
++ /* Check whether the following already set range can be merged */
++ if ((prev + 1) < bb->count &&
++ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
++ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
++ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
++ BB_ACK(p[prev]));
++
++ if ((prev + 2) < bb->count)
++ memmove(p + prev + 1, p + prev + 2,
++ (bb->count - (prev + 2)) * 8);
++ bb->count--;
++ }
++
++ if (space_desired && !badblocks_full(bb)) {
++ s = orig_start;
++ sectors = orig_len;
++ space_desired = 0;
++ if (retried++ < 3)
++ goto re_insert;
++ }
++
++out:
++ if (added) {
++ set_changed(bb);
++
++ if (!acknowledged)
++ bb->unacked_exist = 1;
++ else
++ badblocks_update_acked(bb);
++ }
++
++ write_sequnlock_irqrestore(&bb->lock, flags);
++
++ if (!added)
++ rv = 1;
++
++ return rv;
++}
++
++/*
++ * Clear the bad block range from bad block table which is front overlapped
++ * with the clearing range. The return value is how many sectors from an
++ * already set bad block range are cleared. If the whole bad block range is
++ * covered by the clearing range and fully cleared, '*deleted' is set to 1 for
++ * the caller to reduce bb->count.
++ */
++static int front_clear(struct badblocks *bb, int prev,
++ struct badblocks_context *bad, int *deleted)
++{
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++ u64 *p = bb->page;
++ int cleared = 0;
++
++ *deleted = 0;
++ if (s == BB_OFFSET(p[prev])) {
++ if (BB_LEN(p[prev]) > sectors) {
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors,
++ BB_LEN(p[prev]) - sectors,
++ BB_ACK(p[prev]));
++ cleared = sectors;
++ } else {
++ /* BB_LEN(p[prev]) <= sectors */
++ cleared = BB_LEN(p[prev]);
++ if ((prev + 1) < bb->count)
++ memmove(p + prev, p + prev + 1,
++ (bb->count - prev - 1) * 8);
++ *deleted = 1;
++ }
++ } else if (s > BB_OFFSET(p[prev])) {
++ if (BB_END(p[prev]) <= (s + sectors)) {
++ cleared = BB_END(p[prev]) - s;
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ s - BB_OFFSET(p[prev]),
++ BB_ACK(p[prev]));
++ } else {
++ /* Splitting is handled in front_splitting_clear() */
++ BUG();
++ }
++ }
++
++ return cleared;
++}
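++
++/*
++ * Worked examples for front_clear() (hypothetical numbers): with
++ * p[prev] covering [100, 200), clearing [100, 150) shrinks it to
++ * [150, 200); clearing [100, 300) removes the whole entry and sets
++ * *deleted; clearing [150, 300) truncates it to [100, 150). The case
++ * where the clearing range sits strictly inside p[prev] is left to
++ * front_splitting_clear().
++ */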
++
++/*
++ * Handle the condition that the clearing range hits middle of an already set
++ * bad block range from bad block table. In this condition the existing bad
++ * block range is split into two after the middle part is cleared.
++ */
++static int front_splitting_clear(struct badblocks *bb, int prev,
++ struct badblocks_context *bad)
++{
++ u64 *p = bb->page;
++ u64 end = BB_END(p[prev]);
++ int ack = BB_ACK(p[prev]);
++ sector_t sectors = bad->len;
++ sector_t s = bad->start;
++
++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
++ s - BB_OFFSET(p[prev]),
++ ack);
++ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8);
++ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack);
++ return sectors;
++}
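++
++/*
++ * Illustration of front_splitting_clear() (hypothetical numbers): with
++ * p[prev] covering [100, 200), clearing [130, 150) rewrites p[prev] as
++ * [100, 130) and inserts a new [150, 200) entry right behind it; the
++ * caller bumps bb->count for the extra slot.
++ */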
++
++/* Do the exact work to clear bad block range from the bad block table */
++static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
++{
++ struct badblocks_context bad;
++ int prev = -1, hint = -1;
++ int len = 0, cleared = 0;
++ int rv = 0;
++ u64 *p;
++
++ if (bb->shift < 0)
++ /* badblocks are disabled */
++ return 1;
++
++ if (sectors == 0)
++ /* Invalid sectors number */
++ return 1;
++
++ if (bb->shift) {
++ sector_t target;
++
++ /* When clearing we round the start up and the end down.
++ * This should not matter as the shift should align with
++ * the block size and no rounding should ever be needed.
++ * However it is better to think a block is bad when it
++ * isn't than to think a block is not bad when it is.
++ */
++ target = s + sectors;
++ roundup(s, bb->shift);
++ rounddown(target, bb->shift);
++ sectors = target - s;
++ }
++
++ write_seqlock_irq(&bb->lock);
++
++ bad.ack = true;
++ p = bb->page;
++
++re_clear:
++ bad.start = s;
++ bad.len = sectors;
++
++ if (badblocks_empty(bb)) {
++ len = sectors;
++ cleared++;
++ goto update_sectors;
++ }
++
++ prev = prev_badblocks(bb, &bad, hint);
++
++ /* Start before all badblocks */
++ if (prev < 0) {
++ if (overlap_behind(bb, &bad, 0)) {
++ len = BB_OFFSET(p[0]) - s;
++ hint = prev;
++ } else {
++ len = sectors;
++ }
++ /*
++ * Both situations clear a non-bad range and
++ * should be treated as successful.
++ */
++ cleared++;
++ goto update_sectors;
++ }
++
++ /* Start after all badblocks */
++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
++ len = sectors;
++ cleared++;
++ goto update_sectors;
++ }
++
++ /* Clear will split a bad record but the table is full */
++ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) &&
++ (BB_END(p[prev]) > (bad.start + sectors))) {
++ len = sectors;
++ printf("Warn: no space to split for clear\n");
++ goto update_sectors;
++ }
++
++ if (overlap_front(bb, prev, &bad)) {
++ if ((BB_OFFSET(p[prev]) < bad.start) &&
++ (BB_END(p[prev]) > (bad.start + bad.len))) {
++ /* Splitting */
++ if ((bb->count + 1) < MAX_BADBLOCKS) {
++ len = front_splitting_clear(bb, prev, &bad);
++ bb->count += 1;
++ cleared++;
++ } else {
++ /* No space to split, give up */
++ printf("Warn: no space to split for clear\n");
++ len = sectors;
++ }
++ } else {
++ int deleted = 0;
++
++ len = front_clear(bb, prev, &bad, &deleted);
++ bb->count -= deleted;
++ cleared++;
++ hint = prev;
++ }
++
++ goto update_sectors;
++ }
++
++ /* Not front overlap, but behind overlap */
++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
++ len = BB_OFFSET(p[prev + 1]) - bad.start;
++ hint = prev + 1;
++ /* Clearing a non-bad range should be treated as successful */
++ cleared++;
++ goto update_sectors;
++ }
++
++ /* Not cover any badblocks range in the table */
++ len = sectors;
++ /* Clearing a non-bad range should be treated as successful */
++ cleared++;
++
++update_sectors:
++ s += len;
++ sectors -= len;
++
++ if (sectors > 0)
++ goto re_clear;
++
++ WARN_ON(sectors < 0);
++
++ if (cleared) {
++ badblocks_update_acked(bb);
++ set_changed(bb);
++ }
++
++ write_sequnlock_irq(&bb->lock);
++
++ if (!cleared)
++ rv = 1;
++
++ return rv;
++}
++
++/* Do the exact work to check bad blocks range from the bad block table */
++static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
++ sector_t *first_bad, int *bad_sectors)
++{
++ int unacked_badblocks, acked_badblocks;
++ int prev = -1, hint = -1, set = 0;
++ struct badblocks_context bad;
++ unsigned int seq;
++ int len, rv;
++ u64 *p;
++
++ WARN_ON(bb->shift < 0 || sectors == 0);
++
++ if (bb->shift > 0) {
++ sector_t target;
++
++ /* round the start down, and the end up */
++ target = s + sectors;
++ rounddown(s, bb->shift);
++ roundup(target, bb->shift);
++ sectors = target - s;
++ }
++
++retry:
++ seq = read_seqbegin(&bb->lock);
++
++ p = bb->page;
++ unacked_badblocks = 0;
++ acked_badblocks = 0;
++
++re_check:
++ bad.start = s;
++ bad.len = sectors;
++
++ if (badblocks_empty(bb)) {
++ len = sectors;
++ goto update_sectors;
++ }
++
++ prev = prev_badblocks(bb, &bad, hint);
++
++ /* start after all badblocks */
++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
++ len = sectors;
++ goto update_sectors;
++ }
++
++ if (overlap_front(bb, prev, &bad)) {
++ if (BB_ACK(p[prev]))
++ acked_badblocks++;
++ else
++ unacked_badblocks++;
++
++ if (BB_END(p[prev]) >= (s + sectors))
++ len = sectors;
++ else
++ len = BB_END(p[prev]) - s;
++
++ if (set == 0) {
++ *first_bad = BB_OFFSET(p[prev]);
++ *bad_sectors = BB_LEN(p[prev]);
++ set = 1;
++ }
++ goto update_sectors;
++ }
++
++ /* Not front overlap, but behind overlap */
++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
++ len = BB_OFFSET(p[prev + 1]) - bad.start;
++ hint = prev + 1;
++ goto update_sectors;
++ }
++
++ /* not cover any badblocks range in the table */
++ len = sectors;
++
++update_sectors:
++ s += len;
++ sectors -= len;
++
++ if (sectors > 0)
++ goto re_check;
++
++ WARN_ON(sectors < 0);
++
++ if (unacked_badblocks > 0)
++ rv = -1;
++ else if (acked_badblocks > 0)
++ rv = 1;
++ else
++ rv = 0;
++
++ if (read_seqretry(&bb->lock, seq))
++ goto retry;
++
++ return rv;
++}
++
++/**
++ * badblocks_check() - check a given range for bad sectors
++ * @bb: the badblocks structure that holds all badblock information
++ * @s: sector (start) at which to check for badblocks
++ * @sectors: number of sectors to check for badblocks
++ * @first_bad: pointer to store location of the first badblock
++ * @bad_sectors: pointer to store number of badblocks after @first_bad
++ *
++ * We can record which blocks on each device are 'bad' and so just
++ * fail those blocks, or that stripe, rather than the whole device.
++ * Entries in the bad-block table are 64 bits wide. This comprises:
++ * Length of bad-range, in sectors: 0-511 for lengths 1-512
++ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
++ * A 'shift' can be set so that larger blocks are tracked and
++ * consequently larger devices can be covered.
++ * 'Acknowledged' flag - 1 bit. - the most significant bit.
++ *
++ * Locking of the bad-block table uses a seqlock so badblocks_check
++ * might need to retry if it is very unlucky.
++ * We will sometimes want to check for bad blocks in a bi_end_io function,
++ * so we use the write_seqlock_irq variant.
++ *
++ * When looking for a bad block we specify a range and want to
++ * know if any block in the range is bad. So we binary-search
++ * to the last range that starts at-or-before the given endpoint,
++ * (or "before the sector after the target range")
++ * then see if it ends after the given start.
++ *
++ * Return:
++ * 0: there are no known bad blocks in the range
++ * 1: there are known bad blocks which are all acknowledged
++ * -1: there are bad blocks which have not yet been acknowledged in metadata.
++ * plus the start/length of the first bad section we overlap.
++ */
++int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
++ sector_t *first_bad, int *bad_sectors)
++{
++ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
++}
++EXPORT_SYMBOL_GPL(badblocks_check);
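++
++/*
++ * A minimal illustration of the 64-bit entry encoding described above
++ * (hypothetical values): BB_MAKE(4096, 8, 1) packs start sector 4096,
++ * an 8-sector length and the acknowledged bit into a single u64, and
++ * BB_OFFSET(), BB_LEN() and BB_ACK() recover 4096, 8 and 1 from it.
++ */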
++
++/**
++ * badblocks_set() - Add a range of bad blocks to the table.
++ * @bb: the badblocks structure that holds all badblock information
++ * @s: first sector to mark as bad
++ * @sectors: number of sectors to mark as bad
++ * @acknowledged: whether to mark the bad sectors as acknowledged
++ *
++ * This might extend the table, or might contract it if two adjacent ranges
++ * can be merged. We binary-search to find the 'insertion' point, then
++ * decide how best to handle it.
++ *
++ * Return:
++ * 0: success
++ * 1: failed to set badblocks (out of space)
++ */
++int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
++ int acknowledged)
++{
++ return _badblocks_set(bb, s, sectors, acknowledged);
++}
++EXPORT_SYMBOL_GPL(badblocks_set);
++
++/**
++ * badblocks_clear() - Remove a range of bad blocks from the table.
++ * @bb: the badblocks structure that holds all badblock information
++ * @s: first sector to clear
++ * @sectors: number of sectors to clear
++ *
++ * This may involve extending the table if we split a region,
++ * but it must not fail. So if the table becomes full, we just
++ * drop the remove request.
++ *
++ * Return:
++ * 0: success
++ * 1: failed to clear badblocks
++ */
++int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
++{
++ return _badblocks_clear(bb, s, sectors);
++}
++EXPORT_SYMBOL_GPL(badblocks_clear);
++
++/**
++ * ack_all_badblocks() - Acknowledge all bad blocks in a list.
++ * @bb: the badblocks structure that holds all badblock information
++ *
++ * This only succeeds if ->changed is clear. It is used by
++ * in-kernel metadata updates
++ */
++void ack_all_badblocks(struct badblocks *bb)
++{
++ if (bb->page == NULL || bb->changed)
++ /* no point even trying */
++ return;
++ write_seqlock_irq(&bb->lock);
++
++ if (bb->changed == 0 && bb->unacked_exist) {
++ u64 *p = bb->page;
++ int i;
++
++ for (i = 0; i < bb->count ; i++) {
++ if (!BB_ACK(p[i])) {
++ sector_t start = BB_OFFSET(p[i]);
++ int len = BB_LEN(p[i]);
++
++ p[i] = BB_MAKE(start, len, 1);
++ }
++ }
++ bb->unacked_exist = 0;
++ }
++ write_sequnlock_irq(&bb->lock);
++}
++EXPORT_SYMBOL_GPL(ack_all_badblocks);
++
++/**
++ * badblocks_show() - sysfs access to bad-blocks list
++ * @bb: the badblocks structure that holds all badblock information
++ * @unack: whether to show only unacknowledged badblocks
++ *
++ * Return:
++ * Length of returned data
++ */
++ssize_t badblocks_show(struct badblocks *bb, int unack)
++{
++ size_t len;
++ int i;
++ u64 *p = bb->page;
++ char *_page;
++ int size = 64*4096;
++ unsigned seq;
++
++ if (bb->shift < 0)
++ return 0;
++
++ _page = malloc(size);
++ if (!_page) {
++ printf("alloc _page failed\n");
++ return 0;
++ }
++ memset(_page, 0, size);
++retry:
++ seq = read_seqbegin(&bb->lock);
++
++ len = 0;
++ i = 0;
++
++ while (len < size && i < bb->count) {
++ sector_t s = BB_OFFSET(p[i]);
++ unsigned int length = BB_LEN(p[i]);
++ int ack = BB_ACK(p[i]);
++
++ i++;
++
++ if (unack && ack)
++ continue;
++
++ len += snprintf(_page+len, size - len, "%llu %u\n",
++ (unsigned long long)s << bb->shift,
++ length << bb->shift);
++ }
++ if (unack && len == 0)
++ bb->unacked_exist = 0;
++
++ printf("%s\n", _page);
++ free(_page);
++
++ if (read_seqretry(&bb->lock, seq))
++ goto retry;
++
++ return len;
++}
++EXPORT_SYMBOL_GPL(badblocks_show);
++
++/**
++ * badblocks_store() - sysfs access to bad-blocks list
++ * @bb: the badblocks structure that holds all badblock information
++ * @page: buffer received from sysfs
++ * @len: length of data received from sysfs
++ * @unack: whether to store the new range as unacknowledged
++ *
++ * Return:
++ * Length of the buffer processed or -ve error.
++ */
++ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
++ int unack)
++{
++ unsigned long long sector;
++ int length;
++ char newline;
++
++ switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
++ case 3:
++ if (newline != '\n')
++ return -EINVAL;
++ fallthrough;
++ case 2:
++ if (length <= 0)
++ return -EINVAL;
++ break;
++ default:
++ return -EINVAL;
++ }
++
++ if (badblocks_set(bb, sector, length, !unack))
++ return -ENOSPC;
++ else
++ return len;
++}
++EXPORT_SYMBOL_GPL(badblocks_store);
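++
++/*
++ * Example input accepted by the parser above (illustrative): the string
++ * "4096 8\n" records an 8-sector bad range starting at sector 4096;
++ * with unack == 0 the new range is stored as acknowledged.
++ */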
++
++static int __badblocks_init(struct device *dev, struct badblocks *bb,
++ int enable)
++{
++ bb->dev = dev;
++ bb->count = 0;
++ if (enable)
++ bb->shift = 0;
++ else
++ bb->shift = -1;
++ if (dev)
++ bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
++ else
++ bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
++ if (!bb->page) {
++ bb->shift = -1;
++ return -ENOMEM;
++ }
++ seqlock_init(&bb->lock);
++
++ return 0;
++}
++
++/**
++ * badblocks_init() - initialize the badblocks structure
++ * @bb: the badblocks structure that holds all badblock information
++ * @enable: whether to enable badblocks accounting
++ *
++ * Return:
++ * 0: success
++ * -ve errno: on error
++ */
++int badblocks_init(struct badblocks *bb, int enable)
++{
++ return __badblocks_init(NULL, bb, enable);
++}
++EXPORT_SYMBOL_GPL(badblocks_init);
++
++int devm_init_badblocks(struct device *dev, struct badblocks *bb)
++{
++ if (!bb)
++ return -EINVAL;
++ return __badblocks_init(dev, bb, 1);
++}
++EXPORT_SYMBOL_GPL(devm_init_badblocks);
++
++/**
++ * badblocks_exit() - free the badblocks structure
++ * @bb: the badblocks structure that holds all badblock information
++ */
++void badblocks_exit(struct badblocks *bb)
++{
++ if (!bb)
++ return;
++ if (bb->dev)
++ devm_kfree(bb->dev, bb->page);
++ else
++ kfree(bb->page);
++ bb->page = NULL;
++}
++EXPORT_SYMBOL_GPL(badblocks_exit);
++
++/*
++ * Test case related
++ */
++char good_sector[512];
++char bad_unack_sector[512];
++char bad_acked_sector[512];
++
++#define BB_SET 0
++#define BB_CLN 1
++
++unsigned rand_seed = 2;
++
++char bb_ops[] = {0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1};
++char bb_ack[] = {1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0};
++
++/* disk file length is 256MB */
++#define DISKFILE_SECTORS ((256 << 20) >> 9)
++#define MAX_SET_SIZE (DISKFILE_SECTORS/256)
++#define MAX_CLN_SIZE (DISKFILE_SECTORS/1024)
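++/*
++ * For reference: DISKFILE_SECTORS evaluates to 524288 sectors
++ * (256 MiB / 512), so MAX_SET_SIZE is 2048 sectors (1 MiB) and
++ * MAX_CLN_SIZE is 512 sectors (256 KiB).
++ */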
++
++#define BUF_LEN (8<<10)
++
++void write_badblocks_log(struct badblocks *bb, char *dir, unsigned long seq,
++ sector_t bb_start, sector_t bb_len,
++ int ops, int ack)
++{
++ char path[512];
++ char buf[8192];
++ u64 *p = bb->page;
++ int len, size, i;
++ int fd;
++
++ size = sizeof(buf);
++ memset(buf, 0, sizeof(buf));
++ len = 0;
++
++ len += snprintf(buf + len, size - len, "============ %lu ============\n\n", seq);
++ if (ops == BB_SET)
++ len += snprintf(buf + len, size - len, "set: start %llu, len %llu, ack %d\n",
++ bb_start, bb_len, ack);
++ else
++ len += snprintf(buf + len, size - len, "clear: start %llu, len %llu\n",
++ bb_start, bb_len);
++
++ len += snprintf(buf + len, size - len, "=============================\n\n");
++
++ i = 0;
++ while (len < size && i < bb->count) {
++ sector_t s = BB_OFFSET(p[i]);
++ unsigned int length = BB_LEN(p[i]);
++ int ack = BB_ACK(p[i]);
++
++ i++;
++
++ len += snprintf(buf + len, size - len, "%llu %u [%u]\n",
++ (unsigned long long)s << bb->shift,
++ length << bb->shift,
++ ack);
++ }
++
++ snprintf(path, 512, "%s/seq-%.8lu", dir ? dir : ".", seq);
++ unlink(path);
++ fd = open(path, O_CREAT|O_RDWR, 0644);
++ if (fd < 0) {
++ printf("fail to create file %s\n", path);
++ return;
++ }
++ write(fd, buf, len);
++ fsync(fd);
++ close(fd);
++}
++
++
++int verify_bad_sectors(sector_t start, sector_t len, int expected, int fd)
++{
++ int ret = 0;
++ char buf[BUF_LEN];
++ unsigned long offset = start << 9;
++ unsigned long unread = len << 9;
++
++ if ((start + len) > DISKFILE_SECTORS)
++ printf("Error: invalid verify range: s %llu, l %llu\n, limit %u\n",
++ start, len, DISKFILE_SECTORS);
++
++ while (unread > 0) {
++ unsigned long read_bytes = min(unread, BUF_LEN);
++ unsigned long i;
++ ssize_t _ret;
++
++ memset(buf, 0, sizeof(buf));
++ _ret = pread(fd, buf, read_bytes, offset);
++ if (_ret != read_bytes) {
++ printf("Error: to read %lu bytes, return %lu bytes\n",
++ read_bytes, _ret);
++ }
++
++ for (i = 0; i < read_bytes; i++) {
++ if (buf[i] != expected) {
++ printf("Unexpected sector value %u (should be %u) at sector %lu"
++ " offset byte %lu\n",
++ buf[i], expected, (offset+i) >> 9,
++ (offset + i) % 512);
++ exit(1);
++ if (ret == 0)
++ ret = -EIO;
++ }
++ }
++
++ if (ret)
++ goto out;
++
++ unread -= read_bytes;
++ offset += read_bytes;
++ }
++
++out:
++ return ret;
++}
++
++int verify_badblocks_file(struct badblocks *bb, int fd, unsigned long seq)
++{
++ int ret = 0;
++ sector_t size = DISKFILE_SECTORS;
++ u64 *p = bb->page;
++ int i = 0;
++ unsigned long prev_pos, pos;
++
++ prev_pos = pos = 0;
++ while ((size > 0) && (i < bb->count)) {
++ sector_t s = BB_OFFSET(p[i]);
++ unsigned int length = BB_LEN(p[i]);
++ int ack = BB_ACK(p[i]);
++
++ pos = s;
++
++ /* verify non-bad area */
++ if (pos > prev_pos) {
++ ret = verify_bad_sectors(prev_pos, pos - prev_pos, 0, fd);
++ if (ret < 0) {
++ printf("%s:%d fail to verify good sectors [%lu, %lu), error: %s\n",
++ __func__, __LINE__, prev_pos, pos, strerror(-ret));
++ goto out;
++ }
++
++ size -= (pos - prev_pos);
++ }
++
++ /* verify bad area */
++ ret = verify_bad_sectors(pos, length, ack ? 2 : 1, fd);
++ if (ret < 0) {
++ printf("%s:%d fail to verify bad sectors [%lu, %u) ack %d, error: %s\n",
++ __func__, __LINE__, pos, length, ack, strerror(-ret));
++ goto out;
++ }
++
++ size -= length;
++ i++;
++ prev_pos = pos + length;
++ }
++
++ if (i < bb->count) {
++ printf("Error: total %d bad records, verified %d, left %d\n",
++ bb->count, i, bb->count - i);
++ if (size)
++ printf("Error: still have %llu sectors not verified\n",
++ size);
++ ret = -EIO;
++ goto out;
++ }
++
++ /* verify the remaining non-bad area */
++ if (size) {
++ pos = DISKFILE_SECTORS;
++ ret = verify_bad_sectors(prev_pos, pos - prev_pos, 0, fd);
++ if (ret < 0) {
++ printf("%s:%d fail to verify good sectors [%lu, %lu), error: %s\n",
++ __func__, __LINE__, prev_pos, pos, strerror(-ret));
++ goto out;
++ }
++ }
++
++ printf("verify badblocks file successfully (seq %lu)\n", seq);
++out:
++ return ret;
++}
++
++int _write_diskfile(int fd, int ops,
++ sector_t start, sector_t len, int ack)
++{
++ off_t pos = start << 9;
++ char sector[512];
++
++ if ((start + len) > DISKFILE_SECTORS)
++ len = DISKFILE_SECTORS - start;
++
++ if (len == 0) {
++ printf("Error: write diskfile zero-length at %llu len %llu\n",
++ start, len);
++ return -EINVAL;
++ }
++
++ if (ops == BB_CLN) {
++ while (len > 0) {
++ pwrite(fd, good_sector, 512, pos);
++ pos += 512;
++ len--;
++ }
++ fsync(fd);
++ return 0;
++ }
++
++ /* badblocks set */
++ while (len > 0) {
++ pread(fd, sector, 512, pos);
++ if (!memcmp(sector, good_sector, 512)) {
++ if (ack)
++ pwrite(fd, bad_acked_sector, 512, pos);
++ else
++ pwrite(fd, bad_unack_sector, 512, pos);
++
++// printf("write %d at sector %lu\n", ack ? 2 : 1, pos >> 9);
++ } else if (!memcmp(sector, bad_unack_sector, 512)) {
++ if (ack) {
++ pwrite(fd, bad_acked_sector, 512, pos);
++// printf("overwrite 2 at unack sector %lu\n", pos >> 9);
++ } else {
++// printf("avoid overwrite already unacked sector %lu\n", pos >> 9);
++ }
++ } else if (!memcmp(sector, bad_acked_sector, 512)) {
++// if (ack)
++// printf("avoid overwrite already acked sector %lu\n", pos >> 9);
++// else
++// printf("cannot overwrite acked sector %lu\n", pos >> 9);
++ } else {
++ printf("Error: unexpected sector at %lu\n", pos >> 9);
++ }
++
++ pos += 512;
++ len--;
++ }
++
++ fsync(fd);
++ return 0;
++}
++
++sector_t fix_writing_length(struct badblocks *bb, int ops, sector_t bb_start,
++ sector_t bb_len, int ack)
++{
++ sector_t orig_len = bb_len;
++ sector_t ret_len = 0;
++ int prev;
++ struct badblocks_context bad;
++ u64 *p = bb->page;
++
++ bad.orig_start = bb_start;
++ bad.orig_len = bb_len;
++ bad.start = bb_start;
++ bad.len = bb_len;
++ bad.ack = ack;
++
++ if (ops == BB_SET) {
++ prev = prev_badblocks(bb, &bad, -1);
++ if (prev < 0) {
++ printf("Unexpected: the set range is not in badblocks table\n");
++ exit(1);
++ }
++
++ if (BB_OFFSET(p[prev]) > bb_start ||
++ BB_END(p[prev]) <= bb_start ||
++ BB_ACK(p[prev]) != ack) {
++ printf("Unexpected: fixing range is not in badblocks table\n");
++ exit(1);
++ }
++
++ while (bb_len > 0) {
++ int seg;
++
++ if (BB_END(p[prev]) >= (bb_start + bb_len))
++ seg = bb_len;
++ else
++ seg = BB_END(p[prev]) - bb_start;
++
++ ret_len += seg;
++ bb_start += seg;
++ bb_len -= seg;
++
++ if (bb_len == 0)
++ break;
++
++ if ((prev + 1) >= bb->count ||
++ BB_END(p[prev]) != BB_OFFSET(p[prev + 1]) ||
++ BB_ACK(p[prev]) != BB_ACK(p[prev + 1]))
++ break;
++ prev++;
++ }
++ } else if (ops == BB_CLN) {
++ ret_len = bb_len;
++ }
++
++ printf("Fix writing bb_len from %llu to %llu\n", orig_len, ret_len);
++ return ret_len;
++}
++
++int write_badblocks_file(struct badblocks *bb, unsigned long seq, int fd)
++{
++ int ret;
++ sector_t bb_start, bb_len;
++ int ops, random;
++
++retry:
++ random = rand_r(&rand_seed);
++ ops = bb_ops[random % sizeof(bb_ops)];
++ random = rand_r(&rand_seed);
++ if (ops == BB_SET)
++ bb_len = random % MAX_SET_SIZE;
++ else
++ bb_len = random % MAX_CLN_SIZE;
++ random = rand_r(&rand_seed);
++ bb_start = random % DISKFILE_SECTORS;
++ if ((bb_start + bb_len) > DISKFILE_SECTORS)
++ bb_len = DISKFILE_SECTORS - bb_start;
++ if (bb_len == 0) {
++ printf("random bb_len is 0, re-generate\n");
++ goto retry;
++ }
++
++ if (ops == BB_SET) {
++ int ack;
++
++ random = rand_r(&rand_seed);
++ ack = bb_ack[random % sizeof(bb_ack)];
++
++ bb->changed = 0;
++ ret = badblocks_set(bb, bb_start, bb_len, ack);
++ write_badblocks_log(bb, NULL, seq, bb_start, bb_len, BB_SET, ack);
++ if (ret > 0) {
++ printf("NOTICE: no space or cannot overwwrite badblocks"
++ " for badblocks_set(s: %llu, l: %llu, a: %d).\n"
++ " Manual check might be necessary if\n"
++ " following verification failed.\n",
++ bb_start, bb_len, ack);
++ return 1;
++ }
++
++ if (badblocks_full(bb) && bb->changed)
++ bb_len = fix_writing_length(bb, ops, bb_start, bb_len, ack);
++ ret = _write_diskfile(fd, ops, bb_start, bb_len, ack);
++ } else {
++ bb->changed = 0;
++ ret = badblocks_clear(bb, bb_start, bb_len);
++ write_badblocks_log(bb, NULL, seq, bb_start, bb_len, BB_CLN, -1);
++ if (ret > 0) {
++ printf("NOTICE: no space for badblocks_clear(s: %llu, l: %llu)\n"
++ " Manual check might be necessary if\n"
++ " following verification failed.\n",
++ bb_start, bb_len);
++ return 1;
++ }
++
++ ret = _write_diskfile(fd, ops, bb_start, bb_len, -1);
++ }
++
++ return ret;
++}
++
++#define MAX_BB_TEST_TRIES (1<<20)
++int do_test(struct badblocks *bb)
++{
++ int ret = 0;
++ unsigned long seq;
++ char diskfile_name[] = "./dummy_disk_file";
++ int diskfile_fd = -1;
++
++ srand(rand_seed);
++
++ unlink(diskfile_name);
++ diskfile_fd = open(diskfile_name, O_CREAT|O_RDWR, 0644);
++ if (diskfile_fd < 0) {
++ printf("fail to create %s, error %s\n",
++ diskfile_name, strerror(errno));
++ goto out;
++ }
++ ret = fallocate(diskfile_fd, FALLOC_FL_ZERO_RANGE, 0, DISKFILE_SECTORS << 9);
++ if (ret < 0) {
++ printf("fail to allocate zero-filled file, error %s\n",
++ strerror(errno));
++ goto out;
++ }
++
++ for (seq = 1; seq <= MAX_BB_TEST_TRIES; seq++) {
++ ret = write_badblocks_file(bb, seq, diskfile_fd);
++ if (ret < 0) {
++ printf("fail to generate bad blocks for seq %lu, error %s\n",
++ seq, strerror(-ret));
++ goto out;
++ }
++ ret = verify_badblocks_file(bb, diskfile_fd, seq);
++ if (ret < 0) {
++ printf("fail to verify bad blocks for seq %lu, error %s\n",
++ seq, strerror(-ret));
++ }
++ }
++
++out:
++ if (diskfile_fd >= 0)
++ close(diskfile_fd);
++ return ret;
++}
++
++int main(int argc, char *argv[])
++{
++ struct badblocks bblocks;
++ struct badblocks *bb = &bblocks;
++ int i;
++
++ for (i = 0; i < 512; i++) {
++ good_sector[i] = 0;
++ bad_unack_sector[i] = 1;
++ bad_acked_sector[i] = 2;
++ }
++
++ memset(bb, 0, sizeof(struct badblocks));
++ badblocks_init(bb, 1);
++
++ do_test(bb);
++
++ badblocks_exit(bb);
++ return 0;
++}
+--
+2.31.1
+
diff --git a/for-test/jouranl-deadlock/0001-reserve-journal-space.patch b/for-test/jouranl-deadlock/0001-reserve-journal-space.patch
new file mode 100644
index 0000000..81af639
--- /dev/null
+++ b/for-test/jouranl-deadlock/0001-reserve-journal-space.patch
@@ -0,0 +1,369 @@
+From 120572550c913abcc1054912c8deb29c690ffe93 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 18 Apr 2022 21:55:37 +0800
+Subject: [PATCH 1/2] reserve journal space
+
+---
+ drivers/md/bcache/journal.c | 220 +++++++++++++++++++++++++++++++++---
+ drivers/md/bcache/journal.h | 10 ++
+ 2 files changed, 214 insertions(+), 16 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index 7c2ca52ca3e4..5aac20c71b80 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -166,6 +166,85 @@ reread: left = ca->sb.bucket_size - offset;
+ return ret;
+ }
+
++static int bch_journal_key_reload(struct cache_set *c)
++{
++ struct cache *ca = c->cache;
++ struct bkey *k = &c->journal.key;
++ struct journal_device *ja = &ca->journal;
++ struct bio *bio = &ja->bio;
++ struct jset *j, *data = c->journal.w[0].data;
++ unsigned int n = 0, offset = 0, used_blocks = 0;
++ unsigned int len, left;
++ sector_t bucket;
++ struct closure cl;
++ int ret = 0;
++
++ /* load from the latest journal bucket */
++ bucket = bucket_to_sector(c, ca->sb.d[ja->cur_idx]);
++
++ closure_init_stack(&cl);
++
++ while (offset < ca->sb.bucket_size) {
++reread:
++ left = ca->sb.bucket_size - offset;
++ len = min_t(unsigned int,
++ left, PAGE_SECTORS << JSET_BITS);
++
++ bio_reset(bio, ca->bdev, REQ_OP_READ);
++ bio->bi_iter.bi_sector = bucket + offset;
++ bio->bi_iter.bi_size = len << 9;
++
++ bio->bi_end_io = journal_read_endio;
++ bio->bi_private = &cl;
++ bch_bio_map(bio, data);
++
++ closure_bio_submit(c, bio, &cl);
++ closure_sync(&cl);
++
++ j = data;
++ while (len) {
++ size_t blocks, bytes = set_bytes(j);
++
++ if (j->magic != jset_magic(&ca->sb))
++ goto out;
++
++ if (bytes > left << 9 ||
++ bytes > PAGE_SIZE << JSET_BITS) {
++ pr_err("jset may be correpted: too big");
++ ret = -EIO;
++ goto err;
++ }
++
++ if (bytes > len << 9)
++ goto reread;
++
++ if (j->csum != csum_set(j)) {
++ pr_err("jset may be corrupted: bad csum");
++ ret = -EIO;
++ goto err;
++ }
++
++ blocks = set_blocks(j, block_bytes(ca));
++ used_blocks += blocks;
++
++ offset += blocks * ca->sb.block_size;
++ len -= blocks * ca->sb.block_size;
++ j = ((void *) j) + blocks * block_bytes(ca);
++ }
++ }
++out:
++ c->journal.blocks_free =
++ (ca->sb.bucket_size >> c->block_bits) - used_blocks;
++
++ k->ptr[n++] = MAKE_PTR(0, bucket, ca->sb.nr_this_dev);
++
++ bkey_init(k);
++ SET_KEY_PTRS(k, n);
++
++err:
++ return ret;
++}
++
+ int bch_journal_read(struct cache_set *c, struct list_head *list)
+ {
+ #define read_bucket(b) \
+@@ -279,13 +358,23 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
+
+ }
+
++ if (c->journal.blocks_free != 0)
++ pr_warn("Unexpected blocks_free %u before reload journal key.\n",
++ c->journal.blocks_free);
++
++ ret = bch_journal_key_reload(c);
++
+ out:
+ if (!list_empty(list))
+ c->journal.seq = list_entry(list->prev,
+ struct journal_replay,
+ list)->j.seq;
+
+- return 0;
++ /* Initial value of c->journal.blocks_free should be 0 */
++ BUG_ON(c->journal.blocks_free != 0);
++ ret = bch_journal_key_reload(c);
++
++ return ret;
+ #undef read_bucket
+ }
+
+@@ -355,6 +444,9 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
+ uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
+ struct keylist keylist;
+
++ /* Mark journal replay started */
++ s->journal.in_replay = true;
++
+ list_for_each_entry(i, list, list) {
+ BUG_ON(i->pin && atomic_read(i->pin) != 1);
+
+@@ -396,6 +488,9 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
+ pr_info("journal replay done, %i keys in %i entries, seq %llu\n",
+ keys, entries, end);
+ err:
++ /* Mark journal replay finished */
++ s->journal.in_replay = false;
++
+ while (!list_empty(list)) {
+ i = list_first_entry(list, struct journal_replay, list);
+ list_del(&i->list);
+@@ -621,6 +716,18 @@ static void do_journal_discard(struct cache *ca)
+ }
+ }
+
++static inline bool last_writable_journal_bucket(struct cache_set *c)
++{
++ struct cache *ca = c->cache;
++ struct journal_device *ja = &ca->journal;
++
++ if (((ja->cur_idx + 1) % ca->sb.njournal_buckets) !=
++ ja->last_idx)
++ return false;
++
++ return true;
++}
++
+ static void journal_reclaim(struct cache_set *c)
+ {
+ struct bkey *k = &c->journal.key;
+@@ -629,6 +736,8 @@ static void journal_reclaim(struct cache_set *c)
+ unsigned int next;
+ struct journal_device *ja = &ca->journal;
+ atomic_t p __maybe_unused;
++ bool is_last_valid;
++ bool journal_wakeup = true;
+
+ atomic_long_inc(&c->reclaim);
+
+@@ -646,13 +755,33 @@ static void journal_reclaim(struct cache_set *c)
+
+ do_journal_discard(ca);
+
+- if (c->journal.blocks_free)
++ is_last_valid = last_writable_journal_bucket(c);
++
++ /*
++ * This is not the last valid journal bucket, no need to worry
++ * about the reserved journal space.
++ */
++ if (!is_last_valid && c->journal.blocks_free)
++ goto out;
++
++ /*
++ * This is the last valid journal bucket. If the free space is
++ * larger than the reserved sectors, no need to reclaim more
++ * journal space. Otherwise try to reclaim one more journal
++ * bucket, to make sure there are always c->journal.reserved
++ * sectors reserved for initialization-time usage.
++ */
++ if (is_last_valid &&
++ (c->journal.blocks_free * c->cache->sb.block_size) >
++ c->journal.reserved)
+ goto out;
+
+ next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
+ /* No space available on this device */
+- if (next == ja->discard_idx)
++ if (next == ja->discard_idx) {
++ journal_wakeup = false;
+ goto out;
++ }
+
+ ja->cur_idx = next;
+ k->ptr[0] = MAKE_PTR(0,
+@@ -665,7 +794,7 @@ static void journal_reclaim(struct cache_set *c)
+ c->journal.blocks_free = ca->sb.bucket_size >> c->block_bits;
+
+ out:
+- if (!journal_full(&c->journal))
++ if (journal_wakeup)
+ __closure_wake_up(&c->journal.wait);
+ }
+
+@@ -825,6 +954,60 @@ static void journal_try_write(struct cache_set *c)
+ }
+ }
+
++static bool jset_space_available(struct cache_set *c, size_t sectors)
++{
++ size_t n, reserved;
++ bool last_writable_bucket;
++
++ n = min_t(size_t,
++ c->journal.blocks_free * c->cache->sb.block_size,
++ PAGE_SECTORS << JSET_BITS);
++
++ last_writable_bucket = last_writable_journal_bucket(c);
++
++ if (!last_writable_bucket || c->journal.in_replay)
++ reserved = 0;
++ else
++ reserved = c->journal.reserved;
++
++ if (sectors <= (n - reserved))
++ return true;
++
++ return false;
++}
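++
++/*
++ * Illustration (hypothetical numbers): with 8-sector blocks and
++ * blocks_free == 4, n is min(32, PAGE_SECTORS << JSET_BITS) sectors.
++ * On the last writable journal bucket outside replay,
++ * c->journal.reserved is subtracted from n first, so a 16-sector
++ * reserve leaves 16 fewer sectors for new jsets.
++ */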
++
++static bool journal_space_available(struct cache_set *c,
++ unsigned int nkeys)
++{
++ /*
++ * XXX: If we were inserting so many keys that they
++ * won't fit in an _empty_ journal write, we'll
++ * deadlock. For now, handle this in
++ * bch_keylist_realloc() - but something to think about.
++ */
++ if ((nkeys * sizeof(uint64_t)) >
++ (block_bytes(c->cache) - sizeof(struct jset))) {
++ pr_err("The keys to insert is bigger than an empty journal write.\n");
++ pr_err("keys in current journal write: %u, keys to insert: %u\n",
++ c->journal.cur->data->keys, nkeys);
++ BUG();
++ }
++
++ if (journal_full(&c->journal))
++ return false;
++
++ /*
++ * Before flushing current write (without the inserting keys)
++ * to get next empty write, it is still necessary to check
++ * whether there is enough free blocks in current journal bucket
++ * except for the reserved journal space.
++ */
++ if (jset_space_available(c, 0))
++ return true;
++
++ return false;
++}
++
+ static struct journal_write *journal_wait_for_write(struct cache_set *c,
+ unsigned int nkeys)
+ __acquires(&c->journal.lock)
+@@ -844,28 +1027,27 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
+ sectors = __set_blocks(w->data, w->data->keys + nkeys,
+ block_bytes(ca)) * ca->sb.block_size;
+
+- if (sectors <= min_t(size_t,
+- c->journal.blocks_free * ca->sb.block_size,
+- PAGE_SECTORS << JSET_BITS))
++ if (jset_space_available(c, sectors))
+ return w;
+
+ if (wait)
+ closure_wait(&c->journal.wait, &cl);
+
+- if (!journal_full(&c->journal)) {
+- if (wait)
+- trace_bcache_journal_entry_full(c);
+-
++ if (journal_space_available(c, nkeys)) {
+ /*
+- * XXX: If we were inserting so many keys that they
+- * won't fit in an _empty_ journal write, we'll
+- * deadlock. For now, handle this in
+- * bch_keylist_realloc() - but something to think about.
++ * Flush current non-empty write and try next
++ * empty one updated by journal_write_unlocked().
+ */
+- BUG_ON(!w->data->keys);
++ if (wait)
++ trace_bcache_journal_entry_full(c);
+
+ journal_try_write(c); /* unlocks */
+ } else {
++ /*
++ * No space to flush current write, try to reclaim
++ * an empty journal bucket and do all things again
++ * in next loop.
++ */
+ if (wait)
+ trace_bcache_journal_full(c);
+
+@@ -974,5 +1156,11 @@ int bch_journal_alloc(struct cache_set *c)
+ !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)))
+ return -ENOMEM;
+
++ /* decide how many sectors are reserved for journal replay */
++ if (JOURANL_RESERVE < c->cache->sb.bucket_size)
++ j->reserved = JOURANL_RESERVE;
++ else
++ j->reserved = c->cache->sb.bucket_size;
++
+ return 0;
+ }
+diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
+index f2ea34d5f431..bcaa4ce458ae 100644
+--- a/drivers/md/bcache/journal.h
++++ b/drivers/md/bcache/journal.h
+@@ -105,6 +105,7 @@ struct journal {
+ spinlock_t lock;
+ spinlock_t flush_write_lock;
+ bool btree_flushing;
++
+ /* used when waiting because the journal was full */
+ struct closure_waitlist wait;
+ struct closure io;
+@@ -119,6 +120,8 @@ struct journal {
+ BKEY_PADDED(key);
+
+ struct journal_write w[2], *cur;
++ bool in_replay;
++ int reserved;
+ };
+
+ /*
+@@ -161,6 +164,13 @@ struct journal_device {
+ #define journal_pin_cmp(c, l, r) \
+ (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
+
++/*
++ * Reserve 2 pages of space in case the journal space is full during
++ * initialization and a btree node split happens in journal replay.
++ * If JOURANL_RESERVE > bucket_size, then only reserve 1 bucket.
++ */
++#define JOURANL_RESERVE (PAGE_SECTORS * 2)
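++/*
++ * Illustration (assuming 4 KiB pages): PAGE_SECTORS is 8, so the
++ * reserve is 16 sectors (8 KiB), unless the bucket itself is smaller.
++ */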
++
+ #define JOURNAL_PIN 20000
+
+ #define journal_full(j) \
+--
+2.34.1
+
diff --git a/for-test/jouranl-deadlock/0002-more-fixes.patch b/for-test/jouranl-deadlock/0002-more-fixes.patch
new file mode 100644
index 0000000..c51e16b
--- /dev/null
+++ b/for-test/jouranl-deadlock/0002-more-fixes.patch
@@ -0,0 +1,131 @@
+From df1c455f2b0877ca7dbcec7fa06a0aca8ed825d8 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 21 Apr 2022 16:12:53 +0800
+Subject: [PATCH 2/2] more fixes
+
+---
+ Makefile | 2 +-
+ drivers/md/bcache/journal.c | 15 ++++++++++-----
+ drivers/md/bcache/request.c | 2 +-
+ drivers/md/bcache/super.c | 2 ++
+ drivers/md/bcache/util.c | 10 ++++++++--
+ 5 files changed, 22 insertions(+), 9 deletions(-)
+
+diff --git a/Makefile b/Makefile
+index 29e273d3f8cc..3abbd83b337c 100644
+--- a/Makefile
++++ b/Makefile
+@@ -2,7 +2,7 @@
+ VERSION = 5
+ PATCHLEVEL = 18
+ SUBLEVEL = 0
+-EXTRAVERSION = -rc2
++EXTRAVERSION = -rc2-bcache-journal
+ NAME = Superb Owl
+
+ # *DOCUMENTATION*
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index 5aac20c71b80..916141c69ec8 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -370,9 +370,10 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
+ struct journal_replay,
+ list)->j.seq;
+
+- /* Initial value of c->journal.blocks_free should be 0 */
+- BUG_ON(c->journal.blocks_free != 0);
+- ret = bch_journal_key_reload(c);
++ if (c->journal.blocks_free == 0) {
++ pr_info("c->journal.blocks_free is 0, reload journal_key\n");
++ ret = bch_journal_key_reload(c);
++ }
+
+ return ret;
+ #undef read_bucket
+@@ -900,12 +901,12 @@ static void journal_write_unlocked(struct closure *cl)
+
+ bio_reset(bio, ca->bdev, REQ_OP_WRITE |
+ REQ_SYNC | REQ_META | REQ_PREFLUSH | REQ_FUA);
+- bch_bio_map(bio, w->data);
+ bio->bi_iter.bi_sector = PTR_OFFSET(k, i);
+ bio->bi_iter.bi_size = sectors << 9;
+
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = w;
++ bch_bio_map(bio, w->data);
+
+ trace_bcache_journal_write(bio, w->data->keys);
+ bio_list_add(&list, bio);
+@@ -1002,9 +1003,12 @@ static bool journal_space_available(struct cache_set *c,
+ * whether there is enough free blocks in current journal bucket
+ * except for the reserved journal space.
+ */
+- if (jset_space_available(c, 0))
++ if (jset_space_available(c, 0)) {
++ pr_info("there is available jset space\n");
+ return true;
++ }
+
++ pr_info("NO available jset space\n");
+ return false;
+ }
+
+@@ -1027,6 +1031,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
+ sectors = __set_blocks(w->data, w->data->keys + nkeys,
+ block_bytes(ca)) * ca->sb.block_size;
+
++ pr_info("sectors from __set_blocks(): %lu\n", sectors);
+ if (jset_space_available(c, sectors))
+ return w;
+
+diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
+index fdd0194f84dd..320fcdfef48e 100644
+--- a/drivers/md/bcache/request.c
++++ b/drivers/md/bcache/request.c
+@@ -685,7 +685,7 @@ static void do_bio_hook(struct search *s,
+ {
+ struct bio *bio = &s->bio.bio;
+
+- bio_init_clone(bio->bi_bdev, bio, orig_bio, GFP_NOIO);
++ bio_init_clone(orig_bio->bi_bdev, bio, orig_bio, GFP_NOIO);
+ /*
+ * bi_end_io can be set separately somewhere else, e.g. the
+ * variants in,
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index bf3de149d3c9..efb9fae4354f 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -1077,7 +1077,9 @@ int bch_cached_dev_run(struct cached_dev *dc)
+ closure_sync(&cl);
+ }
+
++ pr_info("call add_disk(), d->disk: 0x%pK\n", d->disk);
+ ret = add_disk(d->disk);
++ pr_info("return from add_disk(): %d\n", ret);
+ if (ret)
+ goto out;
+ bd_link_disk_holder(dc->bdev, dc->disk.disk);
+diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
+index ae380bc3992e..f3c8b7db43ef 100644
+--- a/drivers/md/bcache/util.c
++++ b/drivers/md/bcache/util.c
+@@ -233,8 +233,14 @@ void bch_bio_map(struct bio *bio, void *base)
+ size_t size = bio->bi_iter.bi_size;
+ struct bio_vec *bv = bio->bi_io_vec;
+
+- BUG_ON(!bio->bi_iter.bi_size);
+- BUG_ON(bio->bi_vcnt);
++ if (!bio->bi_iter.bi_size) {
++ pr_err("BUG: bio->bi_iter.bi_size is 0\n");
++ BUG_ON(!bio->bi_iter.bi_size);
++ }
++ if (bio->bi_vcnt) {
++ pr_err("BUG: bio->bi_vcnt: %u\n", bio->bi_vcnt);
++ BUG_ON(bio->bi_vcnt);
++ }
+
+ bv->bv_offset = base ? offset_in_page(base) : 0;
+ goto start;
+--
+2.34.1
+
diff --git a/for-test/jouranl-deadlock/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch b/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch
index cfe5323..cfe5323 100644
--- a/for-test/jouranl-deadlock/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch
+++ b/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch
diff --git a/for-test/jouranl-deadlock/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch b/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch
index 39b9873..39b9873 100644
--- a/for-test/jouranl-deadlock/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch
+++ b/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch
diff --git a/for-test/jouranl-deadlock/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch b/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch
index 07050e9..07050e9 100644
--- a/for-test/jouranl-deadlock/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch
+++ b/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch