author     Coly Li <colyli@suse.de>                 2022-05-22 00:50:52 +0800
committer  Coly Li <colyli@suse.de>                 2022-05-22 00:50:52 +0800
commit     41347a6d6406e1297ae11c7eb003c0b284a25720 (patch)
tree       8cb4c47e1ed9ed66babe5ebde0d684b4f0c92145
parent     995eb52153c879646c1dedb21ff4d2683aa4966d (diff)
update for-test and for-next
70 files changed, 18939 insertions, 16 deletions
diff --git a/for-next/0001-bcache-improve-multithreaded-bch_btree_check.patch b/for-next/0001-bcache-improve-multithreaded-bch_btree_check.patch new file mode 100644 index 0000000..9fb59df --- /dev/null +++ b/for-next/0001-bcache-improve-multithreaded-bch_btree_check.patch @@ -0,0 +1,140 @@ +From ead990f754571c9492943b437014abab6894955c Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 21 May 2022 13:08:58 +0800 +Subject: [PATCH 1/4] bcache: improve multithreaded bch_btree_check() + +Commit 8e7102273f59 ("bcache: make bch_btree_check() to be +multithreaded") makes bch_btree_check() to be much faster when checking +all btree nodes during cache device registration. But it isn't in ideal +shap yet, still can be improved. + +This patch does the following thing to improve current parallel btree +nodes check by multiple threads in bch_btree_check(), +- Add read lock to root node while checking all the btree nodes with + multiple threads. Although currently it is not mandatory but it is + good to have a read lock in code logic. +- Remove local variable 'char name[32]', and generate kernel thread name + string directly when calling kthread_run(). +- Allocate local variable "struct btree_check_state check_state" on the + stack and avoid unnecessary dynamic memory allocation for it. +- Increase check_state->started to count created kernel thread after it + succeeds to create. +- When wait for all checking kernel threads to finish, use wait_event() + to replace wait_event_interruptible(). + +With this change, the code is more clear, and some potential error +conditions are avoided. + +Fixes: 8e7102273f59 ("bcache: make bch_btree_check() to be multithreaded") +Signed-off-by: Coly Li <colyli@suse.de> +Cc: stable@vger.kernel.org +--- + drivers/md/bcache/btree.c | 58 ++++++++++++++++++--------------------- + 1 file changed, 26 insertions(+), 32 deletions(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index ad9f16689419..2362bb8ef6d1 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -2006,8 +2006,7 @@ int bch_btree_check(struct cache_set *c) + int i; + struct bkey *k = NULL; + struct btree_iter iter; +- struct btree_check_state *check_state; +- char name[32]; ++ struct btree_check_state check_state; + + /* check and mark root node keys */ + for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) +@@ -2018,63 +2017,58 @@ int bch_btree_check(struct cache_set *c) + if (c->root->level == 0) + return 0; + +- check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL); +- if (!check_state) +- return -ENOMEM; +- +- check_state->c = c; +- check_state->total_threads = bch_btree_chkthread_nr(); +- check_state->key_idx = 0; +- spin_lock_init(&check_state->idx_lock); +- atomic_set(&check_state->started, 0); +- atomic_set(&check_state->enough, 0); +- init_waitqueue_head(&check_state->wait); ++ check_state.c = c; ++ check_state.total_threads = bch_btree_chkthread_nr(); ++ check_state.key_idx = 0; ++ spin_lock_init(&check_state.idx_lock); ++ atomic_set(&check_state.started, 0); ++ atomic_set(&check_state.enough, 0); ++ init_waitqueue_head(&check_state.wait); + ++ rw_lock(0, c->root, c->root->level); + /* + * Run multiple threads to check btree nodes in parallel, +- * if check_state->enough is non-zero, it means current ++ * if check_state.enough is non-zero, it means current + * running check threads are enough, unncessary to create + * more. 
+ */ +- for (i = 0; i < check_state->total_threads; i++) { +- /* fetch latest check_state->enough earlier */ ++ for (i = 0; i < check_state.total_threads; i++) { ++ /* fetch latest check_state.enough earlier */ + smp_mb__before_atomic(); +- if (atomic_read(&check_state->enough)) ++ if (atomic_read(&check_state.enough)) + break; + +- check_state->infos[i].result = 0; +- check_state->infos[i].state = check_state; +- snprintf(name, sizeof(name), "bch_btrchk[%u]", i); +- atomic_inc(&check_state->started); ++ check_state.infos[i].result = 0; ++ check_state.infos[i].state = &check_state; + +- check_state->infos[i].thread = ++ check_state.infos[i].thread = + kthread_run(bch_btree_check_thread, +- &check_state->infos[i], +- name); +- if (IS_ERR(check_state->infos[i].thread)) { ++ &check_state.infos[i], ++ "bch_btrchk[%d]", i); ++ if (IS_ERR(check_state.infos[i].thread)) { + pr_err("fails to run thread bch_btrchk[%d]\n", i); + for (--i; i >= 0; i--) +- kthread_stop(check_state->infos[i].thread); ++ kthread_stop(check_state.infos[i].thread); + ret = -ENOMEM; + goto out; + } ++ atomic_inc(&check_state.started); + } + + /* + * Must wait for all threads to stop. + */ +- wait_event_interruptible(check_state->wait, +- atomic_read(&check_state->started) == 0); ++ wait_event(check_state.wait, atomic_read(&check_state.started) == 0); + +- for (i = 0; i < check_state->total_threads; i++) { +- if (check_state->infos[i].result) { +- ret = check_state->infos[i].result; ++ for (i = 0; i < check_state.total_threads; i++) { ++ if (check_state.infos[i].result) { ++ ret = check_state.infos[i].result; + goto out; + } + } + + out: +- kfree(check_state); ++ rw_unlock(0, c->root); + return ret; + } + +-- +2.35.3 + diff --git a/for-next/0002-bcache-improve-multithreaded-bch_sectors_dirty_init.patch b/for-next/0002-bcache-improve-multithreaded-bch_sectors_dirty_init.patch new file mode 100644 index 0000000..2a05768 --- /dev/null +++ b/for-next/0002-bcache-improve-multithreaded-bch_sectors_dirty_init.patch @@ -0,0 +1,132 @@ +From 7ff9ba24404e797a53fd44ae4c21b2234d46ca39 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 21 May 2022 14:14:17 +0800 +Subject: [PATCH 2/4] bcache: improve multithreaded bch_sectors_dirty_init() + +Commit b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be +multithreaded") makes bch_sectors_dirty_init() to be much faster +when counting dirty sectors by iterating all dirty keys in the btree. +But it isn't in ideal shape yet, still can be improved. + +This patch does the following changes to improve current parallel dirty +keys iteration on the btree, +- Add read lock to root node when multiple threads iterating the btree, + to prevent the root node gets split by I/Os from other registered + bcache devices. +- Remove local variable "char name[32]" and generate kernel thread name + string directly when calling kthread_run(). +- Allocate "struct bch_dirty_init_state state" directly on stack and + avoid the unnecessary dynamic memory allocation for it. +- Increase &state->started to count created kernel thread after it + succeeds to create. +- When wait for all dirty key counting threads to finish, use + wait_event() to replace wait_event_interruptible(). + +With the above changes, the code is more clear, and some potential error +conditions are avoided. 
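For reference, the first two patches converge on the same worker-thread life cycle. The sketch below shows that pattern in isolation; it is a minimal, hypothetical example rather than the bcache code itself, and demo_state, demo_worker and demo_run are made-up names. The coordinating state lives on the caller's stack, ->started is incremented only after kthread_run() has succeeded, the last worker to finish wakes the coordinator, and the coordinator sleeps in wait_event() rather than wait_event_interruptible(), since returning early would abandon on-stack state that the workers still reference.

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/err.h>

struct demo_state {
	atomic_t		started;	/* number of live workers */
	wait_queue_head_t	wait;		/* coordinator sleeps here */
};

static int demo_worker(void *arg)
{
	struct demo_state *s = arg;

	/* ... per-thread work goes here ... */

	/* The last worker out wakes the coordinator. */
	if (atomic_dec_and_test(&s->started))
		wake_up(&s->wait);
	return 0;
}

static int demo_run(int nr_threads)
{
	struct demo_state s;		/* on stack, no kzalloc()/kfree() */
	struct task_struct *t;
	int i;

	atomic_set(&s.started, 0);
	init_waitqueue_head(&s.wait);

	for (i = 0; i < nr_threads; i++) {
		t = kthread_run(demo_worker, &s, "demo_worker[%d]", i);
		if (IS_ERR(t))
			break;	/* already-created workers finish normally */
		/* Count the thread only after it was actually created. */
		atomic_inc(&s.started);
	}

	/* Uninterruptible: 's' must stay valid until every worker is done. */
	wait_event(s.wait, atomic_read(&s.started) == 0);
	return 0;
}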
+ +Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded") +Signed-off-by: Coly Li <colyli@suse.de> +Cc: stable@vger.kernel.org +--- + drivers/md/bcache/writeback.c | 62 ++++++++++++++--------------------- + 1 file changed, 25 insertions(+), 37 deletions(-) + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 9ee0005874cd..d24c09490f8e 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -948,10 +948,10 @@ void bch_sectors_dirty_init(struct bcache_device *d) + struct btree_iter iter; + struct sectors_dirty_init op; + struct cache_set *c = d->c; +- struct bch_dirty_init_state *state; +- char name[32]; ++ struct bch_dirty_init_state state; + + /* Just count root keys if no leaf node */ ++ rw_lock(0, c->root, c->root->level); + if (c->root->level == 0) { + bch_btree_op_init(&op.op, -1); + op.inode = d->id; +@@ -961,54 +961,42 @@ void bch_sectors_dirty_init(struct bcache_device *d) + for_each_key_filter(&c->root->keys, + k, &iter, bch_ptr_invalid) + sectors_dirty_init_fn(&op.op, c->root, k); ++ rw_unlock(0, c->root); + return; + } + +- state = kzalloc(sizeof(struct bch_dirty_init_state), GFP_KERNEL); +- if (!state) { +- pr_warn("sectors dirty init failed: cannot allocate memory\n"); +- return; +- } +- +- state->c = c; +- state->d = d; +- state->total_threads = bch_btre_dirty_init_thread_nr(); +- state->key_idx = 0; +- spin_lock_init(&state->idx_lock); +- atomic_set(&state->started, 0); +- atomic_set(&state->enough, 0); +- init_waitqueue_head(&state->wait); +- +- for (i = 0; i < state->total_threads; i++) { +- /* Fetch latest state->enough earlier */ ++ state.c = c; ++ state.d = d; ++ state.total_threads = bch_btre_dirty_init_thread_nr(); ++ state.key_idx = 0; ++ spin_lock_init(&state.idx_lock); ++ atomic_set(&state.started, 0); ++ atomic_set(&state.enough, 0); ++ init_waitqueue_head(&state.wait); ++ ++ for (i = 0; i < state.total_threads; i++) { ++ /* Fetch latest state.enough earlier */ + smp_mb__before_atomic(); +- if (atomic_read(&state->enough)) ++ if (atomic_read(&state.enough)) + break; + +- state->infos[i].state = state; +- atomic_inc(&state->started); +- snprintf(name, sizeof(name), "bch_dirty_init[%d]", i); +- +- state->infos[i].thread = +- kthread_run(bch_dirty_init_thread, +- &state->infos[i], +- name); +- if (IS_ERR(state->infos[i].thread)) { ++ state.infos[i].state = &state; ++ state.infos[i].thread = ++ kthread_run(bch_dirty_init_thread, &state.infos[i], ++ "bch_dirtcnt[%d]", i); ++ if (IS_ERR(state.infos[i].thread)) { + pr_err("fails to run thread bch_dirty_init[%d]\n", i); + for (--i; i >= 0; i--) +- kthread_stop(state->infos[i].thread); ++ kthread_stop(state.infos[i].thread); + goto out; + } ++ atomic_inc(&state.started); + } + +- /* +- * Must wait for all threads to stop. +- */ +- wait_event_interruptible(state->wait, +- atomic_read(&state->started) == 0); +- + out: +- kfree(state); ++ /* Must wait for all threads to stop. 
*/ ++ wait_event(state.wait, atomic_read(&state.started) == 0); ++ rw_unlock(0, c->root); + } + + void bch_cached_dev_writeback_init(struct cached_dev *dc) +-- +2.35.3 + diff --git a/for-next/0003-bcache-remove-incremental-dirty-sector-counting-for-.patch b/for-next/0003-bcache-remove-incremental-dirty-sector-counting-for-.patch new file mode 100644 index 0000000..b11b7d4 --- /dev/null +++ b/for-next/0003-bcache-remove-incremental-dirty-sector-counting-for-.patch @@ -0,0 +1,138 @@ +From 8ffcbccd25f7f3edd157e9e2aa78e9b158bebb9b Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 21 May 2022 14:46:03 +0800 +Subject: [PATCH 3/4] bcache: remove incremental dirty sector counting for + bch_sectors_dirty_init() + +After making bch_sectors_dirty_init() being multithreaded, the existing +incremental dirty sector counting in bch_root_node_dirty_init() doesn't +release btree occupation after iterating 500000 (INIT_KEYS_EACH_TIME) +bkeys. Because a read lock is added on btree root node to prevent the +btree to be split during the dirty sectors counting, other I/O requester +has no chance to gain the write lock even restart bcache_btree(). + +That is to say, the incremental dirty sectors counting is incompatible +to the multhreaded bch_sectors_dirty_init(). We have to choose one and +drop another one. + +In my testing, with 512 bytes random writes, I generate 1.2T dirty data +and a btree with 400K nodes. With single thread and incremental dirty +sectors counting, it takes 30+ minites to register the backing device. +And with multithreaded dirty sectors counting, the backing device +registration can be accomplished within 2 minutes. + +The 30+ minutes V.S. 2- minutes difference makes me decide to keep +multithreaded bch_sectors_dirty_init() and drop the incremental dirty +sectors counting. This is what this patch does. + +But INIT_KEYS_EACH_TIME is kept, in sectors_dirty_init_fn() the CPU +will be released by cond_resched() after every INIT_KEYS_EACH_TIME keys +iterated. This is to avoid the watchdog reports a bogus soft lockup +warning. 
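The scheduling behaviour the patch keeps is captured by the small sketch below. Only the every-500000-keys cond_resched() mirrors the patch; DEMO_KEYS_EACH_TIME and demo_count_keys() are illustrative names. Instead of aborting the btree walk with -EAGAIN and restarting it later, the walker stays in the tree and merely yields the CPU periodically, which is enough to keep the soft-lockup watchdog quiet.

#include <linux/sched.h>

#define DEMO_KEYS_EACH_TIME	500000

static void demo_count_keys(unsigned long nr_keys)
{
	unsigned long count = 0;

	while (count < nr_keys) {
		/* ... account one dirty key ... */
		count++;
		if (!(count % DEMO_KEYS_EACH_TIME))
			cond_resched();	/* yield the CPU now and then */
	}
}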
+ +Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded") +Signed-off-by: Coly Li <colyli@suse.de> +Cc: stable@vger.kernel.org +--- + drivers/md/bcache/writeback.c | 41 +++++++++++------------------------ + 1 file changed, 13 insertions(+), 28 deletions(-) + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index d24c09490f8e..75b71199800d 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -805,13 +805,11 @@ static int bch_writeback_thread(void *arg) + + /* Init */ + #define INIT_KEYS_EACH_TIME 500000 +-#define INIT_KEYS_SLEEP_MS 100 + + struct sectors_dirty_init { + struct btree_op op; + unsigned int inode; + size_t count; +- struct bkey start; + }; + + static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, +@@ -827,11 +825,8 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, + KEY_START(k), KEY_SIZE(k)); + + op->count++; +- if (atomic_read(&b->c->search_inflight) && +- !(op->count % INIT_KEYS_EACH_TIME)) { +- bkey_copy_key(&op->start, k); +- return -EAGAIN; +- } ++ if (!(op->count % INIT_KEYS_EACH_TIME)) ++ cond_resched(); + + return MAP_CONTINUE; + } +@@ -846,24 +841,16 @@ static int bch_root_node_dirty_init(struct cache_set *c, + bch_btree_op_init(&op.op, -1); + op.inode = d->id; + op.count = 0; +- op.start = KEY(op.inode, 0, 0); +- +- do { +- ret = bcache_btree(map_keys_recurse, +- k, +- c->root, +- &op.op, +- &op.start, +- sectors_dirty_init_fn, +- 0); +- if (ret == -EAGAIN) +- schedule_timeout_interruptible( +- msecs_to_jiffies(INIT_KEYS_SLEEP_MS)); +- else if (ret < 0) { +- pr_warn("sectors dirty init failed, ret=%d!\n", ret); +- break; +- } +- } while (ret == -EAGAIN); ++ ++ ret = bcache_btree(map_keys_recurse, ++ k, ++ c->root, ++ &op.op, ++ &KEY(op.inode, 0, 0), ++ sectors_dirty_init_fn, ++ 0); ++ if (ret < 0) ++ pr_warn("sectors dirty init failed, ret=%d!\n", ret); + + return ret; + } +@@ -907,7 +894,6 @@ static int bch_dirty_init_thread(void *arg) + goto out; + } + skip_nr--; +- cond_resched(); + } + + if (p) { +@@ -917,7 +903,6 @@ static int bch_dirty_init_thread(void *arg) + + p = NULL; + prev_idx = cur_idx; +- cond_resched(); + } + + out: +@@ -956,11 +941,11 @@ void bch_sectors_dirty_init(struct bcache_device *d) + bch_btree_op_init(&op.op, -1); + op.inode = d->id; + op.count = 0; +- op.start = KEY(op.inode, 0, 0); + + for_each_key_filter(&c->root->keys, + k, &iter, bch_ptr_invalid) + sectors_dirty_init_fn(&op.op, c->root, k); ++ + rw_unlock(0, c->root); + return; + } +-- +2.35.3 + diff --git a/for-next/0004-bcache-avoid-journal-no-space-deadlock-by-reserving-.patch b/for-next/0004-bcache-avoid-journal-no-space-deadlock-by-reserving-.patch new file mode 100644 index 0000000..aabe732 --- /dev/null +++ b/for-next/0004-bcache-avoid-journal-no-space-deadlock-by-reserving-.patch @@ -0,0 +1,148 @@ +From 27029e1e8f064bc8541308c807d3ee579d86811d Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 21 May 2022 22:55:46 +0800 +Subject: [PATCH 4/4] bcache: avoid journal no-space deadlock by reserving 1 + journal bucket + +The journal no-space deadlock was reported time to time. Such deadlock +can happen in the following situation. + +When all journal buckets are fully filled by active jset with heavy +write I/O load, the cache set registration (after a reboot) will load +all active jsets and inserting them into the btree again (which is +called journal replay). 
If a journaled bkey is inserted into a btree +node and results btree node split, new journal request might be +triggered. For example, the btree grows one more level after the node +split, then the root node record in cache device super block will be +upgrade by bch_journal_meta() from bch_btree_set_root(). But there is no +space in journal buckets, the journal replay has to wait for new journal +bucket to be reclaimed after at least one journal bucket replayed. This +is one example that how the journal no-space deadlock happens. + +The solution to avoid the deadlock is to reserve 1 journal bucket in +run time, and only permit the reserved journal bucket to be used during +cache set registration procedure for things like journal replay. Then +the journal space will never be fully filled, there is no chance for +journal no-space deadlock to happen anymore. + +This patch adds a new member "bool do_reserve" in struct journal, it is +inititalized to 0 (false) when struct journal is allocated, and set to +1 (true) by bch_journal_space_reserve() when all initialization done in +run_cache_set(). In the run time when journal_reclaim() tries to +allocate a new journal bucket, free_journal_buckets() is called to check +whether there are enough free journal buckets to use. If there is only +1 free journal bucket and journal->do_reserve is 1 (true), the last +bucket is reserved and free_journal_buckets() will return 0 to indicate +no free journal bucket. Then journal_reclaim() will give up, and try +next time to see whetheer there is free journal bucket to allocate. By +this method, there is always 1 jouranl bucket reserved in run time. + +During the cache set registration, journal->do_reserve is 0 (false), so +the reserved journal bucket can be used to avoid the no-space deadlock. 
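The arithmetic behind the reservation is small enough to show on its own. In the sketch below, free_slots() is an illustrative stand-in for free_journal_buckets(), pulled out of its cache_set context; the formula itself mirrors the patch. The journal buckets form a ring, so the function measures the distance from cur_idx forward to discard_idx, holds one slot back unconditionally, and holds a second slot back once do_reserve has been switched on by bch_journal_space_reserve(). For example, with 8 journal buckets and cur_idx == discard_idx the whole ring is counted as free (n = 8): the function returns 7 during registration, while do_reserve is still false, but only 6 at run time.

#include <linux/types.h>

static unsigned int free_slots(unsigned int njournal_buckets,
			       unsigned int cur_idx,
			       unsigned int discard_idx,
			       bool do_reserve)
{
	unsigned int n;

	/* Ring distance from cur_idx forward to discard_idx. */
	if (cur_idx >= discard_idx)
		n = njournal_buckets + discard_idx - cur_idx;
	else
		n = discard_idx - cur_idx;

	/* Hold back one slot, plus one more once do_reserve is set. */
	if (n > (1 + do_reserve))
		return n - (1 + do_reserve);

	return 0;
}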
+ +Reported-by: Nikhil Kshirsagar <nkshirsagar@gmail.com> +Signed-off-by: Coly Li <colyli@suse.de> +Cc: stable@vger.kernel.org +--- + drivers/md/bcache/journal.c | 31 ++++++++++++++++++++++++++----- + drivers/md/bcache/journal.h | 2 ++ + drivers/md/bcache/super.c | 1 + + 3 files changed, 29 insertions(+), 5 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index df5347ea450b..e5da469a4235 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -405,6 +405,11 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) + return ret; + } + ++void bch_journal_space_reserve(struct journal *j) ++{ ++ j->do_reserve = true; ++} ++ + /* Journalling */ + + static void btree_flush_write(struct cache_set *c) +@@ -621,12 +626,30 @@ static void do_journal_discard(struct cache *ca) + } + } + ++static unsigned int free_journal_buckets(struct cache_set *c) ++{ ++ struct journal *j = &c->journal; ++ struct cache *ca = c->cache; ++ struct journal_device *ja = &c->cache->journal; ++ unsigned int n; ++ ++ /* In case njournal_buckets is not power of 2 */ ++ if (ja->cur_idx >= ja->discard_idx) ++ n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx; ++ else ++ n = ja->discard_idx - ja->cur_idx; ++ ++ if (n > (1 + j->do_reserve)) ++ return n - (1 + j->do_reserve); ++ ++ return 0; ++} ++ + static void journal_reclaim(struct cache_set *c) + { + struct bkey *k = &c->journal.key; + struct cache *ca = c->cache; + uint64_t last_seq; +- unsigned int next; + struct journal_device *ja = &ca->journal; + atomic_t p __maybe_unused; + +@@ -649,12 +672,10 @@ static void journal_reclaim(struct cache_set *c) + if (c->journal.blocks_free) + goto out; + +- next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; +- /* No space available on this device */ +- if (next == ja->discard_idx) ++ if (!free_journal_buckets(c)) + goto out; + +- ja->cur_idx = next; ++ ja->cur_idx = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + k->ptr[0] = MAKE_PTR(0, + bucket_to_sector(c, ca->sb.d[ja->cur_idx]), + ca->sb.nr_this_dev); +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index f2ea34d5f431..cd316b4a1e95 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -105,6 +105,7 @@ struct journal { + spinlock_t lock; + spinlock_t flush_write_lock; + bool btree_flushing; ++ bool do_reserve; + /* used when waiting because the journal was full */ + struct closure_waitlist wait; + struct closure io; +@@ -182,5 +183,6 @@ int bch_journal_replay(struct cache_set *c, struct list_head *list); + + void bch_journal_free(struct cache_set *c); + int bch_journal_alloc(struct cache_set *c); ++void bch_journal_space_reserve(struct journal *j); + + #endif /* _BCACHE_JOURNAL_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index bf3de149d3c9..2bb55278d22d 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2128,6 +2128,7 @@ static int run_cache_set(struct cache_set *c) + + flash_devs_run(c); + ++ bch_journal_space_reserve(&c->journal); + set_bit(CACHE_SET_RUNNING, &c->flags); + return 0; + err: +-- +2.35.3 + diff --git a/for-next/nvmpg-bcache-btree/draft/0001-bcache-add-initial-data-structures-for-nvm-pages.patch b/for-next/nvmpg-bcache-btree/draft/0001-bcache-add-initial-data-structures-for-nvm-pages.patch new file mode 100644 index 0000000..fba652d --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0001-bcache-add-initial-data-structures-for-nvm-pages.patch @@ -0,0 +1,343 @@ +From 
d5ca176bc66727740baa4c80ba1349ba25dc95f7 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 26 Jul 2021 00:26:28 +0800 +Subject: [PATCH 01/13] bcache: add initial data structures for nvm pages + +This patch initializes the prototype data structures for nvm pages +allocator, + +- struct bch_nvmpg_sb + This is the super block allocated on each nvdimm namespace for the nvm +pages allocator. A nvdimm pages allocator set may have multiple name- +spaces, bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this +namespace belongs to. + +- struct bch_nvmpg_header + This is a table for all heads of all allocation record lists. An allo- +cation record list traces all page(s) allocated from nvdimm namespace(s) +to a specific requester (identified by uuid). After system reboot, a +requester can retrieve all previously allocated nvdimm pages from its +record list by a pre-defined uuid. + +- struct bch_nvmpg_head + This is a head of an allocation record list. Each nvdimm pages +requester (typically it's a driver) has and only has one allocation +record list, and an allocated nvdimm page only belongs to a specific +allocation record list. Member uuid[] will be set as the requester's +uuid, e.g. for bcache it is the cache set uuid. Member label is not +mandatory, it is a human-readable string for debug purpose. The nvm +offset format pointers recs_offset[] point to the location of actual +allocator record lists on each namespace of the nvdimm pages allocator +set. Each per namespace record list is represented by the following +struct bch_nvmpg_recs. + +- struct bch_nvmpg_recs + This structure represents a requester's allocation record list. Member +uuid is same value as the uuid of its corresponding struct +bch_nvmpg_head. Member recs[] is a table of struct bch_pgalloc_rec +objects to trace all allocated nvmdimm pages. If the table recs[] is +full, the nvmpg format offset is a pointer points to the next struct +bch_nvmpg_recs object, nvm pages allocator will look for available free +allocation record there. All the linked struct bch_nvmpg_recs objects +compose a requester's alloction record list which is headed by the above +struct bch_nvmpg_head. + +- struct bch_nvmpg_recs + This structure records a range of allocated nvdimm pages. Member pgoff +is offset in unit of page size of this allocation range. Member order +indicates size of the allocation range by (1 << order) in unit of page +size. Because the nvdimm pages allocator set may have multiple nvdimm +namespaces, member ns_id is used to identify which namespace the pgoff +belongs to. + - Bits 0 - 51: pgoff - is pages offset of the allocated pages. + - Bits 52 - 57: order - allocaed size in page_size * order-of-2 + - Bits 58 - 60: ns_id - identify which namespace the pages stays on + - Bits 61 - 63: reserved. +Since each of the allocated nvm pages are power of 2, using 6 bits to +represent allocated size can have (1<<(1<<64) - 1) * PAGE_SIZE maximum +value. It can be a 76 bits width range size in byte for 4KB page size, +which is large enough currently. + +All the structure members having _offset suffix are in a special fomat. +E.g. bch_nvmpg_sb.{sb_offset, pages_offset, set_header_offset}, +bch_nvmpg_head.recs_offset, bch_nvmpg_recs.{head_offset, next_offset}, +the offset value is 64bit, the most significant 3 bits are used to +identify which namespace this offset belongs to, and the rested 61 bits +are actual offset inside the namespace. 
Following patches will have +helper routines to do the conversion between memory pointer and offset. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Ying Huang <ying.huang@intel.com> +--- + drivers/md/bcache/nvmpg_format.h | 253 +++++++++++++++++++++++++++++++ + 1 file changed, 253 insertions(+) + create mode 100644 drivers/md/bcache/nvmpg_format.h + +diff --git a/drivers/md/bcache/nvmpg_format.h b/drivers/md/bcache/nvmpg_format.h +new file mode 100644 +index 000000000000..e9eb6371fd78 +--- /dev/null ++++ b/drivers/md/bcache/nvmpg_format.h +@@ -0,0 +1,253 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++ ++#ifndef _NVMPG_FORMAT_H ++#define _NVMPG_FORMAT_H ++ ++/* ++ * Bcache on NVDIMM data structures ++ */ ++ ++/* ++ * - struct bch_nvmpg_sb ++ * This is the super block allocated on each nvdimm namespace for the nvm ++ * pages allocator. A nvdimm pages allocator set may have multiple namespaces, ++ * bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this name space ++ * belongs to. ++ * ++ * - struct bch_nvmpg_header ++ * This is a table for all heads of all allocation record lists. An allo- ++ * cation record list traces all page(s) allocated from nvdimm namespace(s) to ++ * a specific requester (identified by uuid). After system reboot, a requester ++ * can retrieve all previously allocated nvdimm pages from its record list by a ++ * pre-defined uuid. ++ * ++ * - struct bch_nvmpg_head ++ * This is a head of an allocation record list. Each nvdimm pages requester ++ * (typically it's a driver) has and only has one allocation record list, and ++ * an allocated nvdimm page only bedlones to a specific allocation record list. ++ * Member uuid[] will be set as the requester's uuid, e.g. for bcache it is the ++ * cache set uuid. Member label is not mandatory, it is a human-readable string ++ * for debug purpose. The nvm offset format pointers recs_offset[] point to the ++ * location of actual allocator record lists on each name space of the nvdimm ++ * pages allocator set. Each per name space record list is represented by the ++ * following struct bch_nvmpg_recs. ++ * ++ * - struct bch_nvmpg_recs ++ * This structure represents a requester's allocation record list. Member uuid ++ * is same value as the uuid of its corresponding struct bch_nvmpg_head. Member ++ * recs[] is a table of struct bch_pgalloc_rec objects to trace all allocated ++ * nvmdimm pages. If the table recs[] is full, the nvmpg format offset is a ++ * pointer points to the next struct bch_nvmpg_recs object, nvm pages allocator ++ * will look for available free allocation record there. All the linked ++ * struct bch_nvmpg_recs objects compose a requester's alloction record list ++ * which is headed by the above struct bch_nvmpg_head. ++ * ++ * - struct bch_nvmpg_rec ++ * This structure records a range of allocated nvdimm pages. Member pgoff is ++ * offset in unit of page size of this allocation range. Member order indicates ++ * size of the allocation range by (1 << order) in unit of page size. Because ++ * the nvdimm pages allocator set may have multiple nvdimm name spaces, member ++ * ns_id is used to identify which name space the pgoff belongs to. ++ * ++ * All allocation record lists are stored on the first initialized nvdimm name- ++ * space (ns_id 0). 
The meta data default layout of nvm pages allocator on ++ * namespace 0 is, ++ * ++ * 0 +---------------------------------+ ++ * | | ++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET ++ * | bch_nvmpg_sb | ++ * 8KB +---------------------------------+ <-- BCH_NVMPG_RECLIST_HEAD_OFFSET ++ * | bch_nvmpg_header | ++ * | | ++ * 16KB +---------------------------------+ <-- BCH_NVMPG_SYSRECS_OFFSET ++ * | bch_nvmpg_recs | ++ * | (nvm pages internal usage) | ++ * 24KB +---------------------------------+ ++ * | | ++ * | | ++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START ++ * | allocable nvm pages | ++ * | for buddy allocator | ++ * end +---------------------------------+ ++ * ++ * ++ * ++ * Meta data default layout on rested nvdimm namespaces, ++ * ++ * 0 +---------------------------------+ ++ * | | ++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET ++ * | bch_nvmpg_sb | ++ * 8KB +---------------------------------+ ++ * | | ++ * | | ++ * | | ++ * | | ++ * | | ++ * | | ++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START ++ * | allocable nvm pages | ++ * | for buddy allocator | ++ * end +---------------------------------+ ++ * ++ * ++ * - The nvmpg offset format pointer ++ * All member names ending with _offset in this header are nvmpg offset ++ * format pointer. The offset format is, ++ * [highest 3 bits: ns_id] ++ * [rested 61 bits: offset in No. ns_id namespace] ++ * ++ * The above offset is byte unit, the procedure to reference a nvmpg offset ++ * format pointer is, ++ * 1) Identify the namespace related in-memory structure by ns_id from the ++ * highest 3 bits of offset value. ++ * 2) Get the DAX mapping base address from the in-memory structure. ++ * 3) Calculate the actual memory address on nvdimm by plusing the DAX base ++ * address with offset value in rested low 61 bits. ++ * All related in-memory structure and conversion routines don't belong to ++ * user space api, they are defined by nvm-pages allocator code in ++ * drivers/md/bcache/nvm-pages.{c,h} ++ * ++ */ ++ ++#include <linux/types.h> ++ ++/* In sectors */ ++#define BCH_NVMPG_SB_OFFSET 4096 ++#define BCH_NVMPG_START (16 << 20) ++ ++#define BCH_NVMPG_LBL_SIZE 32 ++#define BCH_NVMPG_NS_MAX 8 ++ ++#define BCH_NVMPG_RECLIST_HEAD_OFFSET (8<<10) ++#define BCH_NVMPG_SYSRECS_OFFSET (16<<10) ++ ++#define BCH_NVMPG_SB_VERSION 0 ++#define BCH_NVMPG_SB_VERSION_MAX 0 ++ ++static const __u8 bch_nvmpg_magic[] = { ++ 0x17, 0xbd, 0x53, 0x7f, 0x1b, 0x23, 0xd6, 0x83, ++ 0x46, 0xa4, 0xf8, 0x28, 0x17, 0xda, 0xec, 0xa9 }; ++static const __u8 bch_nvmpg_recs_magic[] = { ++ 0x39, 0x25, 0x3f, 0xf7, 0x27, 0x17, 0xd0, 0xb9, ++ 0x10, 0xe6, 0xd2, 0xda, 0x38, 0x68, 0x26, 0xae }; ++ ++/* takes 64bit width */ ++struct bch_nvmpg_rec { ++ union { ++ struct { ++ __u64 pgoff:52; ++ __u64 order:6; ++ __u64 ns_id:3; ++ __u64 reserved:3; ++ }; ++ __u64 _v; ++ }; ++}; ++ ++struct bch_nvmpg_recs { ++ union { ++ struct { ++ /* ++ * A nvmpg offset format pointer to ++ * struct bch_nvmpg_head ++ */ ++ __u64 head_offset; ++ /* ++ * A nvmpg offset format pointer to ++ * struct bch_nvm_pgalloc_recs which contains ++ * the next recs[] array. 
++ */ ++ __u64 next_offset; ++ __u8 magic[16]; ++ __u8 uuid[16]; ++ __u32 size; ++ __u32 used; ++ __u64 _pad[4]; ++ struct bch_nvmpg_rec recs[]; ++ }; ++ __u8 pad[8192]; ++ }; ++}; ++ ++#define BCH_NVMPG_MAX_RECS \ ++ ((sizeof(struct bch_nvmpg_recs) - \ ++ offsetof(struct bch_nvmpg_recs, recs)) / \ ++ sizeof(struct bch_nvmpg_rec)) ++ ++#define BCH_NVMPG_HD_STAT_FREE 0x0 ++#define BCH_NVMPG_HD_STAT_ALLOC 0x1 ++struct bch_nvmpg_head { ++ __u8 uuid[16]; ++ __u8 label[BCH_NVMPG_LBL_SIZE]; ++ __u32 state; ++ __u32 flags; ++ /* ++ * Array of offset values from the nvmpg offset format ++ * pointers, each of the pointer points to a per-namespace ++ * struct bch_nvmpg_recs. ++ */ ++ __u64 recs_offset[BCH_NVMPG_NS_MAX]; ++}; ++ ++/* heads[0] is always for nvm_pages internal usage */ ++struct bch_nvmpg_set_header { ++ union { ++ struct { ++ __u32 size; ++ __u32 used; ++ __u64 _pad[4]; ++ struct bch_nvmpg_head heads[]; ++ }; ++ __u8 pad[8192]; ++ }; ++}; ++ ++#define BCH_NVMPG_MAX_HEADS \ ++ ((sizeof(struct bch_nvmpg_set_header) - \ ++ offsetof(struct bch_nvmpg_set_header, heads)) / \ ++ sizeof(struct bch_nvmpg_head)) ++ ++/* The on-media bit order is local CPU order */ ++struct bch_nvmpg_sb { ++ __u64 csum; ++ __u64 sb_offset; ++ __u64 ns_start; ++ __u64 version; ++ __u8 magic[16]; ++ __u8 uuid[16]; ++ __u32 page_size; ++ __u32 total_ns; ++ __u32 this_ns; ++ union { ++ __u8 set_uuid[16]; ++ __u64 set_magic; ++ }; ++ ++ __u64 flags; ++ __u64 seq; ++ ++ __u64 feature_compat; ++ __u64 feature_incompat; ++ __u64 feature_ro_compat; ++ ++ /* For allocable nvm pages from buddy systems */ ++ __u64 pages_offset; ++ __u64 pages_total; ++ ++ __u64 pad[8]; ++ ++ /* ++ * A nvmpg offset format pointer, it points ++ * to struct bch_nvmpg_set_header which is ++ * stored only on the first name space. ++ */ ++ __u64 set_header_offset; ++ ++ /* Just for csum_set() */ ++ __u32 keys; ++ __u64 d[0]; ++}; ++ ++#endif /* _NVMPG_FORMAT_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0002-bcache-initialize-the-nvm-pages-allocator.patch b/for-next/nvmpg-bcache-btree/draft/0002-bcache-initialize-the-nvm-pages-allocator.patch new file mode 100644 index 0000000..485a6e0 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0002-bcache-initialize-the-nvm-pages-allocator.patch @@ -0,0 +1,543 @@ +From d0a096b054485476b6788ae2a071c036dcffc248 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Mon, 26 Jul 2021 10:33:30 +0800 +Subject: [PATCH 02/13] bcache: initialize the nvm pages allocator + +This patch define the prototype data structures in memory and +initializes the nvm pages allocator. + +The nvm address space which is managed by this allocator can consist of +many nvm namespaces, and some namespaces can compose into one nvm set, +like cache set. For this initial implementation, only one set can be +supported. + +The users of this nvm pages allocator need to call register_namespace() +to register the nvdimm device (like /dev/pmemX) into this allocator as +the instance of struct nvm_namespace. 
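The "nvmpg offset format" pointers defined in the previous patch and consumed here by bch_nvmpg_offset_to_ptr()/bch_nvmpg_ptr_to_offset() pack a namespace id and a byte offset into one 64-bit value. The sketch below shows that encoding on its own, using the same 3-bit/61-bit split as the BCH_NVMPG_* macros in nvmpg.h below; the demo_* names are illustrative. Dereferencing such a pointer then amounts to looking up the namespace's DAX base address by ns_id and adding the low 61 bits, which is what bch_nvmpg_offset_to_ptr() does.

#include <linux/types.h>

#define DEMO_NS_ID_BITS		3
#define DEMO_OFFSET_BITS	61
#define DEMO_NS_ID_MASK		((1UL << DEMO_NS_ID_BITS) - 1)
#define DEMO_OFFSET_MASK	((1UL << DEMO_OFFSET_BITS) - 1)

/* Pack a namespace id and an in-namespace byte offset into one value. */
static inline u64 demo_encode(unsigned int ns_id, u64 offset)
{
	return ((u64)(ns_id & DEMO_NS_ID_MASK) << DEMO_OFFSET_BITS) |
	       (offset & DEMO_OFFSET_MASK);
}

/* Top 3 bits select the namespace. */
static inline unsigned int demo_ns_id(u64 nvmpg_offset)
{
	return (nvmpg_offset >> DEMO_OFFSET_BITS) & DEMO_NS_ID_MASK;
}

/* Low 61 bits are the byte offset inside that namespace. */
static inline u64 demo_offset(u64 nvmpg_offset)
{
	return nvmpg_offset & DEMO_OFFSET_MASK;
}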
+ +Reported-by: Randy Dunlap <rdunlap@infradead.org> +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/Kconfig | 10 ++ + drivers/md/bcache/Makefile | 1 + + drivers/md/bcache/nvmpg.c | 341 +++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 97 +++++++++++ + drivers/md/bcache/super.c | 3 + + 5 files changed, 452 insertions(+) + create mode 100644 drivers/md/bcache/nvmpg.c + create mode 100644 drivers/md/bcache/nvmpg.h + +diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig +index cf3e8096942a..4a7c13e882bb 100644 +--- a/drivers/md/bcache/Kconfig ++++ b/drivers/md/bcache/Kconfig +@@ -36,3 +36,13 @@ config BCACHE_ASYNC_REGISTRATION + device path into this file will returns immediately and the real + registration work is handled in kernel work queue in asynchronous + way. ++ ++config BCACHE_NVM_PAGES ++ bool "NVDIMM support for bcache (EXPERIMENTAL)" ++ depends on BCACHE ++ depends on 64BIT ++ depends on LIBNVDIMM ++ depends on DAX ++ help ++ Allocate/release NV-memory pages for bcache and provide allocated pages ++ for each requestor after system reboot. +diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile +index 5b87e59676b8..276b33be5ad5 100644 +--- a/drivers/md/bcache/Makefile ++++ b/drivers/md/bcache/Makefile +@@ -5,3 +5,4 @@ obj-$(CONFIG_BCACHE) += bcache.o + bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ + io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o features.o ++bcache-$(CONFIG_BCACHE_NVM_PAGES) += nvmpg.o +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +new file mode 100644 +index 000000000000..be006a91e8bb +--- /dev/null ++++ b/drivers/md/bcache/nvmpg.c +@@ -0,0 +1,341 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * Nvdimm page-buddy allocator ++ * ++ * Copyright (c) 2021, Intel Corporation. ++ * Copyright (c) 2021, Qiaowei Ren <qiaowei.ren@intel.com>. ++ * Copyright (c) 2021, Jianpeng Ma <jianpeng.ma@intel.com>. 
++ */ ++ ++#include "bcache.h" ++#include "nvmpg.h" ++ ++#include <linux/slab.h> ++#include <linux/list.h> ++#include <linux/mutex.h> ++#include <linux/dax.h> ++#include <linux/pfn_t.h> ++#include <linux/libnvdimm.h> ++#include <linux/mm_types.h> ++#include <linux/err.h> ++#include <linux/pagemap.h> ++#include <linux/bitmap.h> ++#include <linux/blkdev.h> ++ ++struct bch_nvmpg_set *global_nvmpg_set; ++ ++void *bch_nvmpg_offset_to_ptr(unsigned long offset) ++{ ++ int ns_id = BCH_NVMPG_GET_NS_ID(offset); ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[ns_id]; ++ ++ if (offset == 0) ++ return NULL; ++ ++ ns_id = BCH_NVMPG_GET_NS_ID(offset); ++ ns = global_nvmpg_set->ns_tbl[ns_id]; ++ ++ if (ns) ++ return (void *)(ns->base_addr + BCH_NVMPG_GET_OFFSET(offset)); ++ ++ pr_err("Invalid ns_id %u\n", ns_id); ++ return NULL; ++} ++ ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) ++{ ++ int ns_id = ns->ns_id; ++ unsigned long offset = (unsigned long)(ptr - ns->base_addr); ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ ++static void release_ns_tbl(struct bch_nvmpg_set *set) ++{ ++ int i; ++ struct bch_nvmpg_ns *ns; ++ ++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { ++ ns = set->ns_tbl[i]; ++ if (ns) { ++ fs_put_dax(ns->dax_dev); ++ blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC); ++ set->ns_tbl[i] = NULL; ++ set->attached_ns--; ++ kfree(ns); ++ } ++ } ++ ++ if (set->attached_ns) ++ pr_err("unexpected attached_ns: %u\n", set->attached_ns); ++} ++ ++static void release_nvmpg_set(struct bch_nvmpg_set *set) ++{ ++ release_ns_tbl(set); ++ kfree(set); ++} ++ ++/* Namespace 0 contains all meta data of the nvmpg allocation set */ ++static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) ++{ ++ struct bch_nvmpg_set_header *set_header; ++ ++ if (ns->ns_id != 0) { ++ pr_err("unexpected ns_id %u for first nvmpg namespace.\n", ++ ns->ns_id); ++ return -EINVAL; ++ } ++ ++ set_header = bch_nvmpg_offset_to_ptr(ns->sb->set_header_offset); ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ global_nvmpg_set->set_header = set_header; ++ global_nvmpg_set->heads_size = set_header->size; ++ global_nvmpg_set->heads_used = set_header->used; ++ mutex_unlock(&global_nvmpg_set->lock); ++ ++ return 0; ++} ++ ++static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) ++{ ++ struct bch_nvmpg_sb *sb = ns->sb; ++ int rc = 0; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ ++ if (global_nvmpg_set->ns_tbl[sb->this_ns]) { ++ pr_err("ns_id %u already attached.\n", ns->ns_id); ++ rc = -EEXIST; ++ goto unlock; ++ } ++ ++ if (ns->ns_id != 0) { ++ pr_err("unexpected ns_id %u for first namespace.\n", ns->ns_id); ++ rc = -EINVAL; ++ goto unlock; ++ } ++ ++ if (global_nvmpg_set->attached_ns > 0) { ++ pr_err("multiple namespace attaching not supported yet\n"); ++ rc = -EOPNOTSUPP; ++ goto unlock; ++ } ++ ++ if ((global_nvmpg_set->attached_ns + 1) > sb->total_ns) { ++ pr_err("namespace counters error: attached %u > total %u\n", ++ global_nvmpg_set->attached_ns, ++ global_nvmpg_set->total_ns); ++ rc = -EINVAL; ++ goto unlock; ++ } ++ ++ memcpy(global_nvmpg_set->set_uuid, sb->set_uuid, 16); ++ global_nvmpg_set->ns_tbl[sb->this_ns] = ns; ++ global_nvmpg_set->attached_ns++; ++ global_nvmpg_set->total_ns = sb->total_ns; ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++ return rc; ++} ++ ++static int read_nvdimm_meta_super(struct block_device *bdev, ++ struct bch_nvmpg_ns *ns) ++{ ++ struct page *page; ++ struct bch_nvmpg_sb *sb; ++ uint64_t expected_csum = 0; ++ int r; ++ ++ page = 
read_cache_page_gfp(bdev->bd_inode->i_mapping, ++ BCH_NVMPG_SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); ++ ++ if (IS_ERR(page)) ++ return -EIO; ++ ++ sb = (struct bch_nvmpg_sb *) ++ (page_address(page) + offset_in_page(BCH_NVMPG_SB_OFFSET)); ++ ++ r = -EINVAL; ++ expected_csum = csum_set(sb); ++ if (expected_csum != sb->csum) { ++ pr_info("csum is not match with expected one\n"); ++ goto put_page; ++ } ++ ++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) { ++ pr_info("invalid bch_nvmpg_magic\n"); ++ goto put_page; ++ } ++ ++ if (sb->sb_offset != ++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_SB_OFFSET)) { ++ pr_info("invalid superblock offset 0x%llx\n", sb->sb_offset); ++ goto put_page; ++ } ++ ++ r = -EOPNOTSUPP; ++ if (sb->total_ns != 1) { ++ pr_info("multiple name space not supported yet.\n"); ++ goto put_page; ++ } ++ ++ ++ r = 0; ++ /* Necessary for DAX mapping */ ++ ns->page_size = sb->page_size; ++ ns->pages_total = sb->pages_total; ++ ++put_page: ++ put_page(page); ++ return r; ++} ++ ++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) ++{ ++ struct bch_nvmpg_ns *ns = NULL; ++ struct bch_nvmpg_sb *sb = NULL; ++ char buf[BDEVNAME_SIZE]; ++ struct block_device *bdev; ++ pgoff_t pgoff; ++ int id, err; ++ char *path; ++ long dax_ret = 0; ++ ++ path = kstrndup(dev_path, 512, GFP_KERNEL); ++ if (!path) { ++ pr_err("kstrndup failed\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ bdev = blkdev_get_by_path(strim(path), ++ FMODE_READ|FMODE_WRITE|FMODE_EXEC, ++ global_nvmpg_set); ++ if (IS_ERR(bdev)) { ++ pr_err("get %s error: %ld\n", dev_path, PTR_ERR(bdev)); ++ kfree(path); ++ return ERR_PTR(PTR_ERR(bdev)); ++ } ++ ++ err = -ENOMEM; ++ ns = kzalloc(sizeof(struct bch_nvmpg_ns), GFP_KERNEL); ++ if (!ns) ++ goto bdput; ++ ++ err = -EIO; ++ if (read_nvdimm_meta_super(bdev, ns)) { ++ pr_err("%s read nvdimm meta super block failed.\n", ++ bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EOPNOTSUPP; ++ ns->dax_dev = fs_dax_get_by_bdev(bdev); ++ if (!ns->dax_dev) { ++ pr_err("can't get dax device by %s\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ if (!dax_supported(ns->dax_dev, bdev, ns->page_size, 0, ++ bdev_nr_sectors(bdev))) { ++ pr_err("%s don't support DAX\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EINVAL; ++ if (bdev_dax_pgoff(bdev, 0, ns->page_size, &pgoff)) { ++ pr_err("invalid offset of %s\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EINVAL; ++ id = dax_read_lock(); ++ dax_ret = dax_direct_access(ns->dax_dev, pgoff, ns->pages_total, ++ &ns->base_addr, &ns->start_pfn); ++ if (dax_ret <= 0) { ++ pr_err("dax_direct_access error\n"); ++ dax_read_unlock(id); ++ goto free_ns; ++ } ++ ++ if (dax_ret < ns->pages_total) { ++ pr_warn("mapped range %ld is less than ns->pages_total %lu\n", ++ dax_ret, ns->pages_total); ++ } ++ dax_read_unlock(id); ++ ++ sb = (struct bch_nvmpg_sb *)(ns->base_addr + BCH_NVMPG_SB_OFFSET); ++ ++ err = -EINVAL; ++ /* Check magic again to make sure DAX mapping is correct */ ++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) { ++ pr_err("invalid bch_nvmpg_magic after DAX mapping\n"); ++ goto free_ns; ++ } ++ ++ if ((global_nvmpg_set->attached_ns > 0) && ++ memcmp(sb->set_uuid, global_nvmpg_set->set_uuid, 16)) { ++ pr_err("set uuid does not match with ns_id %u\n", ns->ns_id); ++ goto free_ns; ++ } ++ ++ if (sb->set_header_offset != ++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_RECLIST_HEAD_OFFSET)) { ++ pr_err("Invalid header offset: this_ns %u, ns_id %llu, offset 0x%llx\n", ++ sb->this_ns, ++ 
BCH_NVMPG_GET_NS_ID(sb->set_header_offset), ++ BCH_NVMPG_GET_OFFSET(sb->set_header_offset)); ++ goto free_ns; ++ } ++ ++ ns->page_size = sb->page_size; ++ ns->pages_offset = sb->pages_offset; ++ ns->pages_total = sb->pages_total; ++ ns->sb = sb; ++ ns->free = 0; ++ ns->bdev = bdev; ++ ns->set = global_nvmpg_set; ++ ++ err = attach_nvmpg_set(ns); ++ if (err < 0) ++ goto free_ns; ++ ++ mutex_init(&ns->lock); ++ ++ err = init_nvmpg_set_header(ns); ++ if (err < 0) ++ goto free_ns; ++ ++ kfree(path); ++ return ns; ++ ++free_ns: ++ fs_put_dax(ns->dax_dev); ++ kfree(ns); ++bdput: ++ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC); ++ kfree(path); ++ return ERR_PTR(err); ++} ++EXPORT_SYMBOL_GPL(bch_register_namespace); ++ ++int __init bch_nvmpg_init(void) ++{ ++ global_nvmpg_set = kzalloc(sizeof(*global_nvmpg_set), GFP_KERNEL); ++ if (!global_nvmpg_set) ++ return -ENOMEM; ++ ++ global_nvmpg_set->total_ns = 0; ++ mutex_init(&global_nvmpg_set->lock); ++ ++ pr_info("bcache nvm init\n"); ++ return 0; ++} ++ ++void bch_nvmpg_exit(void) ++{ ++ release_nvmpg_set(global_nvmpg_set); ++ pr_info("bcache nvm exit\n"); ++} +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +new file mode 100644 +index 000000000000..698c890b2d15 +--- /dev/null ++++ b/drivers/md/bcache/nvmpg.h +@@ -0,0 +1,97 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _BCACHE_NVM_PAGES_H ++#define _BCACHE_NVM_PAGES_H ++ ++#include <linux/libnvdimm.h> ++ ++#include "nvmpg_format.h" ++ ++/* ++ * Bcache NVDIMM in memory data structures ++ */ ++ ++/* ++ * The following three structures in memory records which page(s) allocated ++ * to which owner. After reboot from power failure, they will be initialized ++ * based on nvm pages superblock in NVDIMM device. ++ */ ++struct bch_nvmpg_ns { ++ struct bch_nvmpg_sb *sb; ++ void *base_addr; ++ ++ unsigned char uuid[16]; ++ int ns_id; ++ unsigned int page_size; ++ unsigned long free; ++ unsigned long pages_offset; ++ unsigned long pages_total; ++ pfn_t start_pfn; ++ ++ struct dax_device *dax_dev; ++ struct block_device *bdev; ++ struct bch_nvmpg_set *set; ++ ++ struct mutex lock; ++}; ++ ++/* ++ * A set of namespaces. Currently only one set can be supported. 
++ */ ++struct bch_nvmpg_set { ++ unsigned char set_uuid[16]; ++ ++ int heads_size; ++ int heads_used; ++ struct bch_nvmpg_set_header *set_header; ++ ++ struct bch_nvmpg_ns *ns_tbl[BCH_NVMPG_NS_MAX]; ++ int total_ns; ++ int attached_ns; ++ ++ struct mutex lock; ++}; ++ ++#define BCH_NVMPG_NS_ID_BITS 3 ++#define BCH_NVMPG_OFFSET_BITS 61 ++#define BCH_NVMPG_NS_ID_MASK ((1UL<<BCH_NVMPG_NS_ID_BITS) - 1) ++#define BCH_NVMPG_OFFSET_MASK ((1UL<<BCH_NVMPG_OFFSET_BITS) - 1) ++ ++#define BCH_NVMPG_GET_NS_ID(offset) \ ++ (((offset) >> BCH_NVMPG_OFFSET_BITS) & BCH_NVMPG_NS_ID_MASK) ++ ++#define BCH_NVMPG_GET_OFFSET(offset) ((offset) & BCH_NVMPG_OFFSET_MASK) ++ ++#define BCH_NVMPG_OFFSET(ns_id, offset) \ ++ ((((ns_id) & BCH_NVMPG_NS_ID_MASK) << BCH_NVMPG_OFFSET_BITS) | \ ++ ((offset) & BCH_NVMPG_OFFSET_MASK)) ++ ++/* Indicate which field in bch_nvmpg_sb to be updated */ ++#define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ ++ ++void *bch_nvmpg_offset_to_ptr(unsigned long offset); ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); ++int bch_nvmpg_init(void); ++void bch_nvmpg_exit(void); ++ ++#else ++ ++static inline struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) ++{ ++ return NULL; ++} ++ ++static inline int bch_nvmpg_init(void) ++{ ++ return 0; ++} ++ ++static inline void bch_nvmpg_exit(void) { } ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++#endif /* _BCACHE_NVM_PAGES_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index dc35f6e1d8d3..841d08e50191 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -14,6 +14,7 @@ + #include "request.h" + #include "writeback.h" + #include "features.h" ++#include "nvmpg.h" + + #include <linux/blkdev.h> + #include <linux/pagemap.h> +@@ -2811,6 +2812,7 @@ static void bcache_exit(void) + { + bch_debug_exit(); + bch_request_exit(); ++ bch_nvmpg_exit(); + if (bcache_kobj) + kobject_put(bcache_kobj); + if (bcache_wq) +@@ -2909,6 +2911,7 @@ static int __init bcache_init(void) + + bch_debug_init(); + closure_debug_init(); ++ bch_nvmpg_init(); + + bcache_is_reboot = false; + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0003-bcache-initialization-of-the-buddy.patch b/for-next/nvmpg-bcache-btree/draft/0003-bcache-initialization-of-the-buddy.patch new file mode 100644 index 0000000..395f285 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0003-bcache-initialization-of-the-buddy.patch @@ -0,0 +1,359 @@ +From c9977c3fd9e238ac5a8a684de5a8dc5c8a4462e2 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 19:45:57 +0800 +Subject: [PATCH 03/13] bcache: initialization of the buddy + +This nvm pages allocator will implement the simple buddy allocator to +anage the nvm address space. This patch initializes this buddy allocator +for new namespace. + +the unit of alloc/free of the buddy allocator is page. DAX device has +their struct page(in dram or PMEM). + + struct { /* ZONE_DEVICE pages */ + /** @pgmap: Points to the hosting device page map. */ + struct dev_pagemap *pgmap; + void *zone_device_data; + /* + * ZONE_DEVICE private pages are counted as being + * mapped so the next 3 words hold the mapping, index, + * and private fields from the source anonymous or + * page cache page while the page is migrated to device + * private memory. 
+ * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also + * use the mapping, index, and private fields when + * pmem backed DAX files are mapped. + */ + }; + +ZONE_DEVICE pages only use pgmap. Other 4 words[16/32 bytes] don't use. +So the second/third word will be used as 'struct list_head ' which list +in buddy. The fourth word(that is normal struct page::index) store pgoff +which the page-offset in the dax device. And the fifth word (that is +normal struct page::private) store order of buddy. page_type will be used +to store buddy flags. + +Reported-by: kernel test robot <lkp@intel.com> +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 212 +++++++++++++++++++++++++++++++++++++- + drivers/md/bcache/nvmpg.h | 12 +++ + 2 files changed, 221 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index be006a91e8bb..b51073588f65 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -50,6 +50,36 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) + return BCH_NVMPG_OFFSET(ns_id, offset); + } + ++static struct page *bch_nvmpg_va_to_pg(void *addr) ++{ ++ return virt_to_page(addr); ++} ++ ++static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff) ++{ ++ return ns->base_addr + (pgoff << PAGE_SHIFT); ++} ++ ++static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) ++{ ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id]; ++ pgoff_t pgoff = r->pgoff; ++ ++ return bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++} ++ ++static inline void reserve_nvmpg_pages(struct bch_nvmpg_ns *ns, ++ pgoff_t pgoff, u64 nr) ++{ ++ while (nr > 0) { ++ unsigned int num = nr > UINT_MAX ? 
UINT_MAX : nr; ++ ++ bitmap_set(ns->pages_bitmap, pgoff, num); ++ nr -= num; ++ pgoff += num; ++ } ++} ++ + static void release_ns_tbl(struct bch_nvmpg_set *set) + { + int i; +@@ -58,6 +88,10 @@ static void release_ns_tbl(struct bch_nvmpg_set *set) + for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { + ns = set->ns_tbl[i]; + if (ns) { ++ kvfree(ns->pages_bitmap); ++ if (ns->recs_bitmap) ++ bitmap_free(ns->recs_bitmap); ++ + fs_put_dax(ns->dax_dev); + blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC); + set->ns_tbl[i] = NULL; +@@ -76,10 +110,73 @@ static void release_nvmpg_set(struct bch_nvmpg_set *set) + kfree(set); + } + ++static int validate_recs(int ns_id, ++ struct bch_nvmpg_head *head, ++ struct bch_nvmpg_recs *recs) ++{ ++ if (memcmp(recs->magic, bch_nvmpg_recs_magic, 16)) { ++ pr_err("Invalid bch_nvmpg_recs magic\n"); ++ return -EINVAL; ++ } ++ ++ if (memcmp(recs->uuid, head->uuid, 16)) { ++ pr_err("Invalid bch_nvmpg_recs uuid\n"); ++ return -EINVAL; ++ } ++ ++ if (recs->head_offset != ++ bch_nvmpg_ptr_to_offset(global_nvmpg_set->ns_tbl[ns_id], head)) { ++ pr_err("Invalid recs head_offset\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int reserve_nvmpg_recs(struct bch_nvmpg_recs *recs) ++{ ++ int i, used = 0; ++ ++ for (i = 0; i < recs->size; i++) { ++ struct bch_nvmpg_rec *r = &recs->recs[i]; ++ struct bch_nvmpg_ns *ns; ++ struct page *page; ++ void *addr; ++ ++ if (r->pgoff == 0) ++ continue; ++ ++ ns = global_nvmpg_set->ns_tbl[r->ns_id]; ++ addr = bch_nvmpg_rec_to_ptr(r); ++ if (addr < ns->base_addr) { ++ pr_err("Invalid recorded address\n"); ++ return -EINVAL; ++ } ++ ++ /* init struct page: index/private */ ++ page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(page, r->order); ++ page->index = r->pgoff; ++ ++ reserve_nvmpg_pages(ns, r->pgoff, 1L << r->order); ++ used++; ++ } ++ ++ if (used != recs->used) { ++ pr_err("used %d doesn't match recs->used %d\n", ++ used, recs->used); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ + /* Namespace 0 contains all meta data of the nvmpg allocation set */ + static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_set_header *set_header; ++ struct bch_nvmpg_recs *sys_recs; ++ int i, j, used = 0, rc = 0; + + if (ns->ns_id != 0) { + pr_err("unexpected ns_id %u for first nvmpg namespace.\n", +@@ -93,9 +190,83 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + global_nvmpg_set->set_header = set_header; + global_nvmpg_set->heads_size = set_header->size; + global_nvmpg_set->heads_used = set_header->used; ++ ++ /* Reserve the used space from buddy allocator */ ++ reserve_nvmpg_pages(ns, 0, div_u64(ns->pages_offset, ns->page_size)); ++ ++ sys_recs = ns->base_addr + BCH_NVMPG_SYSRECS_OFFSET; ++ for (i = 0; i < set_header->size; i++) { ++ struct bch_nvmpg_head *head; ++ ++ head = &set_header->heads[i]; ++ if (head->state == BCH_NVMPG_HD_STAT_FREE) ++ continue; ++ ++ used++; ++ if (used > global_nvmpg_set->heads_size) { ++ pr_err("used heads %d > heads size %d.\n", ++ used, global_nvmpg_set->heads_size); ++ goto unlock; ++ } ++ ++ for (j = 0; j < BCH_NVMPG_NS_MAX; j++) { ++ struct bch_nvmpg_recs *recs; ++ ++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[j]); ++ ++ /* Iterate the recs list */ ++ while (recs) { ++ rc = validate_recs(j, head, recs); ++ if (rc < 0) ++ goto unlock; ++ ++ rc = reserve_nvmpg_recs(recs); ++ if (rc < 0) ++ goto unlock; ++ ++ bitmap_set(ns->recs_bitmap, recs - sys_recs, 1); ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++ } ++ } ++unlock: + 
mutex_unlock(&global_nvmpg_set->lock); ++ return rc; ++} + +- return 0; ++static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) ++{ ++ unsigned int start, end, pages; ++ int i; ++ struct page *page; ++ pgoff_t pgoff_start; ++ ++ bitmap_for_each_clear_region(ns->pages_bitmap, ++ start, end, 0, ns->pages_total) { ++ pgoff_start = start; ++ pages = end - start; ++ ++ while (pages) { ++ void *addr; ++ ++ for (i = BCH_MAX_ORDER - 1; i >= 0; i--) { ++ if ((pgoff_start % (1L << i) == 0) && ++ (pages >= (1L << i))) ++ break; ++ } ++ ++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start); ++ page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(page, i); ++ page->index = pgoff_start; ++ __SetPageBuddy(page); ++ list_add((struct list_head *)&page->zone_device_data, ++ &ns->free_area[i]); ++ ++ pgoff_start += 1L << i; ++ pages -= 1L << i; ++ } ++ } + } + + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) +@@ -200,7 +371,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + char buf[BDEVNAME_SIZE]; + struct block_device *bdev; + pgoff_t pgoff; +- int id, err; ++ int id, i, err; + char *path; + long dax_ret = 0; + +@@ -304,13 +475,48 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + + mutex_init(&ns->lock); + ++ /* ++ * parameters of bitmap_set/clear are unsigned int. ++ * Given currently size of nvm is far from exceeding this limit, ++ * so only add a WARN_ON message. ++ */ ++ WARN_ON(BITS_TO_LONGS(ns->pages_total) > UINT_MAX); ++ ns->pages_bitmap = kvcalloc(BITS_TO_LONGS(ns->pages_total), ++ sizeof(unsigned long), GFP_KERNEL); ++ if (!ns->pages_bitmap) { ++ err = -ENOMEM; ++ goto clear_ns_nr; ++ } ++ ++ if (ns->sb->this_ns == 0) { ++ ns->recs_bitmap = ++ bitmap_zalloc(BCH_MAX_PGALLOC_RECS, GFP_KERNEL); ++ if (ns->recs_bitmap == NULL) { ++ err = -ENOMEM; ++ goto free_pages_bitmap; ++ } ++ } ++ ++ for (i = 0; i < BCH_MAX_ORDER; i++) ++ INIT_LIST_HEAD(&ns->free_area[i]); ++ + err = init_nvmpg_set_header(ns); + if (err < 0) +- goto free_ns; ++ goto free_recs_bitmap; ++ ++ if (ns->sb->this_ns == 0) ++ /* init buddy allocator */ ++ bch_nvmpg_init_free_space(ns); + + kfree(path); + return ns; + ++free_recs_bitmap: ++ bitmap_free(ns->recs_bitmap); ++free_pages_bitmap: ++ kvfree(ns->pages_bitmap); ++clear_ns_nr: ++ global_nvmpg_set->ns_tbl[sb->this_ns] = NULL; + free_ns: + fs_put_dax(ns->dax_dev); + kfree(ns); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 698c890b2d15..55778d4db7da 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -11,6 +11,8 @@ + * Bcache NVDIMM in memory data structures + */ + ++#define BCH_MAX_ORDER 20 ++ + /* + * The following three structures in memory records which page(s) allocated + * to which owner. 
After reboot from power failure, they will be initialized +@@ -28,6 +30,11 @@ struct bch_nvmpg_ns { + unsigned long pages_total; + pfn_t start_pfn; + ++ unsigned long *pages_bitmap; ++ struct list_head free_area[BCH_MAX_ORDER]; ++ ++ unsigned long *recs_bitmap; ++ + struct dax_device *dax_dev; + struct block_device *bdev; + struct bch_nvmpg_set *set; +@@ -69,6 +76,11 @@ struct bch_nvmpg_set { + /* Indicate which field in bch_nvmpg_sb to be updated */ + #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ + ++#define BCH_MAX_PGALLOC_RECS \ ++ (min_t(unsigned int, 64, \ ++ (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ ++ sizeof(struct bch_nvmpg_recs))) ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset); + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch b/for-next/nvmpg-bcache-btree/draft/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch new file mode 100644 index 0000000..9667099 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch @@ -0,0 +1,309 @@ +From 8d0370253021430d3e59b084ce242a32410a51c0 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Wed, 4 Aug 2021 22:41:20 +0800 +Subject: [PATCH 04/13] bcache: bch_nvmpg_alloc_pages() of the buddy + +This patch implements bch_nvmpg_alloc_pages() of the nvm pages buddy +allocator. Functionally it is similar to the kernel page buddy allocator, +but with the following differences: +a: it takes an owner_uuid parameter which records the owner information, +and makes that information persistent. +b: it does not take GFP_* style flags; all allocations are treated equally. +c: it does not trigger other operations such as swap or page reclaim. + +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 222 ++++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 9 ++ + 2 files changed, 231 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index b51073588f65..8c0e827a98cd 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -42,6 +42,11 @@ void *bch_nvmpg_offset_to_ptr(unsigned long offset) + return NULL; + } + ++static unsigned long bch_nvmpg_offset_to_pgoff(unsigned long nvmpg_offset) ++{ ++ return BCH_NVMPG_GET_OFFSET(nvmpg_offset) >> PAGE_SHIFT; ++} ++ + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) + { + int ns_id = ns->ns_id; +@@ -60,6 +65,15 @@ static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff) + return ns->base_addr + (pgoff << PAGE_SHIFT); + } + ++static unsigned long bch_nvmpg_pgoff_to_offset(struct bch_nvmpg_ns *ns, ++ pgoff_t pgoff) ++{ ++ int ns_id = ns->ns_id; ++ unsigned long offset = pgoff << PAGE_SHIFT; ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ + static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) + { + struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id]; +@@ -269,6 +283,214 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + } + } + ++ ++/* Look up the head for uuid; if not found, create one when create == true */ ++static struct bch_nvmpg_head *find_nvmpg_head(const char *uuid, bool create) ++{ ++ struct bch_nvmpg_set_header *set_header =
global_nvmpg_set->set_header; ++ struct bch_nvmpg_head *head = NULL; ++ int i; ++ ++ if (set_header == NULL) ++ goto out; ++ ++ for (i = 0; i < set_header->size; i++) { ++ struct bch_nvmpg_head *h = &set_header->heads[i]; ++ ++ if (h->state != BCH_NVMPG_HD_STAT_ALLOC) ++ continue; ++ ++ if (!memcmp(uuid, h->uuid, 16)) { ++ head = h; ++ break; ++ } ++ } ++ ++ if (!head && create) { ++ u32 used = set_header->used; ++ ++ if (set_header->size > used) { ++ head = &set_header->heads[used]; ++ memset(head, 0, sizeof(struct bch_nvmpg_head)); ++ head->state = BCH_NVMPG_HD_STAT_ALLOC; ++ memcpy(head->uuid, uuid, 16); ++ global_nvmpg_set->heads_used++; ++ set_header->used++; ++ } else ++ pr_info("No free bch_nvmpg_head\n"); ++ } ++ ++out: ++ return head; ++} ++ ++static struct bch_nvmpg_recs *find_empty_nvmpg_recs(void) ++{ ++ unsigned int start; ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[0]; ++ struct bch_nvmpg_recs *recs; ++ ++ start = bitmap_find_next_zero_area(ns->recs_bitmap, ++ BCH_MAX_PGALLOC_RECS, 0, 1, 0); ++ if (start > BCH_MAX_PGALLOC_RECS) { ++ pr_info("No free struct bch_nvmpg_recs\n"); ++ return NULL; ++ } ++ ++ bitmap_set(ns->recs_bitmap, start, 1); ++ recs = (struct bch_nvmpg_recs *) ++ bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET) ++ + start; ++ ++ memset(recs, 0, sizeof(struct bch_nvmpg_recs)); ++ return recs; ++} ++ ++ ++static struct bch_nvmpg_recs *find_nvmpg_recs(struct bch_nvmpg_ns *ns, ++ struct bch_nvmpg_head *head, ++ bool create) ++{ ++ int ns_id = ns->sb->this_ns; ++ struct bch_nvmpg_recs *prev_recs = NULL, *recs = NULL; ++ ++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[ns_id]); ++ ++ /* If create=false, we return recs[nr] */ ++ if (!create) ++ return recs; ++ ++ /* ++ * If create=true, it mean we need a empty struct bch_nvmpg_rec ++ * So we should find non-empty struct bch_nvmpg_recs or alloc ++ * new struct bch_nvmpg_recs. 
And return this bch_nvmpg_recs ++ */ ++ while (recs && (recs->used == recs->size)) { ++ prev_recs = recs; ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++ ++ /* Found empty struct bch_nvmpg_recs */ ++ if (recs) ++ return recs; ++ ++ /* Need alloc new struct bch_nvmpg_recs */ ++ recs = find_empty_nvmpg_recs(); ++ if (recs) { ++ unsigned long offset; ++ ++ recs->next_offset = 0; ++ recs->head_offset = bch_nvmpg_ptr_to_offset(ns, head); ++ memcpy(recs->magic, bch_nvmpg_recs_magic, 16); ++ memcpy(recs->uuid, head->uuid, 16); ++ recs->size = BCH_NVMPG_MAX_RECS; ++ recs->used = 0; ++ ++ offset = bch_nvmpg_ptr_to_offset(ns, recs); ++ if (prev_recs) ++ prev_recs->next_offset = offset; ++ else ++ head->recs_offset[ns_id] = offset; ++ } ++ ++ return recs; ++} ++ ++static void add_nvmpg_rec(struct bch_nvmpg_ns *ns, ++ struct bch_nvmpg_recs *recs, ++ unsigned long nvmpg_offset, ++ int order) ++{ ++ int i, ns_id; ++ unsigned long pgoff; ++ ++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset); ++ ns_id = ns->sb->this_ns; ++ ++ for (i = 0; i < recs->size; i++) { ++ if (recs->recs[i].pgoff == 0) { ++ recs->recs[i].pgoff = pgoff; ++ recs->recs[i].order = order; ++ recs->recs[i].ns_id = ns_id; ++ recs->used++; ++ break; ++ } ++ } ++ BUG_ON(i == recs->size); ++} ++ ++ ++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) ++{ ++ unsigned long nvmpg_offset = 0; ++ struct bch_nvmpg_head *head; ++ int n, o; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ head = find_nvmpg_head(uuid, true); ++ ++ if (!head) { ++ pr_err("Cannot find bch_nvmpg_recs by uuid.\n"); ++ goto unlock; ++ } ++ ++ for (n = 0; n < global_nvmpg_set->total_ns; n++) { ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[n]; ++ ++ if (!ns || (ns->free < (1L << order))) ++ continue; ++ ++ for (o = order; o < BCH_MAX_ORDER; o++) { ++ struct list_head *list; ++ struct page *page, *buddy_page; ++ ++ if (list_empty(&ns->free_area[o])) ++ continue; ++ ++ list = ns->free_area[o].next; ++ page = container_of((void *)list, struct page, ++ zone_device_data); ++ ++ list_del(list); ++ ++ while (o != order) { ++ void *addr; ++ pgoff_t pgoff; ++ ++ pgoff = page->index + (1L << (o - 1)); ++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++ buddy_page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(buddy_page, o - 1); ++ buddy_page->index = pgoff; ++ __SetPageBuddy(buddy_page); ++ list_add((struct list_head *)&buddy_page->zone_device_data, ++ &ns->free_area[o - 1]); ++ o--; ++ } ++ ++ set_page_private(page, order); ++ __ClearPageBuddy(page); ++ ns->free -= 1L << order; ++ nvmpg_offset = bch_nvmpg_pgoff_to_offset(ns, page->index); ++ break; ++ } ++ ++ if (o < BCH_MAX_ORDER) { ++ struct bch_nvmpg_recs *recs; ++ ++ recs = find_nvmpg_recs(ns, head, true); ++ /* ToDo: handle pgalloc_recs==NULL */ ++ add_nvmpg_rec(ns, recs, nvmpg_offset, order); ++ break; ++ } ++ } ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++ return nvmpg_offset; ++} ++EXPORT_SYMBOL_GPL(bch_nvmpg_alloc_pages); ++ + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_sb *sb = ns->sb; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 55778d4db7da..d03f3241b45a 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -76,6 +76,9 @@ struct bch_nvmpg_set { + /* Indicate which field in bch_nvmpg_sb to be updated */ + #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ + ++#define BCH_PGOFF_TO_KVADDR(pgoff) \ ++ ((void *)((unsigned long)(pgoff) << PAGE_SHIFT)) ++ + #define BCH_MAX_PGALLOC_RECS \ + (min_t(unsigned 
int, 64, \ + (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ +@@ -89,6 +92,7 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); + int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); ++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + + #else + +@@ -104,6 +108,11 @@ static inline int bch_nvmpg_init(void) + + static inline void bch_nvmpg_exit(void) { } + ++static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) ++{ ++ return 0; ++} ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch b/for-next/nvmpg-bcache-btree/draft/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch new file mode 100644 index 0000000..0f8454f --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch @@ -0,0 +1,252 @@ +From f0165caac63639c6bbc9bfa2182500ecebdb6bf9 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 19:06:35 +0800 +Subject: [PATCH 05/13] bcache: bch_nvmpg_free_pages() of the buddy allocator + +This patch implements the bch_nvmpg_free_pages() of the buddy allocator. + +The difference between this and page-buddy-free: +it need owner_uuid to free owner allocated pages, and must +persistent after free. + +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 165 ++++++++++++++++++++++++++++++++++++-- + drivers/md/bcache/nvmpg.h | 3 + + 2 files changed, 161 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 8c0e827a98cd..7b86f08c219a 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -248,6 +248,57 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + return rc; + } + ++static void __free_space(struct bch_nvmpg_ns *ns, unsigned long nvmpg_offset, ++ int order) ++{ ++ unsigned long add_pages = (1L << order); ++ pgoff_t pgoff; ++ struct page *page; ++ void *va; ++ ++ if (nvmpg_offset == 0) { ++ pr_err("free pages on offset 0\n"); ++ return; ++ } ++ ++ page = bch_nvmpg_va_to_pg(bch_nvmpg_offset_to_ptr(nvmpg_offset)); ++ WARN_ON((!page) || (page->private != order)); ++ pgoff = page->index; ++ ++ while (order < BCH_MAX_ORDER - 1) { ++ struct page *buddy_page; ++ ++ pgoff_t buddy_pgoff = pgoff ^ (1L << order); ++ pgoff_t parent_pgoff = pgoff & ~(1L << order); ++ ++ if ((parent_pgoff + (1L << (order + 1)) > ns->pages_total)) ++ break; ++ ++ va = bch_nvmpg_pgoff_to_ptr(ns, buddy_pgoff); ++ buddy_page = bch_nvmpg_va_to_pg(va); ++ WARN_ON(!buddy_page); ++ ++ if (PageBuddy(buddy_page) && (buddy_page->private == order)) { ++ list_del((struct list_head *)&buddy_page->zone_device_data); ++ __ClearPageBuddy(buddy_page); ++ pgoff = parent_pgoff; ++ order++; ++ continue; ++ } ++ break; ++ } ++ ++ va = bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++ page = bch_nvmpg_va_to_pg(va); ++ WARN_ON(!page); ++ list_add((struct list_head *)&page->zone_device_data, ++ &ns->free_area[order]); ++ page->index = pgoff; ++ set_page_private(page, order); ++ __SetPageBuddy(page); ++ ns->free 
+= add_pages; ++} ++ + static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + { + unsigned int start, end, pages; +@@ -261,21 +312,19 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + pages = end - start; + + while (pages) { +- void *addr; +- + for (i = BCH_MAX_ORDER - 1; i >= 0; i--) { + if ((pgoff_start % (1L << i) == 0) && + (pages >= (1L << i))) + break; + } + +- addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start); +- page = bch_nvmpg_va_to_pg(addr); ++ page = bch_nvmpg_va_to_pg( ++ bch_nvmpg_pgoff_to_ptr(ns, pgoff_start)); + set_page_private(page, i); + page->index = pgoff_start; +- __SetPageBuddy(page); +- list_add((struct list_head *)&page->zone_device_data, +- &ns->free_area[i]); ++ ++ /* In order to update ns->free */ ++ __free_space(ns, pgoff_start, i); + + pgoff_start += 1L << i; + pages -= 1L << i; +@@ -491,6 +540,107 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + } + EXPORT_SYMBOL_GPL(bch_nvmpg_alloc_pages); + ++static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns) ++{ ++ return ns->base_addr + (ns->pages_total << PAGE_SHIFT); ++} ++ ++static inline bool in_nvmpg_ns_range(struct bch_nvmpg_ns *ns, ++ void *start_addr, void *end_addr) ++{ ++ return (start_addr >= ns->base_addr) && (end_addr < nvm_end_addr(ns)); ++} ++ ++static int remove_nvmpg_rec(struct bch_nvmpg_recs *recs, int ns_id, ++ unsigned long nvmpg_offset, int order) ++{ ++ struct bch_nvmpg_head *head; ++ struct bch_nvmpg_recs *prev_recs, *sys_recs; ++ struct bch_nvmpg_ns *ns; ++ unsigned long pgoff; ++ int i; ++ ++ ns = global_nvmpg_set->ns_tbl[0]; ++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset); ++ ++ head = bch_nvmpg_offset_to_ptr(recs->head_offset); ++ prev_recs = recs; ++ sys_recs = bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET); ++ while (recs) { ++ for (i = 0; i < recs->size; i++) { ++ struct bch_nvmpg_rec *rec = &(recs->recs[i]); ++ ++ if ((rec->pgoff == pgoff) && (rec->ns_id == ns_id)) { ++ WARN_ON(rec->order != order); ++ rec->_v = 0; ++ recs->used--; ++ ++ if (recs->used == 0) { ++ int recs_pos = recs - sys_recs; ++ ++ if (recs == prev_recs) ++ head->recs_offset[ns_id] = ++ recs->next_offset; ++ else ++ prev_recs->next_offset = ++ recs->next_offset; ++ ++ recs->next_offset = 0; ++ recs->head_offset = 0; ++ ++ bitmap_clear(ns->recs_bitmap, recs_pos, 1); ++ } ++ goto out; ++ } ++ } ++ prev_recs = recs; ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++out: ++ return (recs ? 
0 : -ENOENT); ++} ++ ++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, ++ const char *uuid) ++{ ++ struct bch_nvmpg_ns *ns; ++ struct bch_nvmpg_head *head; ++ struct bch_nvmpg_recs *recs; ++ int r; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ ++ ns = global_nvmpg_set->ns_tbl[BCH_NVMPG_GET_NS_ID(nvmpg_offset)]; ++ if (!ns) { ++ pr_err("can't find namespace by given kaddr from namespace\n"); ++ goto unlock; ++ } ++ ++ head = find_nvmpg_head(uuid, false); ++ if (!head) { ++ pr_err("can't found bch_nvmpg_head by uuid\n"); ++ goto unlock; ++ } ++ ++ recs = find_nvmpg_recs(ns, head, false); ++ if (!recs) { ++ pr_err("can't find bch_nvmpg_recs by uuid\n"); ++ goto unlock; ++ } ++ ++ r = remove_nvmpg_rec(recs, ns->sb->this_ns, nvmpg_offset, order); ++ if (r < 0) { ++ pr_err("can't find bch_nvmpg_rec\n"); ++ goto unlock; ++ } ++ ++ __free_space(ns, nvmpg_offset, order); ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++} ++EXPORT_SYMBOL_GPL(bch_nvmpg_free_pages); ++ + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_sb *sb = ns->sb; +@@ -687,6 +837,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + ns->pages_offset = sb->pages_offset; + ns->pages_total = sb->pages_total; + ns->sb = sb; ++ /* increase by __free_space() */ + ns->free = 0; + ns->bdev = bdev; + ns->set = global_nvmpg_set; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index d03f3241b45a..e089936e7f13 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -93,6 +93,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); + int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); ++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + + #else + +@@ -113,6 +114,8 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return 0; + } + ++static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { } ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch b/for-next/nvmpg-bcache-btree/draft/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch new file mode 100644 index 0000000..9195841 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch @@ -0,0 +1,67 @@ +From 10a097e1408174b0fe3f029c37d7d512662a4582 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 21:06:03 +0800 +Subject: [PATCH 06/13] bcache: get recs list head for allocated pages by + specific uuid + +This patch implements bch_get_nvmpg_head() of the buddy allocator +to be used to get recs list head for allocated pages by specific +uuid. Then the requester (owner) can find all previous allocated +nvdimm pages by iterating the recs list. 
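+
+A minimal sketch of such an iteration (walk_owner_recs() below is a
+hypothetical helper, not part of this series; it assumes the caller
+prevents concurrent allocation or free for this owner):
+
+  static void walk_owner_recs(const char *owner_uuid)
+  {
+          struct bch_nvmpg_head *head = bch_get_nvmpg_head(owner_uuid);
+          int i, j;
+
+          if (!head)
+                  return; /* this owner never allocated any pages */
+
+          for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
+                  struct bch_nvmpg_recs *recs =
+                          bch_nvmpg_offset_to_ptr(head->recs_offset[i]);
+
+                  while (recs) {
+                          for (j = 0; j < recs->size; j++) {
+                                  struct bch_nvmpg_rec *rec = &recs->recs[j];
+
+                                  if (rec->pgoff == 0)
+                                          continue; /* unused slot */
+                                  /* rec->ns_id, rec->pgoff and rec->order
+                                   * describe one persistent allocation */
+                          }
+                          recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
+                  }
+          }
+  }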
+ +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 6 ++++++ + drivers/md/bcache/nvmpg.h | 6 ++++++ + 2 files changed, 12 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 7b86f08c219a..e4642e591f23 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -540,6 +540,12 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + } + EXPORT_SYMBOL_GPL(bch_nvmpg_alloc_pages); + ++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) ++{ ++ return find_nvmpg_head(uuid, false); ++} ++EXPORT_SYMBOL_GPL(bch_get_nvmpg_head); ++ + static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns) + { + return ns->base_addr + (ns->pages_total << PAGE_SHIFT); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index e089936e7f13..2361cabf18be 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -94,6 +94,7 @@ int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); ++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); + + #else + +@@ -116,6 +117,11 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + + static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { } + ++static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) ++{ ++ return NULL; ++} ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch b/for-next/nvmpg-bcache-btree/draft/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch new file mode 100644 index 0000000..f240531 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch @@ -0,0 +1,48 @@ +From 1faf072bef28470d4d90e6ec5c42981b4b881ec0 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 25 Jun 2021 00:17:02 +0800 +Subject: [PATCH 07/13] bcache: use bucket index to set GC_MARK_METADATA for + journal buckets in bch_btree_gc_finish() + +Currently the meta data bucket locations on cache device are reserved +after the meta data stored on NVDIMM pages, to keep the meta data layout +consistent for now. So these buckets are still marked as meta data +by SET_GC_MARK() in bch_btree_gc_finish(). + +When BCH_FEATURE_INCOMPAT_NVDIMM_META is set, sb.d[] stores linear +addresses of NVDIMM pages and no longer bucket indexes. Therefore we +should avoid looking up bucket indexes from sb.d[], and directly use the +bucket indexes from ca->sb.first_bucket to (ca->sb.first_bucket + +ca->sb.njournal_buckets) for setting the gc mark of the journal buckets.
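+
+As a sketch, the change in bch_btree_gc_finish() amounts to the following
+(both loops are taken from the hunk in this patch; the old one is only
+valid while sb.d[] holds bucket indexes):
+
+  /* old: sb.d[k] was a bucket index, so this used to work */
+  for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
+          SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
+
+  /* new: mark the fixed journal bucket index range directly */
+  for (i = ca->sb.first_bucket;
+       i < ca->sb.first_bucket + ca->sb.njournal_buckets; i++)
+          SET_GC_MARK(ca->buckets + i, GC_MARK_METADATA);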
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/btree.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 93b67b8d31c3..f7f844c321c3 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -1761,8 +1761,10 @@ static void bch_btree_gc_finish(struct cache_set *c) + ca = c->cache; + ca->invalidate_needs_gc = 0; + +- for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) +- SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); ++ /* Range [first_bucket, first_bucket + keys) is for journal buckets */ ++ for (i = ca->sb.first_bucket; ++ i < ca->sb.first_bucket + ca->sb.njournal_buckets; i++) ++ SET_GC_MARK(ca->buckets + i, GC_MARK_METADATA); + + for (k = ca->prio_buckets; + k < ca->prio_buckets + prio_buckets(ca) * 2; k++) +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch b/for-next/nvmpg-bcache-btree/draft/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch new file mode 100644 index 0000000..794e12a --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch @@ -0,0 +1,60 @@ +From 497259154b1f79bfdaf967b21109521b301af534 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 25 Jun 2021 00:18:31 +0800 +Subject: [PATCH 08/13] bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into + incompat feature set + +This patch adds BCH_FEATURE_INCOMPAT_NVDIMM_META (value 0x0004) into the +incompat feature set. When this bit is set by bcache-tools, it indicates +bcache meta data should be stored on specific NVDIMM meta device. + +The bcache meta data mainly includes journal and btree nodes, when this +bit is set in incompat feature set, bcache will ask the nvm-pages +allocator for NVDIMM space to store the meta data. 
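+
+Later patches gate their NVDIMM code paths on this bit through the helper
+generated by BCH_FEATURE_INCOMPAT_FUNCS(); a rough usage sketch:
+
+  if (bch_has_feature_nvdimm_meta(&ca->sb)) {
+          /* meta data (journal, btree nodes) lives on NVDIMM pages
+           * obtained from the nvm-pages allocator */
+  } else {
+          /* legacy path: meta data stays on the cache block device */
+  }
+
+When the kernel is built without CONFIG_BCACHE_NVM_PAGES, the bit is not
+part of BCH_FEATURE_INCOMPAT_SUPP, so a cache device formatted with it is
+rejected in read_super() as carrying an unknown incompat feature.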
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/features.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h +index 09161b89c63e..fab92678be76 100644 +--- a/drivers/md/bcache/features.h ++++ b/drivers/md/bcache/features.h +@@ -18,11 +18,19 @@ + #define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET 0x0001 + /* real bucket size is (1 << bucket_size) */ + #define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE 0x0002 ++/* store bcache meta data on nvdimm */ ++#define BCH_FEATURE_INCOMPAT_NVDIMM_META 0x0004 + + #define BCH_FEATURE_COMPAT_SUPP 0 + #define BCH_FEATURE_RO_COMPAT_SUPP 0 ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++#define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ ++ BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE| \ ++ BCH_FEATURE_INCOMPAT_NVDIMM_META) ++#else + #define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ + BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE) ++#endif + + #define BCH_HAS_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_compat & (mask)) +@@ -90,6 +98,7 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \ + + BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET); + BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE); ++BCH_FEATURE_INCOMPAT_FUNCS(nvdimm_meta, NVDIMM_META); + + static inline bool bch_has_unknown_compat_features(struct cache_sb *sb) + { +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch b/for-next/nvmpg-bcache-btree/draft/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch new file mode 100644 index 0000000..c8020e4 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch @@ -0,0 +1,255 @@ +From a0220c3b0138d021975ef1d5e29e07217626ff9e Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 21 Oct 2021 21:39:18 +0800 +Subject: [PATCH 09/13] bcache: initialize bcache journal for NVDIMM meta + device + +The nvm-pages allocator may store and index the NVDIMM pages allocated +for bcache journal. This patch adds the initialization to store bcache +journal space on NVDIMM pages if BCH_FEATURE_INCOMPAT_NVDIMM_META bit is +set by bcache-tools. + +If BCH_FEATURE_INCOMPAT_NVDIMM_META is set, get_nvdimm_journal_space() +will return the nvmpg_offset of NVDIMM pages for bcache journal, +- If there is previously allocated space, find it from nvm-pages owner + list and return to bch_journal_init(). +- If there is no previously allocated space, require a new NVDIMM range + from the nvm-pages allocator, and return it to bch_journal_init(). + +And in bch_journal_init(), keys in sb.d[] store the corresponding nvmpg +offset from NVDIMM into sb.d[i].ptr[0] where 'i' is the bucket index to +iterate all journal buckets. + +Later when bcache journaling code stores the journaling jset, the target +NVDIMM nvmpg offset stored (and updated) in sb.d[i].ptr[0] can be used +to calculate the linear address in memory copy from DRAM pages into +NVDIMM pages. 
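+
+A compact sketch of the resulting bookkeeping (simplified from
+__bch_journal_nvdimm_init() and the write path in the next patch; jnl_base
+is the nvmpg offset returned by bch_nvmpg_alloc_pages() for the whole
+journal area, and error handling is omitted):
+
+  /* one contiguous NVDIMM allocation backs all journal buckets */
+  for (i = 0; i < ca->sb.keys; i++)
+          ca->sb.d[i] = jnl_base + bucket_bytes(ca) * i;
+
+  /* a jset is later stored by deriving the linear address from the
+   * nvmpg offset and doing a flushing memcpy instead of a bio */
+  memcpy_flushcache(bch_nvmpg_offset_to_ptr(ca->sb.d[ja->cur_idx]),
+                    w->data, sectors << 9);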
+ +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 113 ++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/journal.h | 2 +- + drivers/md/bcache/nvmpg.c | 9 +++ + drivers/md/bcache/nvmpg.h | 1 + + drivers/md/bcache/super.c | 18 +++--- + 5 files changed, 132 insertions(+), 11 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 61bd79babf7a..d887557c718e 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -9,6 +9,8 @@ + #include "btree.h" + #include "debug.h" + #include "extents.h" ++#include "nvmpg.h" ++#include "features.h" + + #include <trace/events/bcache.h> + +@@ -982,3 +984,114 @@ int bch_journal_alloc(struct cache_set *c) + + return 0; + } ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static unsigned long find_journal_nvmpg_base(struct bch_nvmpg_head *nvmpg_head, ++ struct cache *ca) ++{ ++ unsigned long jnl_offset, jnl_pgoff, jnl_ns_id; ++ unsigned long ret_offset = 0; ++ int i; ++ ++ jnl_offset = (unsigned long)ca->sb.d[0]; ++ jnl_ns_id = BCH_NVMPG_GET_NS_ID(jnl_offset); ++ jnl_pgoff = BCH_NVMPG_GET_OFFSET(jnl_offset) >> PAGE_SHIFT; ++ ++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { ++ struct bch_nvmpg_recs *recs; ++ struct bch_nvmpg_rec *rec; ++ unsigned long recs_offset = 0; ++ int j; ++ ++ recs_offset = nvmpg_head->recs_offset[i]; ++ recs = bch_nvmpg_offset_to_ptr(recs_offset); ++ while (recs) { ++ for (j = 0; j < recs->size; j++) { ++ rec = &recs->recs[j]; ++ if ((rec->pgoff != jnl_pgoff) || ++ (rec->ns_id != jnl_ns_id)) ++ continue; ++ ++ ret_offset = jnl_offset; ++ goto out; ++ } ++ recs_offset = recs->next_offset; ++ recs = bch_nvmpg_offset_to_ptr(recs_offset); ++ } ++ } ++ ++out: ++ return ret_offset; ++} ++ ++static unsigned long get_journal_nvmpg_space(struct cache *ca) ++{ ++ struct bch_nvmpg_head *head = NULL; ++ unsigned long nvmpg_offset; ++ int order; ++ ++ head = bch_get_nvmpg_head(ca->sb.set_uuid); ++ if (head) { ++ nvmpg_offset = find_journal_nvmpg_base(head, ca); ++ if (nvmpg_offset) ++ goto found; ++ } ++ ++ order = ilog2((ca->sb.bucket_size * ++ ca->sb.njournal_buckets) / PAGE_SECTORS); ++ nvmpg_offset = bch_nvmpg_alloc_pages(order, ca->sb.set_uuid); ++ if (nvmpg_offset) ++ memset(bch_nvmpg_offset_to_ptr(nvmpg_offset), ++ 0, (1 << order) * PAGE_SIZE); ++found: ++ return nvmpg_offset; ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++static int __bch_journal_nvdimm_init(struct cache *ca) ++{ ++ int ret = -1; ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ int i; ++ unsigned long jnl_base = 0; ++ ++ jnl_base = get_journal_nvmpg_space(ca); ++ if (!jnl_base) { ++ pr_err("Failed to get journal space from nvdimm\n"); ++ goto out; ++ } ++ ++ /* Iniialized and reloaded from on-disk super block already */ ++ if (ca->sb.d[0] != 0) ++ goto out; ++ ++ for (i = 0; i < ca->sb.keys; i++) ++ ca->sb.d[i] = jnl_base + (bucket_bytes(ca) * i); ++ ++ ret = 0; ++out: ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++ return ret; ++} ++ ++ ++int bch_journal_init(struct cache_set *c) ++{ ++ int i, ret = 0; ++ struct cache *ca = c->cache; ++ ++ ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, ++ 2, SB_JOURNAL_BUCKETS); ++ ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) { ++ for (i = 0; i < ca->sb.keys; i++) ++ ca->sb.d[i] = ca->sb.first_bucket + i; ++ } else ++ ret = 
__bch_journal_nvdimm_init(ca); ++ ++ return ret; ++} +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index f2ea34d5f431..e3a7fa5a8fda 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -179,7 +179,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list); + void bch_journal_meta(struct cache_set *c, struct closure *cl); + int bch_journal_read(struct cache_set *c, struct list_head *list); + int bch_journal_replay(struct cache_set *c, struct list_head *list); +- ++int bch_journal_init(struct cache_set *c); + void bch_journal_free(struct cache_set *c); + int bch_journal_alloc(struct cache_set *c); + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index e4642e591f23..142ad41e9c15 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -24,6 +24,15 @@ + + struct bch_nvmpg_set *global_nvmpg_set; + ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id) ++{ ++ if ((ns_id >= 0) && (ns_id < BCH_NVMPG_NS_MAX)) ++ return global_nvmpg_set->ns_tbl[ns_id]; ++ ++ pr_emerg("Invalid ns_id: %d\n", ns_id); ++ return NULL; ++} ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset) + { + int ns_id = BCH_NVMPG_GET_NS_ID(offset); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 2361cabf18be..f7b7177cced3 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -95,6 +95,7 @@ void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id); + + #else + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 841d08e50191..990d5d6fe199 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -147,9 +147,11 @@ static const char *read_super_common(struct cache_sb *sb, struct block_device * + goto err; + + err = "Journal buckets not sequential"; +- for (i = 0; i < sb->keys; i++) +- if (sb->d[i] != sb->first_bucket + i) +- goto err; ++ if (!bch_has_feature_nvdimm_meta(sb)) { ++ for (i = 0; i < sb->keys; i++) ++ if (sb->d[i] != sb->first_bucket + i) ++ goto err; ++ } + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) +@@ -2061,14 +2063,10 @@ static int run_cache_set(struct cache_set *c) + if (bch_journal_replay(c, &journal)) + goto err; + } else { +- unsigned int j; +- + pr_notice("invalidating existing data\n"); +- ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, +- 2, SB_JOURNAL_BUCKETS); +- +- for (j = 0; j < ca->sb.keys; j++) +- ca->sb.d[j] = ca->sb.first_bucket + j; ++ err = "error initializing journal"; ++ if (bch_journal_init(c)) ++ goto err; + + bch_initial_gc_finish(c); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch b/for-next/nvmpg-bcache-btree/draft/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch new file mode 100644 index 0000000..6e105c6 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch @@ -0,0 +1,231 @@ +From a86e90383059c6d2a6972931127180b1fa174fbb Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:45:23 +0800 +Subject: [PATCH 10/13] bcache: support storing bcache journal into NVDIMM meta + device + +This patch implements two methods to store bcache journal 
to, +1) __journal_write_unlocked() for block interface device + The latency method to compose bio and issue the jset bio to cache + device (e.g. SSD). c->journal.key.ptr[0] indicates the LBA on cache + device to store the journal jset. +2) __journal_nvdimm_write_unlocked() for memory interface NVDIMM + Use memory interface to access NVDIMM pages and store the jset by + memcpy_flushcache(). c->journal.key.ptr[0] indicates the linear + address from the NVDIMM pages to store the journal jset. + +For legacy configuration without NVDIMM meta device, journal I/O is +handled by __journal_write_unlocked() with existing code logic. If the +NVDIMM meta device is used (by bcache-tools), the journal I/O will +be handled by __journal_nvdimm_write_unlocked() and go into the NVDIMM +pages. + +And when NVDIMM meta device is used, sb.d[] stores the linear addresses +from NVDIMM pages (no more bucket index), in journal_reclaim() the +journaling location in c->journal.key.ptr[0] should also be updated by +linear address from NVDIMM pages (no more LBA combined by sectors offset +and bucket index). + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 120 +++++++++++++++++++++++++----------- + drivers/md/bcache/super.c | 3 +- + 2 files changed, 85 insertions(+), 38 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index d887557c718e..7d5c5ed18890 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -596,6 +596,8 @@ static void do_journal_discard(struct cache *ca) + return; + } + ++ BUG_ON(bch_has_feature_nvdimm_meta(&ca->sb)); ++ + switch (atomic_read(&ja->discard_in_flight)) { + case DISCARD_IN_FLIGHT: + return; +@@ -661,9 +663,16 @@ static void journal_reclaim(struct cache_set *c) + goto out; + + ja->cur_idx = next; +- k->ptr[0] = MAKE_PTR(0, +- bucket_to_sector(c, ca->sb.d[ja->cur_idx]), +- ca->sb.nr_this_dev); ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ k->ptr[0] = MAKE_PTR(0, ++ bucket_to_sector(c, ca->sb.d[ja->cur_idx]), ++ ca->sb.nr_this_dev); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ k->ptr[0] = (unsigned long)bch_nvmpg_offset_to_ptr( ++ ca->sb.d[ja->cur_idx]); ++#endif ++ + atomic_long_inc(&c->reclaimed_journal_buckets); + + bkey_init(k); +@@ -729,46 +738,21 @@ static void journal_write_unlock(struct closure *cl) + spin_unlock(&c->journal.lock); + } + +-static void journal_write_unlocked(struct closure *cl) ++ ++static void __journal_write_unlocked(struct cache_set *c) + __releases(c->journal.lock) + { +- struct cache_set *c = container_of(cl, struct cache_set, journal.io); +- struct cache *ca = c->cache; +- struct journal_write *w = c->journal.cur; + struct bkey *k = &c->journal.key; +- unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * +- ca->sb.block_size; +- ++ struct journal_write *w = c->journal.cur; ++ struct closure *cl = &c->journal.io; ++ struct cache *ca = c->cache; + struct bio *bio; + struct bio_list list; ++ unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * ++ ca->sb.block_size; + + bio_list_init(&list); + +- if (!w->need_write) { +- closure_return_with_destructor(cl, journal_write_unlock); +- return; +- } else if (journal_full(&c->journal)) { +- journal_reclaim(c); +- spin_unlock(&c->journal.lock); +- +- 
btree_flush_write(c); +- continue_at(cl, journal_write, bch_journal_wq); +- return; +- } +- +- c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); +- +- w->data->btree_level = c->root->level; +- +- bkey_copy(&w->data->btree_root, &c->root->key); +- bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); +- +- w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; +- w->data->magic = jset_magic(&ca->sb); +- w->data->version = BCACHE_JSET_VERSION; +- w->data->last_seq = last_seq(&c->journal); +- w->data->csum = csum_set(w->data); +- + for (i = 0; i < KEY_PTRS(k); i++) { + ca = c->cache; + bio = &ca->journal.bio; +@@ -793,7 +777,6 @@ static void journal_write_unlocked(struct closure *cl) + + ca->journal.seq[ca->journal.cur_idx] = w->data->seq; + } +- + /* If KEY_PTRS(k) == 0, this jset gets lost in air */ + BUG_ON(i == 0); + +@@ -805,6 +788,71 @@ static void journal_write_unlocked(struct closure *cl) + + while ((bio = bio_list_pop(&list))) + closure_bio_submit(c, bio, cl); ++} ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static void __journal_nvdimm_write_unlocked(struct cache_set *c) ++ __releases(c->journal.lock) ++{ ++ struct journal_write *w = c->journal.cur; ++ struct cache *ca = c->cache; ++ unsigned int sectors; ++ ++ sectors = set_blocks(w->data, block_bytes(ca)) * ca->sb.block_size; ++ atomic_long_add(sectors, &ca->meta_sectors_written); ++ ++ memcpy_flushcache((void *)c->journal.key.ptr[0], w->data, sectors << 9); ++ ++ c->journal.key.ptr[0] += sectors << 9; ++ ca->journal.seq[ca->journal.cur_idx] = w->data->seq; ++ ++ atomic_dec_bug(&fifo_back(&c->journal.pin)); ++ bch_journal_next(&c->journal); ++ journal_reclaim(c); ++ ++ spin_unlock(&c->journal.lock); ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++static void journal_write_unlocked(struct closure *cl) ++{ ++ struct cache_set *c = container_of(cl, struct cache_set, journal.io); ++ struct cache *ca = c->cache; ++ struct journal_write *w = c->journal.cur; ++ ++ if (!w->need_write) { ++ closure_return_with_destructor(cl, journal_write_unlock); ++ return; ++ } else if (journal_full(&c->journal)) { ++ journal_reclaim(c); ++ spin_unlock(&c->journal.lock); ++ ++ btree_flush_write(c); ++ continue_at(cl, journal_write, bch_journal_wq); ++ return; ++ } ++ ++ c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); ++ ++ w->data->btree_level = c->root->level; ++ ++ bkey_copy(&w->data->btree_root, &c->root->key); ++ bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); ++ ++ w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; ++ w->data->magic = jset_magic(&ca->sb); ++ w->data->version = BCACHE_JSET_VERSION; ++ w->data->last_seq = last_seq(&c->journal); ++ w->data->csum = csum_set(w->data); ++ ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ __journal_write_unlocked(c); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ __journal_nvdimm_write_unlocked(c); ++#endif + + continue_at(cl, journal_write_done, NULL); + } +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 990d5d6fe199..42fd99406c60 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1672,7 +1672,7 @@ void bch_cache_set_release(struct kobject *kobj) + static void cache_set_free(struct closure *cl) + { + struct cache_set *c = container_of(cl, struct cache_set, cl); +- struct cache *ca; ++ struct cache *ca = c->cache; + + debugfs_remove(c->debug); + +@@ -1684,7 +1684,6 @@ static void cache_set_free(struct closure *cl) + bch_bset_sort_state_free(&c->sort); + free_pages((unsigned long) c->uuids, 
ilog2(meta_bucket_pages(&c->cache->sb))); + +- ca = c->cache; + if (ca) { + ca->set = NULL; + c->cache = NULL; +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch b/for-next/nvmpg-bcache-btree/draft/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch new file mode 100644 index 0000000..49ed5be --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch @@ -0,0 +1,181 @@ +From 29b95828f4804806bac44a795cba09ddc0cc0da0 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:54:12 +0800 +Subject: [PATCH 11/13] bcache: read jset from NVDIMM pages for journal replay + +This patch implements two methods to read jset from media for journal +replay, +- __jnl_rd_bkt() for block device + This is the legacy method to read jset via block device interface. +- __jnl_rd_nvm_bkt() for NVDIMM + This is the method to read jset from NVDIMM memory interface, a.k.a + memcopy() from NVDIMM pages to DRAM pages. + +If BCH_FEATURE_INCOMPAT_NVDIMM_META is set in incompat feature set, +during running cache set, journal_read_bucket() will read the journal +content from NVDIMM by __jnl_rd_nvm_bkt(). The linear addresses of +NVDIMM pages to read jset are stored in sb.d[SB_JOURNAL_BUCKETS], which +were initialized and maintained in previous runs of the cache set. + +A thing should be noticed is, when bch_journal_read() is called, the +linear address of NVDIMM pages is not loaded and initialized yet, it +is necessary to call __bch_journal_nvdimm_init() before reading the jset +from NVDIMM pages. + +The code comments added in journal_read_bucket() is noticed by kernel +test robot and Dan Carpenter, it explains why it is safe to only check +!bch_has_feature_nvdimm_meta() condition in the if() statement when +CONFIG_BCACHE_NVM_PAGES is not configured. To avoid confusion from the +bogus warning message from static checking tool. 
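+
+As a sketch, the per-bucket read in journal_read_bucket() now picks the
+method by the feature bit (simplified from the hunk below):
+
+  if (!bch_has_feature_nvdimm_meta(&ca->sb))
+          /* legacy: read the jset with a bio from the cache device */
+          j = __jnl_rd_bkt(ca, bucket_index, len, offset, &cl);
+  else
+          /* NVDIMM: sb.d[bucket_index] is an nvmpg offset; memcpy()
+           * the jset out of the mapped pages into journal.w[0].data */
+          j = __jnl_rd_nvm_bkt(ca, bucket_index, len, offset);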
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reported-by: kernel test robot <lkp@intel.com> +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 88 ++++++++++++++++++++++++++++++------- + 1 file changed, 71 insertions(+), 17 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 7d5c5ed18890..902992be9191 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -34,18 +34,60 @@ static void journal_read_endio(struct bio *bio) + closure_put(cl); + } + ++static struct jset *__jnl_rd_bkt(struct cache *ca, unsigned int bkt_idx, ++ unsigned int len, unsigned int offset, ++ struct closure *cl) ++{ ++ sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bkt_idx]); ++ struct bio *bio = &ca->journal.bio; ++ struct jset *data = ca->set->journal.w[0].data; ++ ++ bio_reset(bio); ++ bio->bi_iter.bi_sector = bucket + offset; ++ bio_set_dev(bio, ca->bdev); ++ bio->bi_iter.bi_size = len << 9; ++ ++ bio->bi_end_io = journal_read_endio; ++ bio->bi_private = cl; ++ bio_set_op_attrs(bio, REQ_OP_READ, 0); ++ bch_bio_map(bio, data); ++ ++ closure_bio_submit(ca->set, bio, cl); ++ closure_sync(cl); ++ ++ /* Indeed journal.w[0].data */ ++ return data; ++} ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static struct jset *__jnl_rd_nvm_bkt(struct cache *ca, unsigned int bkt_idx, ++ unsigned int len, unsigned int offset) ++{ ++ void *jset_addr; ++ struct jset *data; ++ ++ jset_addr = bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9); ++ data = ca->set->journal.w[0].data; ++ ++ memcpy(data, jset_addr, len << 9); ++ ++ /* Indeed journal.w[0].data */ ++ return data; ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ + static int journal_read_bucket(struct cache *ca, struct list_head *list, + unsigned int bucket_index) + { + struct journal_device *ja = &ca->journal; +- struct bio *bio = &ja->bio; + + struct journal_replay *i; +- struct jset *j, *data = ca->set->journal.w[0].data; ++ struct jset *j; + struct closure cl; + unsigned int len, left, offset = 0; + int ret = 0; +- sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); + + closure_init_stack(&cl); + +@@ -55,26 +97,27 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, + reread: left = ca->sb.bucket_size - offset; + len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); + +- bio_reset(bio); +- bio->bi_iter.bi_sector = bucket + offset; +- bio_set_dev(bio, ca->bdev); +- bio->bi_iter.bi_size = len << 9; +- +- bio->bi_end_io = journal_read_endio; +- bio->bi_private = &cl; +- bio_set_op_attrs(bio, REQ_OP_READ, 0); +- bch_bio_map(bio, data); +- +- closure_bio_submit(ca->set, bio, &cl); +- closure_sync(&cl); ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ j = __jnl_rd_bkt(ca, bucket_index, len, offset, &cl); ++ /* ++ * If CONFIG_BCACHE_NVM_PAGES is not defined, the feature bit ++ * BCH_FEATURE_INCOMPAT_NVDIMM_META won't in incompatible ++ * support feature set, a cache device format with feature bit ++ * BCH_FEATURE_INCOMPAT_NVDIMM_META will fail much earlier in ++ * read_super() by bch_has_unknown_incompat_features(). ++ * Therefore when CONFIG_BCACHE_NVM_PAGES is not define, it is ++ * safe to ignore the bch_has_feature_nvdimm_meta() condition. 
++ */ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ j = __jnl_rd_nvm_bkt(ca, bucket_index, len, offset); ++#endif + + /* This function could be simpler now since we no longer write + * journal entries that overlap bucket boundaries; this means + * the start of a bucket will always have a valid journal entry + * if it has any journal entries at all. + */ +- +- j = data; + while (len) { + struct list_head *where; + size_t blocks, bytes = set_bytes(j); +@@ -170,6 +213,8 @@ reread: left = ca->sb.bucket_size - offset; + return ret; + } + ++static int __bch_journal_nvdimm_init(struct cache *ca); ++ + int bch_journal_read(struct cache_set *c, struct list_head *list) + { + #define read_bucket(b) \ +@@ -188,6 +233,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) + unsigned int i, l, r, m; + uint64_t seq; + ++ /* ++ * Linear addresses of NVDIMM pages for journaling is not ++ * initialized yet, do it before read jset from NVDIMM pages. ++ */ ++ if (bch_has_feature_nvdimm_meta(&ca->sb)) { ++ if (__bch_journal_nvdimm_init(ca) < 0) ++ return -ENXIO; ++ } ++ + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch b/for-next/nvmpg-bcache-btree/draft/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch new file mode 100644 index 0000000..e35c696 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch @@ -0,0 +1,84 @@ +From 286f425617ba71c2ff30930d010e0808dc41d953 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:55:25 +0800 +Subject: [PATCH 12/13] bcache: add sysfs interface register_nvdimm_meta to + register NVDIMM meta device + +This patch adds a sysfs interface register_nvdimm_meta to register +NVDIMM meta device. The sysfs interface file only shows up when +CONFIG_BCACHE_NVM_PAGES=y. 
Then a NVDIMM name space formatted by +bcache-tools can be registered into bcache by e.g., + echo /dev/pmem0 > /sys/fs/bcache/register_nvdimm_meta + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/super.c | 29 +++++++++++++++++++++++++++++ + 1 file changed, 29 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 42fd99406c60..723791250070 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2398,10 +2398,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, size_t size); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++static ssize_t register_nvdimm_meta(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, size_t size); ++#endif + + kobj_attribute_write(register, register_bcache); + kobj_attribute_write(register_quiet, register_bcache); + kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++kobj_attribute_write(register_nvdimm_meta, register_nvdimm_meta); ++#endif + + static bool bch_is_open_backing(dev_t dev) + { +@@ -2515,6 +2523,24 @@ static void register_device_async(struct async_reg_args *args) + queue_delayed_work(system_wq, &args->reg_work, 10); + } + ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++static ssize_t register_nvdimm_meta(struct kobject *k, struct kobj_attribute *attr, ++ const char *buffer, size_t size) ++{ ++ ssize_t ret = size; ++ ++ struct bch_nvmpg_ns *ns = bch_register_namespace(buffer); ++ ++ if (IS_ERR(ns)) { ++ pr_err("register nvdimm namespace %s for meta device failed.\n", ++ buffer); ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++#endif ++ + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size) + { +@@ -2857,6 +2883,9 @@ static int __init bcache_init(void) + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ &ksysfs_register_nvdimm_meta.attr, ++#endif + &ksysfs_pendings_cleanup.attr, + NULL + }; +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0013-store-btree-node-in-nvdimm.patch b/for-next/nvmpg-bcache-btree/draft/0013-store-btree-node-in-nvdimm.patch new file mode 100644 index 0000000..18fdf37 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0013-store-btree-node-in-nvdimm.patch @@ -0,0 +1,489 @@ +From b0344cea65a7c816dbad1d4684a96dca929d8344 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 21 Oct 2021 22:54:20 +0800 +Subject: [PATCH 13/13] store btree node in nvdimm + +--- + drivers/md/bcache/alloc.c | 67 +++++++++++++++--- + drivers/md/bcache/bcache.h | 3 +- + drivers/md/bcache/bcache_ondisk.h | 2 +- + drivers/md/bcache/btree.c | 114 ++++++++++++++++++++++++++++-- + drivers/md/bcache/nvmpg.c | 50 +++++++++++++ + drivers/md/bcache/nvmpg.h | 52 ++++++++++++++ + drivers/md/bcache/super.c | 3 +- + 7 files changed, 273 insertions(+), 18 deletions(-) + +diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c +index 097577ae3c47..9bdd6ee9e886 100644 +--- a/drivers/md/bcache/alloc.c ++++ b/drivers/md/bcache/alloc.c +@@ -63,6 +63,7 @@ + + #include 
"bcache.h" + #include "btree.h" ++#include "nvmpg.h" + + #include <linux/blkdev.h> + #include <linux/kthread.h> +@@ -477,12 +478,28 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b) + } + } + ++void __bch_nvmpg_bucket_free(struct cache_set *c, struct bkey *k) ++{ ++ int order; ++ unsigned long nvmpg_offset; ++ ++ order = ilog2(c->cache->sb.bucket_size / PAGE_SECTORS); ++ nvmpg_offset = bkey_offset_to_nvmpg_offset(PTR_OFFSET(k, 0)); ++ bch_nvmpg_free_pages(nvmpg_offset, order, c->set_uuid); ++} ++ + void bch_bucket_free(struct cache_set *c, struct bkey *k) + { + unsigned int i; + ++ if (KEY_NVMPG(k)) { ++ __bch_nvmpg_bucket_free(c, k); ++ return; ++ } ++ + for (i = 0; i < KEY_PTRS(k); i++) + __bch_bucket_free(c->cache, PTR_BUCKET(c, k, i)); ++ return; + } + + int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, +@@ -517,15 +534,31 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + return -1; + } + +-int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, +- struct bkey *k, bool wait) ++int __bch_nvmpg_bucket_alloc(struct cache_set *c, struct bkey *k) + { +- int ret; ++ struct cache *ca; ++ unsigned long nvmpg_offset, bkey_offset; ++ int order; + +- mutex_lock(&c->bucket_lock); +- ret = __bch_bucket_alloc_set(c, reserve, k, wait); +- mutex_unlock(&c->bucket_lock); +- return ret; ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) ++ return -1; ++ ++ lockdep_assert_held(&c->bucket_lock); ++ ++ order = ilog2(ca->sb.bucket_size / PAGE_SECTORS); ++ nvmpg_offset = bch_nvmpg_alloc_pages(order, c->set_uuid); ++ if (!nvmpg_offset) ++ goto err; ++ ++ bkey_offset = nvmpg_offset_to_bkey_offset(nvmpg_offset); ++ ++ bkey_init(k); ++ k->ptr[0] = MAKE_PTR(0, bkey_offset, ca->sb.nr_this_dev); ++ ++ SET_KEY_PTRS(k, 1); ++ return 0; ++err: ++ return -1; + } + + /* Sector allocator */ +@@ -537,6 +570,23 @@ struct open_bucket { + BKEY_PADDED(key); + }; + ++int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, ++ struct bkey *k, bool wait, int bucket_type) ++{ ++ int ret; ++ ++ if (bucket_type == BCH_DATA_BUCKET) { ++ mutex_lock(&c->bucket_lock); ++ ret = __bch_bucket_alloc_set(c, reserve, k, wait); ++ mutex_unlock(&c->bucket_lock); ++ } else { ++ ret = __bch_nvmpg_bucket_alloc(c, k); ++ } ++ ++ return ret; ++} ++ ++ + /* + * We keep multiple buckets open for writes, and try to segregate different + * write streams for better cache utilization: first we try to segregate flash +@@ -631,7 +681,8 @@ bool bch_alloc_sectors(struct cache_set *c, + + spin_unlock(&c->data_bucket_lock); + +- if (bch_bucket_alloc_set(c, watermark, &alloc.key, wait)) ++ if (bch_bucket_alloc_set(c, watermark, &alloc.key, ++ wait, BCH_DATA_BUCKET)) + return false; + + spin_lock(&c->data_bucket_lock); +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 9ed9c955add7..d54c3c3d8d7e 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -979,11 +979,12 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait); + int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + struct bkey *k, bool wait); + int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, +- struct bkey *k, bool wait); ++ struct bkey *k, bool wait, int bucket_type); + bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, + unsigned int sectors, unsigned int write_point, + unsigned int write_prio, bool wait); + bool bch_cached_dev_error(struct cached_dev *dc); ++int __bch_nvmpg_bucket_alloc(struct 
cache_set *c, struct bkey *k); + + __printf(2, 3) + bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); +diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h +index 97413586195b..6c890f632197 100644 +--- a/drivers/md/bcache/bcache_ondisk.h ++++ b/drivers/md/bcache/bcache_ondisk.h +@@ -45,7 +45,7 @@ static inline void SET_##name(struct bkey *k, unsigned int i, __u64 v) \ + KEY_FIELD(KEY_PTRS, high, 60, 3) + KEY_FIELD(__PAD0, high, 58, 2) + KEY_FIELD(KEY_CSUM, high, 56, 2) +-KEY_FIELD(__PAD1, high, 55, 1) ++KEY_FIELD(KEY_NVMPG, high, 55, 1) + KEY_FIELD(KEY_DIRTY, high, 36, 1) + + KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS) +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index f7f844c321c3..b8854905b93e 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -25,6 +25,8 @@ + #include "btree.h" + #include "debug.h" + #include "extents.h" ++#include "features.h" ++#include "nvmpg.h" + + #include <linux/slab.h> + #include <linux/bitops.h> +@@ -240,14 +242,12 @@ static void btree_node_read_endio(struct bio *bio) + closure_put(cl); + } + +-static void bch_btree_node_read(struct btree *b) ++static void __bch_btree_node_read(struct btree *b) + { + uint64_t start_time = local_clock(); + struct closure cl; + struct bio *bio; + +- trace_bcache_btree_read(b); +- + closure_init_stack(&cl); + + bio = bch_bbio_alloc(b->c); +@@ -278,6 +278,35 @@ static void bch_btree_node_read(struct btree *b) + PTR_BUCKET_NR(b->c, &b->key, 0)); + } + ++static void __bch_nvmpg_btree_node_read(struct btree *b) ++{ ++ uint64_t start_time = local_clock(); ++ void *ptr; ++ ++ ptr = bkey_offset_to_nvmpg_ptr(PTR_OFFSET(&b->key, 0)); ++ memcpy(b->keys.set[0].data, ptr, KEY_SIZE(&b->key) << 9); ++ ++ if (btree_node_io_error(b)) ++ goto err; ++ ++ bch_btree_node_read_done(b); ++ bch_time_stats_update(&b->c->btree_read_time, start_time); ++ ++err: ++ bch_cache_set_error(b->c, ++ "io error reading NVDIMM pages at 0x%p\n", ptr); ++} ++ ++static void bch_btree_node_read(struct btree *b) ++{ ++ trace_bcache_btree_read(b); ++ ++ if (!KEY_NVMPG(&b->key)) ++ __bch_btree_node_read(b); ++ else ++ __bch_nvmpg_btree_node_read(b); ++} ++ + static void btree_complete_write(struct btree *b, struct btree_write *w) + { + if (w->prio_blocked && +@@ -335,7 +364,7 @@ static void btree_node_write_endio(struct bio *bio) + closure_put(cl); + } + +-static void do_btree_node_write(struct btree *b) ++static void __do_btree_node_write(struct btree *b) + { + struct closure *cl = &b->io; + struct bset *i = btree_bset_last(b); +@@ -400,6 +429,68 @@ static void do_btree_node_write(struct btree *b) + } + } + ++static void btree_nvmpg_complete_write(struct btree *b, struct btree_write *w) ++{ ++ atomic_sub(w->prio_blocked, &b->c->prio_blocked); ++ ++ if (w->journal) { ++ atomic_dec_bug(w->journal); ++ __closure_wake_up(&b->c->journal.wait); ++ } ++ ++ w->prio_blocked = 0; ++ w->journal = NULL; ++} ++ ++static void btree_nvmpg_node_write_done(struct closure *cl) ++{ ++ struct btree *b = container_of(cl, struct btree, io); ++ struct btree_write *w = btree_prev_write(b); ++ ++ btree_nvmpg_complete_write(b, w); ++ ++ if (btree_node_dirty(b)) ++ queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); ++ ++ closure_return_with_destructor(cl, btree_node_write_unlock); ++} ++ ++static void __do_nvmpg_btree_node_write(struct btree *b) ++{ ++ struct closure *cl = &b->io; ++ struct bset *i = btree_bset_last(b); ++ unsigned long nvmpg_offset; ++ void *nvmpg_ptr; ++ ++ i->version = 
BCACHE_BSET_VERSION; ++ i->csum = btree_csum_set(b, i); ++ ++ BUG_ON(b->bio); ++ ++ /* Calculate location to write */ ++ nvmpg_offset = bkey_offset_to_nvmpg_offset(PTR_OFFSET(&b->key, 0)); ++ nvmpg_offset += roundup(set_bytes(i), block_bytes(b->c->cache)); ++ nvmpg_ptr = bch_nvmpg_offset_to_ptr(nvmpg_offset); ++ ++ memcpy_flushcache(nvmpg_ptr, i, ++ roundup(set_bytes(i), block_bytes(b->c->cache)) << 9); ++ ++ /* Update b->key to the wriitten location */ ++ SET_PTR_OFFSET(&b->key, 0, ++ nvmpg_offset_to_bkey_offset(nvmpg_offset)); ++ ++ closure_sync(cl); ++ continue_at_nobarrier(cl, btree_nvmpg_node_write_done, NULL); ++} ++ ++static void do_btree_node_write(struct btree *b) ++{ ++ if (!KEY_NVMPG(&b->key)) ++ __do_btree_node_write(b); ++ else ++ __do_nvmpg_btree_node_write(b); ++} ++ + void __bch_btree_node_write(struct btree *b, struct closure *parent) + { + struct bset *i = btree_bset_last(b); +@@ -1094,10 +1185,19 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + + mutex_lock(&c->bucket_lock); + retry: +- if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) ++ /* ++ * If nvdimm_meta feature is enabled, try to allocate btree ++ * node from NVDIMM pages and set KEY_NVMPG bit successfully. ++ */ ++ if (bch_has_feature_nvdimm_meta(&(c->cache->sb))) ++ __bch_nvmpg_bucket_alloc(c, &k.key); ++ ++ if (!KEY_NVMPG(&k.key) && ++ __bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) + goto err; + +- bkey_put(c, &k.key); ++ if (!KEY_NVMPG(&k.key)) ++ bkey_put(c, &k.key); + SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); + + b = mca_alloc(c, op, &k.key, level); +@@ -1118,7 +1218,7 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + trace_bcache_btree_node_alloc(b); + return b; + err_free: +- bch_bucket_free(c, &k.key); ++ bch_bucket_free(c, &k.key); + err: + mutex_unlock(&c->bucket_lock); + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 142ad41e9c15..12d67e535854 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -91,6 +91,56 @@ static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) + return bch_nvmpg_pgoff_to_ptr(ns, pgoff); + } + ++static void bug_on_bkey_offset_limit(unsigned long sector) ++{ ++ if (sector >= ((1UL << BCH_BKEY_OFFSET_BITS) - 1)) { ++ pr_err("Invalid NVDIMM offset: too large as 0x%lx\n", ++ sector); ++ pr_err("Such condition should never happen. 
Panic.\n"); ++ BUG(); ++ } ++} ++ ++int bkey_offset_to_nvmpg_ns_id(unsigned long bkey_offset) ++{ ++ return (bkey_offset >> BCH_BKEY_OFFSET_BITS) & ++ BCH_BKEY_OFFSET_NS_ID_MASK; ++} ++ ++unsigned long bkey_offset_to_nvmpg_offset(unsigned long bkey_offset) ++{ ++ int ns_id; ++ unsigned long offset; ++ ++ ns_id = (bkey_offset >> BCH_BKEY_OFFSET_BITS) & ++ BCH_BKEY_OFFSET_NS_ID_MASK; ++ ++ offset = (bkey_offset & BCH_BKEY_OFFSET_MASK) << 9; ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ ++unsigned long nvmpg_offset_to_bkey_offset(unsigned long nvmpg_offset) ++{ ++ int ns_id; ++ unsigned long sector; ++ ++ ns_id = BCH_NVMPG_GET_NS_ID(nvmpg_offset); ++ sector = BCH_NVMPG_GET_OFFSET(nvmpg_offset) >> 9; ++ bug_on_bkey_offset_limit(sector); ++ ++ return ((sector & BCH_BKEY_OFFSET_MASK) | ++ ((ns_id & BCH_BKEY_OFFSET_NS_ID_MASK) << BCH_BKEY_OFFSET_BITS)); ++} ++ ++void *bkey_offset_to_nvmpg_ptr(unsigned long bkey_offset) ++{ ++ unsigned long nvmpg_offset; ++ ++ nvmpg_offset = bkey_offset_to_nvmpg_offset(bkey_offset); ++ return bch_nvmpg_offset_to_ptr(nvmpg_offset); ++} ++ + static inline void reserve_nvmpg_pages(struct bch_nvmpg_ns *ns, + pgoff_t pgoff, u64 nr) + { +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index f7b7177cced3..7f6d8e6f9dff 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -84,6 +84,21 @@ struct bch_nvmpg_set { + (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ + sizeof(struct bch_nvmpg_recs))) + ++ ++/* For bkey PTR_OFFSET to nvmpg namespace ID and offset convertion. ++ * ++ * PTR_OFFSET is 43 bits, the most significant 3 bits are for ++ * namespace ID. Rested 40 bits are for per-namespace offset ++ * in sectors. ++ */ ++#define BCH_BKEY_OFFSET_NS_ID_BITS 3 ++#define BCH_BKEY_OFFSET_NS_ID_MASK ((1UL<<BCH_BKEY_OFFSET_NS_ID_BITS) - 1) ++#define BCH_BKEY_OFFSET_BITS 40 ++#define BCH_BKEY_OFFSET_MASK ((1UL<<BCH_BKEY_OFFSET_BITS) - 1) ++ ++#define BCH_DATA_BUCKET 0 ++#define BCH_META_BUCKET 1 ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset); + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + +@@ -96,6 +111,12 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); + struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id); ++void *bkey_offset_to_nvmpg_ptr(unsigned long bkey_offset); ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id); ++unsigned long nvmpg_offset_to_bkey_offset(unsigned long nvmpg_offset); ++unsigned long bkey_offset_to_nvmpg_offset(unsigned long bkey_offset); ++void *bch_nvmpg_offset_to_ptr(unsigned long offset); ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + + #else + +@@ -123,6 +144,37 @@ static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) + return NULL; + } + ++static inline void *bkey_offset_to_nvmpg_ptr(unsigned long bkey_offset) ++{ ++ return NULL; ++} ++ ++static inline struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id) ++{ ++ return NULL; ++} ++ ++static inline unsigned long nvmpg_offset_to_bkey_offset(unsigned long nvmpg_offset) ++{ ++ return 0; ++} ++ ++static inline unsigned long bkey_offset_to_nvmpg_offset(unsigned long bkey_offset) ++{ ++ return 0; ++} ++ ++static inline void *bch_nvmpg_offset_to_ptr(unsigned long offset) ++{ ++ return NULL; ++} ++ ++static inline unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) ++{ ++ return 0; 
++} ++ ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 723791250070..64b517e8136a 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -512,7 +512,8 @@ static int __uuid_write(struct cache_set *c) + closure_init_stack(&cl); + lockdep_assert_held(&bch_register_lock); + +- if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true)) ++ if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, ++ true, BCH_META_BUCKET)) + return 1; + + size = meta_bucket_pages(&ca->sb) * PAGE_SECTORS; +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0001-bcache-add-initial-data-structures-for-nvm-pages.patch b/for-next/nvmpg-bcache-journaling-v13/old/0001-bcache-add-initial-data-structures-for-nvm-pages.patch new file mode 100644 index 0000000..a3700f6 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0001-bcache-add-initial-data-structures-for-nvm-pages.patch @@ -0,0 +1,343 @@ +From bbb3b719dfc6070a5807bf6494f858e9e2f4f609 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 26 Jul 2021 00:26:28 +0800 +Subject: [PATCH 01/12] bcache: add initial data structures for nvm pages + +This patch initializes the prototype data structures for nvm pages +allocator, + +- struct bch_nvmpg_sb + This is the super block allocated on each nvdimm namespace for the nvm +pages allocator. A nvdimm pages allocator set may have multiple name- +spaces, bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this +namespace belongs to. + +- struct bch_nvmpg_header + This is a table for all heads of all allocation record lists. An allo- +cation record list traces all page(s) allocated from nvdimm namespace(s) +to a specific requester (identified by uuid). After system reboot, a +requester can retrieve all previously allocated nvdimm pages from its +record list by a pre-defined uuid. + +- struct bch_nvmpg_head + This is a head of an allocation record list. Each nvdimm pages +requester (typically it's a driver) has and only has one allocation +record list, and an allocated nvdimm page only belongs to a specific +allocation record list. Member uuid[] will be set as the requester's +uuid, e.g. for bcache it is the cache set uuid. Member label is not +mandatory, it is a human-readable string for debug purpose. The nvm +offset format pointers recs_offset[] point to the location of actual +allocator record lists on each namespace of the nvdimm pages allocator +set. Each per namespace record list is represented by the following +struct bch_nvmpg_recs. + +- struct bch_nvmpg_recs + This structure represents a requester's allocation record list. Member +uuid is same value as the uuid of its corresponding struct +bch_nvmpg_head. Member recs[] is a table of struct bch_pgalloc_rec +objects to trace all allocated nvmdimm pages. If the table recs[] is +full, the nvmpg format offset is a pointer points to the next struct +bch_nvmpg_recs object, nvm pages allocator will look for available free +allocation record there. All the linked struct bch_nvmpg_recs objects +compose a requester's allocation record list which is headed by the +above struct bch_nvmpg_head. + +- struct bch_nvmpg_rec + This structure records a range of allocated nvdimm pages. Member pgoff +is offset in unit of page size of this allocation range. Member order +indicates size of the allocation range by (1 << order) in unit of page +size. 
Because the nvdimm pages allocator set may have multiple nvdimm +namespaces, member ns_id is used to identify which namespace the pgoff +belongs to. + - Bits 0 - 51: pgoff - is pages offset of the allocated pages. + - Bits 52 - 57: order - allocated size in page_size * order-of-2 + - Bits 58 - 60: ns_id - identify which namespace the pages stays on + - Bits 61 - 63: reserved. +Since each of the allocated nvm pages are power of 2, using 6 bits to +represent allocated size can have (1<<(1<<64) - 1) * PAGE_SIZE maximum +value. It can be a 76 bits width range size in byte for 4KB page size, +which is large enough currently. + +All the structure members having _offset suffix are in a special format. +E.g. bch_nvmpg_sb.{sb_offset, pages_offset, set_header_offset}, +bch_nvmpg_head.recs_offset, bch_nvmpg_recs.{head_offset, next_offset}, +the offset value is 64bit, the most significant 3 bits are used to +identify which namespace this offset belongs to, and the rested 61 bits +are actual offset inside the namespace. Following patches will have +helper routines to do the conversion between memory pointer and offset. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Ying Huang <ying.huang@intel.com> +--- + drivers/md/bcache/nvmpg_format.h | 253 +++++++++++++++++++++++++++++++ + 1 file changed, 253 insertions(+) + create mode 100644 drivers/md/bcache/nvmpg_format.h + +diff --git a/drivers/md/bcache/nvmpg_format.h b/drivers/md/bcache/nvmpg_format.h +new file mode 100644 +index 000000000000..e9eb6371fd78 +--- /dev/null ++++ b/drivers/md/bcache/nvmpg_format.h +@@ -0,0 +1,253 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++ ++#ifndef _NVMPG_FORMAT_H ++#define _NVMPG_FORMAT_H ++ ++/* ++ * Bcache on NVDIMM data structures ++ */ ++ ++/* ++ * - struct bch_nvmpg_sb ++ * This is the super block allocated on each nvdimm namespace for the nvm ++ * pages allocator. A nvdimm pages allocator set may have multiple namespaces, ++ * bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this name space ++ * belongs to. ++ * ++ * - struct bch_nvmpg_header ++ * This is a table for all heads of all allocation record lists. An allo- ++ * cation record list traces all page(s) allocated from nvdimm namespace(s) to ++ * a specific requester (identified by uuid). After system reboot, a requester ++ * can retrieve all previously allocated nvdimm pages from its record list by a ++ * pre-defined uuid. ++ * ++ * - struct bch_nvmpg_head ++ * This is a head of an allocation record list. Each nvdimm pages requester ++ * (typically it's a driver) has and only has one allocation record list, and ++ * an allocated nvdimm page only bedlones to a specific allocation record list. ++ * Member uuid[] will be set as the requester's uuid, e.g. for bcache it is the ++ * cache set uuid. Member label is not mandatory, it is a human-readable string ++ * for debug purpose. The nvm offset format pointers recs_offset[] point to the ++ * location of actual allocator record lists on each name space of the nvdimm ++ * pages allocator set. Each per name space record list is represented by the ++ * following struct bch_nvmpg_recs. ++ * ++ * - struct bch_nvmpg_recs ++ * This structure represents a requester's allocation record list. 
Member uuid ++ * is same value as the uuid of its corresponding struct bch_nvmpg_head. Member ++ * recs[] is a table of struct bch_pgalloc_rec objects to trace all allocated ++ * nvmdimm pages. If the table recs[] is full, the nvmpg format offset is a ++ * pointer points to the next struct bch_nvmpg_recs object, nvm pages allocator ++ * will look for available free allocation record there. All the linked ++ * struct bch_nvmpg_recs objects compose a requester's allocation record list ++ * which is headed by the above struct bch_nvmpg_head. ++ * ++ * - struct bch_nvmpg_rec ++ * This structure records a range of allocated nvdimm pages. Member pgoff is ++ * offset in unit of page size of this allocation range. Member order indicates ++ * size of the allocation range by (1 << order) in unit of page size. Because ++ * the nvdimm pages allocator set may have multiple nvdimm name spaces, member ++ * ns_id is used to identify which name space the pgoff belongs to. ++ * ++ * All allocation record lists are stored on the first initialized nvdimm name- ++ * space (ns_id 0). The meta data default layout of nvm pages allocator on ++ * namespace 0 is, ++ * ++ * 0 +---------------------------------+ ++ * | | ++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET ++ * | bch_nvmpg_sb | ++ * 8KB +---------------------------------+ <-- BCH_NVMPG_RECLIST_HEAD_OFFSET ++ * | bch_nvmpg_header | ++ * | | ++ * 16KB +---------------------------------+ <-- BCH_NVMPG_SYSRECS_OFFSET ++ * | bch_nvmpg_recs | ++ * | (nvm pages internal usage) | ++ * 24KB +---------------------------------+ ++ * | | ++ * | | ++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START ++ * | allocable nvm pages | ++ * | for buddy allocator | ++ * end +---------------------------------+ ++ * ++ * ++ * ++ * Meta data default layout on rested nvdimm namespaces, ++ * ++ * 0 +---------------------------------+ ++ * | | ++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET ++ * | bch_nvmpg_sb | ++ * 8KB +---------------------------------+ ++ * | | ++ * | | ++ * | | ++ * | | ++ * | | ++ * | | ++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START ++ * | allocable nvm pages | ++ * | for buddy allocator | ++ * end +---------------------------------+ ++ * ++ * ++ * - The nvmpg offset format pointer ++ * All member names ending with _offset in this header are nvmpg offset ++ * format pointer. The offset format is, ++ * [highest 3 bits: ns_id] ++ * [rested 61 bits: offset in No. ns_id namespace] ++ * ++ * The above offset is byte unit, the procedure to reference a nvmpg offset ++ * format pointer is, ++ * 1) Identify the namespace related in-memory structure by ns_id from the ++ * highest 3 bits of offset value. ++ * 2) Get the DAX mapping base address from the in-memory structure. ++ * 3) Calculate the actual memory address on nvdimm by plusing the DAX base ++ * address with offset value in rested low 61 bits. 
++ * All related in-memory structure and conversion routines don't belong to ++ * user space api, they are defined by nvm-pages allocator code in ++ * drivers/md/bcache/nvm-pages.{c,h} ++ * ++ */ ++ ++#include <linux/types.h> ++ ++/* In sectors */ ++#define BCH_NVMPG_SB_OFFSET 4096 ++#define BCH_NVMPG_START (16 << 20) ++ ++#define BCH_NVMPG_LBL_SIZE 32 ++#define BCH_NVMPG_NS_MAX 8 ++ ++#define BCH_NVMPG_RECLIST_HEAD_OFFSET (8<<10) ++#define BCH_NVMPG_SYSRECS_OFFSET (16<<10) ++ ++#define BCH_NVMPG_SB_VERSION 0 ++#define BCH_NVMPG_SB_VERSION_MAX 0 ++ ++static const __u8 bch_nvmpg_magic[] = { ++ 0x17, 0xbd, 0x53, 0x7f, 0x1b, 0x23, 0xd6, 0x83, ++ 0x46, 0xa4, 0xf8, 0x28, 0x17, 0xda, 0xec, 0xa9 }; ++static const __u8 bch_nvmpg_recs_magic[] = { ++ 0x39, 0x25, 0x3f, 0xf7, 0x27, 0x17, 0xd0, 0xb9, ++ 0x10, 0xe6, 0xd2, 0xda, 0x38, 0x68, 0x26, 0xae }; ++ ++/* takes 64bit width */ ++struct bch_nvmpg_rec { ++ union { ++ struct { ++ __u64 pgoff:52; ++ __u64 order:6; ++ __u64 ns_id:3; ++ __u64 reserved:3; ++ }; ++ __u64 _v; ++ }; ++}; ++ ++struct bch_nvmpg_recs { ++ union { ++ struct { ++ /* ++ * A nvmpg offset format pointer to ++ * struct bch_nvmpg_head ++ */ ++ __u64 head_offset; ++ /* ++ * A nvmpg offset format pointer to ++ * struct bch_nvm_pgalloc_recs which contains ++ * the next recs[] array. ++ */ ++ __u64 next_offset; ++ __u8 magic[16]; ++ __u8 uuid[16]; ++ __u32 size; ++ __u32 used; ++ __u64 _pad[4]; ++ struct bch_nvmpg_rec recs[]; ++ }; ++ __u8 pad[8192]; ++ }; ++}; ++ ++#define BCH_NVMPG_MAX_RECS \ ++ ((sizeof(struct bch_nvmpg_recs) - \ ++ offsetof(struct bch_nvmpg_recs, recs)) / \ ++ sizeof(struct bch_nvmpg_rec)) ++ ++#define BCH_NVMPG_HD_STAT_FREE 0x0 ++#define BCH_NVMPG_HD_STAT_ALLOC 0x1 ++struct bch_nvmpg_head { ++ __u8 uuid[16]; ++ __u8 label[BCH_NVMPG_LBL_SIZE]; ++ __u32 state; ++ __u32 flags; ++ /* ++ * Array of offset values from the nvmpg offset format ++ * pointers, each of the pointer points to a per-namespace ++ * struct bch_nvmpg_recs. ++ */ ++ __u64 recs_offset[BCH_NVMPG_NS_MAX]; ++}; ++ ++/* heads[0] is always for nvm_pages internal usage */ ++struct bch_nvmpg_set_header { ++ union { ++ struct { ++ __u32 size; ++ __u32 used; ++ __u64 _pad[4]; ++ struct bch_nvmpg_head heads[]; ++ }; ++ __u8 pad[8192]; ++ }; ++}; ++ ++#define BCH_NVMPG_MAX_HEADS \ ++ ((sizeof(struct bch_nvmpg_set_header) - \ ++ offsetof(struct bch_nvmpg_set_header, heads)) / \ ++ sizeof(struct bch_nvmpg_head)) ++ ++/* The on-media bit order is local CPU order */ ++struct bch_nvmpg_sb { ++ __u64 csum; ++ __u64 sb_offset; ++ __u64 ns_start; ++ __u64 version; ++ __u8 magic[16]; ++ __u8 uuid[16]; ++ __u32 page_size; ++ __u32 total_ns; ++ __u32 this_ns; ++ union { ++ __u8 set_uuid[16]; ++ __u64 set_magic; ++ }; ++ ++ __u64 flags; ++ __u64 seq; ++ ++ __u64 feature_compat; ++ __u64 feature_incompat; ++ __u64 feature_ro_compat; ++ ++ /* For allocable nvm pages from buddy systems */ ++ __u64 pages_offset; ++ __u64 pages_total; ++ ++ __u64 pad[8]; ++ ++ /* ++ * A nvmpg offset format pointer, it points ++ * to struct bch_nvmpg_set_header which is ++ * stored only on the first name space. 
++ */ ++ __u64 set_header_offset; ++ ++ /* Just for csum_set() */ ++ __u32 keys; ++ __u64 d[0]; ++}; ++ ++#endif /* _NVMPG_FORMAT_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0002-bcache-initialize-the-nvm-pages-allocator.patch b/for-next/nvmpg-bcache-journaling-v13/old/0002-bcache-initialize-the-nvm-pages-allocator.patch new file mode 100644 index 0000000..ff4445c --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0002-bcache-initialize-the-nvm-pages-allocator.patch @@ -0,0 +1,542 @@ +From a13fa68537fa67df106e366c0e1cd35d4e715feb Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Mon, 26 Jul 2021 10:33:30 +0800 +Subject: [PATCH 02/12] bcache: initialize the nvm pages allocator + +This patch define the prototype data structures in memory and +initializes the nvm pages allocator. + +The nvm address space which is managed by this allocator can consist of +many nvm namespaces, and some namespaces can compose into one nvm set, +like cache set. For this initial implementation, only one set can be +supported. + +The users of this nvm pages allocator need to call register_namespace() +to register the nvdimm device (like /dev/pmemX) into this allocator as +the instance of struct nvm_namespace. + +Reported-by: Randy Dunlap <rdunlap@infradead.org> +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/Kconfig | 10 ++ + drivers/md/bcache/Makefile | 1 + + drivers/md/bcache/nvmpg.c | 340 +++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 97 +++++++++++ + drivers/md/bcache/super.c | 3 + + 5 files changed, 451 insertions(+) + create mode 100644 drivers/md/bcache/nvmpg.c + create mode 100644 drivers/md/bcache/nvmpg.h + +diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig +index cf3e8096942a..4a7c13e882bb 100644 +--- a/drivers/md/bcache/Kconfig ++++ b/drivers/md/bcache/Kconfig +@@ -36,3 +36,13 @@ config BCACHE_ASYNC_REGISTRATION + device path into this file will returns immediately and the real + registration work is handled in kernel work queue in asynchronous + way. ++ ++config BCACHE_NVM_PAGES ++ bool "NVDIMM support for bcache (EXPERIMENTAL)" ++ depends on BCACHE ++ depends on 64BIT ++ depends on LIBNVDIMM ++ depends on DAX ++ help ++ Allocate/release NV-memory pages for bcache and provide allocated pages ++ for each requestor after system reboot. +diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile +index 5b87e59676b8..276b33be5ad5 100644 +--- a/drivers/md/bcache/Makefile ++++ b/drivers/md/bcache/Makefile +@@ -5,3 +5,4 @@ obj-$(CONFIG_BCACHE) += bcache.o + bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ + io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o features.o ++bcache-$(CONFIG_BCACHE_NVM_PAGES) += nvmpg.o +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +new file mode 100644 +index 000000000000..1dd321e4c280 +--- /dev/null ++++ b/drivers/md/bcache/nvmpg.c +@@ -0,0 +1,340 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Nvdimm page-buddy allocator ++ * ++ * Copyright (c) 2021, Intel Corporation. ++ * Copyright (c) 2021, Qiaowei Ren <qiaowei.ren@intel.com>. ++ * Copyright (c) 2021, Jianpeng Ma <jianpeng.ma@intel.com>. 
++ */ ++ ++#include "bcache.h" ++#include "nvmpg.h" ++ ++#include <linux/slab.h> ++#include <linux/list.h> ++#include <linux/mutex.h> ++#include <linux/dax.h> ++#include <linux/pfn_t.h> ++#include <linux/libnvdimm.h> ++#include <linux/mm_types.h> ++#include <linux/err.h> ++#include <linux/pagemap.h> ++#include <linux/bitmap.h> ++#include <linux/blkdev.h> ++ ++struct bch_nvmpg_set *global_nvmpg_set; ++ ++void *bch_nvmpg_offset_to_ptr(unsigned long offset) ++{ ++ int ns_id = BCH_NVMPG_GET_NS_ID(offset); ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[ns_id]; ++ ++ if (offset == 0) ++ return NULL; ++ ++ ns_id = BCH_NVMPG_GET_NS_ID(offset); ++ ns = global_nvmpg_set->ns_tbl[ns_id]; ++ ++ if (ns) ++ return (void *)(ns->base_addr + BCH_NVMPG_GET_OFFSET(offset)); ++ ++ pr_err("Invalid ns_id %u\n", ns_id); ++ return NULL; ++} ++ ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) ++{ ++ int ns_id = ns->ns_id; ++ unsigned long offset = (unsigned long)(ptr - ns->base_addr); ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ ++static void release_ns_tbl(struct bch_nvmpg_set *set) ++{ ++ int i; ++ struct bch_nvmpg_ns *ns; ++ ++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { ++ ns = set->ns_tbl[i]; ++ if (ns) { ++ fs_put_dax(ns->dax_dev); ++ blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); ++ set->ns_tbl[i] = NULL; ++ set->attached_ns--; ++ kfree(ns); ++ } ++ } ++ ++ if (set->attached_ns) ++ pr_err("unexpected attached_ns: %u\n", set->attached_ns); ++} ++ ++static void release_nvmpg_set(struct bch_nvmpg_set *set) ++{ ++ release_ns_tbl(set); ++ kfree(set); ++} ++ ++/* Namespace 0 contains all meta data of the nvmpg allocation set */ ++static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) ++{ ++ struct bch_nvmpg_set_header *set_header; ++ ++ if (ns->ns_id != 0) { ++ pr_err("unexpected ns_id %u for first nvmpg namespace.\n", ++ ns->ns_id); ++ return -EINVAL; ++ } ++ ++ set_header = bch_nvmpg_offset_to_ptr(ns->sb->set_header_offset); ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ global_nvmpg_set->set_header = set_header; ++ global_nvmpg_set->heads_size = set_header->size; ++ global_nvmpg_set->heads_used = set_header->used; ++ mutex_unlock(&global_nvmpg_set->lock); ++ ++ return 0; ++} ++ ++static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) ++{ ++ struct bch_nvmpg_sb *sb = ns->sb; ++ int rc = 0; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ ++ if (global_nvmpg_set->ns_tbl[sb->this_ns]) { ++ pr_err("ns_id %u already attached.\n", ns->ns_id); ++ rc = -EEXIST; ++ goto unlock; ++ } ++ ++ if (ns->ns_id != 0) { ++ pr_err("unexpected ns_id %u for first namespace.\n", ns->ns_id); ++ rc = -EINVAL; ++ goto unlock; ++ } ++ ++ if (global_nvmpg_set->attached_ns > 0) { ++ pr_err("multiple namespace attaching not supported yet\n"); ++ rc = -EOPNOTSUPP; ++ goto unlock; ++ } ++ ++ if ((global_nvmpg_set->attached_ns + 1) > sb->total_ns) { ++ pr_err("namespace counters error: attached %u > total %u\n", ++ global_nvmpg_set->attached_ns, ++ global_nvmpg_set->total_ns); ++ rc = -EINVAL; ++ goto unlock; ++ } ++ ++ memcpy(global_nvmpg_set->set_uuid, sb->set_uuid, 16); ++ global_nvmpg_set->ns_tbl[sb->this_ns] = ns; ++ global_nvmpg_set->attached_ns++; ++ global_nvmpg_set->total_ns = sb->total_ns; ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++ return rc; ++} ++ ++static int read_nvdimm_meta_super(struct block_device *bdev, ++ struct bch_nvmpg_ns *ns) ++{ ++ struct page *page; ++ struct bch_nvmpg_sb *sb; ++ uint64_t expected_csum = 0; ++ int r; ++ ++ page = 
read_cache_page_gfp(bdev->bd_inode->i_mapping, ++ BCH_NVMPG_SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); ++ ++ if (IS_ERR(page)) ++ return -EIO; ++ ++ sb = (struct bch_nvmpg_sb *) ++ (page_address(page) + offset_in_page(BCH_NVMPG_SB_OFFSET)); ++ ++ r = -EINVAL; ++ expected_csum = csum_set(sb); ++ if (expected_csum != sb->csum) { ++ pr_info("csum is not match with expected one\n"); ++ goto put_page; ++ } ++ ++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) { ++ pr_info("invalid bch_nvmpg_magic\n"); ++ goto put_page; ++ } ++ ++ if (sb->sb_offset != ++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_SB_OFFSET)) { ++ pr_info("invalid superblock offset 0x%llx\n", sb->sb_offset); ++ goto put_page; ++ } ++ ++ r = -EOPNOTSUPP; ++ if (sb->total_ns != 1) { ++ pr_info("multiple name space not supported yet.\n"); ++ goto put_page; ++ } ++ ++ ++ r = 0; ++ /* Necessary for DAX mapping */ ++ ns->page_size = sb->page_size; ++ ns->pages_total = sb->pages_total; ++ ++put_page: ++ put_page(page); ++ return r; ++} ++ ++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) ++{ ++ struct bch_nvmpg_ns *ns = NULL; ++ struct bch_nvmpg_sb *sb = NULL; ++ char buf[BDEVNAME_SIZE]; ++ struct block_device *bdev; ++ pgoff_t pgoff; ++ int id, err; ++ char *path; ++ long dax_ret = 0; ++ ++ path = kstrndup(dev_path, 512, GFP_KERNEL); ++ if (!path) { ++ pr_err("kstrndup failed\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ bdev = blkdev_get_by_path(strim(path), ++ FMODE_READ|FMODE_WRITE|FMODE_EXCL, ++ global_nvmpg_set); ++ if (IS_ERR(bdev)) { ++ pr_err("get %s error: %ld\n", dev_path, PTR_ERR(bdev)); ++ kfree(path); ++ return ERR_PTR(PTR_ERR(bdev)); ++ } ++ ++ err = -ENOMEM; ++ ns = kzalloc(sizeof(struct bch_nvmpg_ns), GFP_KERNEL); ++ if (!ns) ++ goto bdput; ++ ++ err = -EIO; ++ if (read_nvdimm_meta_super(bdev, ns)) { ++ pr_err("%s read nvdimm meta super block failed.\n", ++ bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EOPNOTSUPP; ++ ns->dax_dev = fs_dax_get_by_bdev(bdev); ++ if (!ns->dax_dev) { ++ pr_err("can't get dax device by %s\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ if (!dax_supported(ns->dax_dev, bdev, ns->page_size, 0, ++ bdev_nr_sectors(bdev))) { ++ pr_err("%s don't support DAX\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EINVAL; ++ if (bdev_dax_pgoff(bdev, 0, ns->page_size, &pgoff)) { ++ pr_err("invalid offset of %s\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EINVAL; ++ id = dax_read_lock(); ++ dax_ret = dax_direct_access(ns->dax_dev, pgoff, ns->pages_total, ++ &ns->base_addr, &ns->start_pfn); ++ if (dax_ret <= 0) { ++ pr_err("dax_direct_access error\n"); ++ dax_read_unlock(id); ++ goto free_ns; ++ } ++ ++ if (dax_ret < ns->pages_total) { ++ pr_warn("currently first %ld pages (from %lu in total) are used\n", ++ dax_ret, ns->pages_total); ++ } ++ dax_read_unlock(id); ++ ++ sb = (struct bch_nvmpg_sb *)(ns->base_addr + BCH_NVMPG_SB_OFFSET); ++ ++ err = -EINVAL; ++ /* Check magic again to make sure DAX mapping is correct */ ++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) { ++ pr_err("invalid bch_nvmpg_magic after DAX mapping\n"); ++ goto free_ns; ++ } ++ ++ if ((global_nvmpg_set->attached_ns > 0) && ++ memcmp(sb->set_uuid, global_nvmpg_set->set_uuid, 16)) { ++ pr_err("set uuid does not match with ns_id %u\n", ns->ns_id); ++ goto free_ns; ++ } ++ ++ if (sb->set_header_offset != ++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_RECLIST_HEAD_OFFSET)) { ++ pr_err("Invalid header offset: this_ns %u, ns_id %llu, offset 0x%llx\n", ++ sb->this_ns, ++ 
BCH_NVMPG_GET_NS_ID(sb->set_header_offset), ++ BCH_NVMPG_GET_OFFSET(sb->set_header_offset)); ++ goto free_ns; ++ } ++ ++ ns->page_size = sb->page_size; ++ ns->pages_offset = sb->pages_offset; ++ ns->pages_total = sb->pages_total; ++ ns->sb = sb; ++ ns->free = 0; ++ ns->bdev = bdev; ++ ns->set = global_nvmpg_set; ++ ++ err = attach_nvmpg_set(ns); ++ if (err < 0) ++ goto free_ns; ++ ++ mutex_init(&ns->lock); ++ ++ err = init_nvmpg_set_header(ns); ++ if (err < 0) ++ goto free_ns; ++ ++ kfree(path); ++ return ns; ++ ++free_ns: ++ fs_put_dax(ns->dax_dev); ++ kfree(ns); ++bdput: ++ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); ++ kfree(path); ++ return ERR_PTR(err); ++} ++ ++int __init bch_nvmpg_init(void) ++{ ++ global_nvmpg_set = kzalloc(sizeof(*global_nvmpg_set), GFP_KERNEL); ++ if (!global_nvmpg_set) ++ return -ENOMEM; ++ ++ global_nvmpg_set->total_ns = 0; ++ mutex_init(&global_nvmpg_set->lock); ++ ++ pr_info("bcache nvm init\n"); ++ return 0; ++} ++ ++void bch_nvmpg_exit(void) ++{ ++ release_nvmpg_set(global_nvmpg_set); ++ pr_info("bcache nvm exit\n"); ++} +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +new file mode 100644 +index 000000000000..698c890b2d15 +--- /dev/null ++++ b/drivers/md/bcache/nvmpg.h +@@ -0,0 +1,97 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _BCACHE_NVM_PAGES_H ++#define _BCACHE_NVM_PAGES_H ++ ++#include <linux/libnvdimm.h> ++ ++#include "nvmpg_format.h" ++ ++/* ++ * Bcache NVDIMM in memory data structures ++ */ ++ ++/* ++ * The following three structures in memory records which page(s) allocated ++ * to which owner. After reboot from power failure, they will be initialized ++ * based on nvm pages superblock in NVDIMM device. ++ */ ++struct bch_nvmpg_ns { ++ struct bch_nvmpg_sb *sb; ++ void *base_addr; ++ ++ unsigned char uuid[16]; ++ int ns_id; ++ unsigned int page_size; ++ unsigned long free; ++ unsigned long pages_offset; ++ unsigned long pages_total; ++ pfn_t start_pfn; ++ ++ struct dax_device *dax_dev; ++ struct block_device *bdev; ++ struct bch_nvmpg_set *set; ++ ++ struct mutex lock; ++}; ++ ++/* ++ * A set of namespaces. Currently only one set can be supported. 
++ */ ++struct bch_nvmpg_set { ++ unsigned char set_uuid[16]; ++ ++ int heads_size; ++ int heads_used; ++ struct bch_nvmpg_set_header *set_header; ++ ++ struct bch_nvmpg_ns *ns_tbl[BCH_NVMPG_NS_MAX]; ++ int total_ns; ++ int attached_ns; ++ ++ struct mutex lock; ++}; ++ ++#define BCH_NVMPG_NS_ID_BITS 3 ++#define BCH_NVMPG_OFFSET_BITS 61 ++#define BCH_NVMPG_NS_ID_MASK ((1UL<<BCH_NVMPG_NS_ID_BITS) - 1) ++#define BCH_NVMPG_OFFSET_MASK ((1UL<<BCH_NVMPG_OFFSET_BITS) - 1) ++ ++#define BCH_NVMPG_GET_NS_ID(offset) \ ++ (((offset) >> BCH_NVMPG_OFFSET_BITS) & BCH_NVMPG_NS_ID_MASK) ++ ++#define BCH_NVMPG_GET_OFFSET(offset) ((offset) & BCH_NVMPG_OFFSET_MASK) ++ ++#define BCH_NVMPG_OFFSET(ns_id, offset) \ ++ ((((ns_id) & BCH_NVMPG_NS_ID_MASK) << BCH_NVMPG_OFFSET_BITS) | \ ++ ((offset) & BCH_NVMPG_OFFSET_MASK)) ++ ++/* Indicate which field in bch_nvmpg_sb to be updated */ ++#define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ ++ ++void *bch_nvmpg_offset_to_ptr(unsigned long offset); ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); ++int bch_nvmpg_init(void); ++void bch_nvmpg_exit(void); ++ ++#else ++ ++static inline struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) ++{ ++ return NULL; ++} ++ ++static inline int bch_nvmpg_init(void) ++{ ++ return 0; ++} ++ ++static inline void bch_nvmpg_exit(void) { } ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++#endif /* _BCACHE_NVM_PAGES_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 86b9e355c583..74d51a0b806f 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -14,6 +14,7 @@ + #include "request.h" + #include "writeback.h" + #include "features.h" ++#include "nvmpg.h" + + #include <linux/blkdev.h> + #include <linux/pagemap.h> +@@ -2818,6 +2819,7 @@ static void bcache_exit(void) + { + bch_debug_exit(); + bch_request_exit(); ++ bch_nvmpg_exit(); + if (bcache_kobj) + kobject_put(bcache_kobj); + if (bcache_wq) +@@ -2916,6 +2918,7 @@ static int __init bcache_init(void) + + bch_debug_init(); + closure_debug_init(); ++ bch_nvmpg_init(); + + bcache_is_reboot = false; + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0003-bcache-initialization-of-the-buddy.patch b/for-next/nvmpg-bcache-journaling-v13/old/0003-bcache-initialization-of-the-buddy.patch new file mode 100644 index 0000000..784b84b --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0003-bcache-initialization-of-the-buddy.patch @@ -0,0 +1,359 @@ +From eabc025702499684f588f362099f47998d0fde63 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 19:45:57 +0800 +Subject: [PATCH 03/12] bcache: initialization of the buddy + +This nvm pages allocator will implement the simple buddy allocator to +anage the nvm address space. This patch initializes this buddy allocator +for new namespace. + +the unit of alloc/free of the buddy allocator is page. DAX device has +their struct page(in dram or PMEM). + + struct { /* ZONE_DEVICE pages */ + /** @pgmap: Points to the hosting device page map. */ + struct dev_pagemap *pgmap; + void *zone_device_data; + /* + * ZONE_DEVICE private pages are counted as being + * mapped so the next 3 words hold the mapping, index, + * and private fields from the source anonymous or + * page cache page while the page is migrated to device + * private memory. 
+ * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also + * use the mapping, index, and private fields when + * pmem backed DAX files are mapped. + */ + }; + +ZONE_DEVICE pages only use pgmap. Other 4 words[16/32 bytes] don't use. +So the second/third word will be used as 'struct list_head ' which list +in buddy. The fourth word(that is normal struct page::index) store pgoff +which the page-offset in the dax device. And the fifth word (that is +normal struct page::private) store order of buddy. page_type will be used +to store buddy flags. + +Reported-by: kernel test robot <lkp@intel.com> +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 212 +++++++++++++++++++++++++++++++++++++- + drivers/md/bcache/nvmpg.h | 12 +++ + 2 files changed, 221 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 1dd321e4c280..80e12e06f6d3 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -50,6 +50,36 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) + return BCH_NVMPG_OFFSET(ns_id, offset); + } + ++static struct page *bch_nvmpg_va_to_pg(void *addr) ++{ ++ return virt_to_page(addr); ++} ++ ++static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff) ++{ ++ return ns->base_addr + (pgoff << PAGE_SHIFT); ++} ++ ++static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) ++{ ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id]; ++ pgoff_t pgoff = r->pgoff; ++ ++ return bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++} ++ ++static inline void reserve_nvmpg_pages(struct bch_nvmpg_ns *ns, ++ pgoff_t pgoff, u64 nr) ++{ ++ while (nr > 0) { ++ unsigned int num = nr > UINT_MAX ? 
UINT_MAX : nr; ++ ++ bitmap_set(ns->pages_bitmap, pgoff, num); ++ nr -= num; ++ pgoff += num; ++ } ++} ++ + static void release_ns_tbl(struct bch_nvmpg_set *set) + { + int i; +@@ -58,6 +88,10 @@ static void release_ns_tbl(struct bch_nvmpg_set *set) + for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { + ns = set->ns_tbl[i]; + if (ns) { ++ kvfree(ns->pages_bitmap); ++ if (ns->recs_bitmap) ++ bitmap_free(ns->recs_bitmap); ++ + fs_put_dax(ns->dax_dev); + blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + set->ns_tbl[i] = NULL; +@@ -76,10 +110,73 @@ static void release_nvmpg_set(struct bch_nvmpg_set *set) + kfree(set); + } + ++static int validate_recs(int ns_id, ++ struct bch_nvmpg_head *head, ++ struct bch_nvmpg_recs *recs) ++{ ++ if (memcmp(recs->magic, bch_nvmpg_recs_magic, 16)) { ++ pr_err("Invalid bch_nvmpg_recs magic\n"); ++ return -EINVAL; ++ } ++ ++ if (memcmp(recs->uuid, head->uuid, 16)) { ++ pr_err("Invalid bch_nvmpg_recs uuid\n"); ++ return -EINVAL; ++ } ++ ++ if (recs->head_offset != ++ bch_nvmpg_ptr_to_offset(global_nvmpg_set->ns_tbl[ns_id], head)) { ++ pr_err("Invalid recs head_offset\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int reserve_nvmpg_recs(struct bch_nvmpg_recs *recs) ++{ ++ int i, used = 0; ++ ++ for (i = 0; i < recs->size; i++) { ++ struct bch_nvmpg_rec *r = &recs->recs[i]; ++ struct bch_nvmpg_ns *ns; ++ struct page *page; ++ void *addr; ++ ++ if (r->pgoff == 0) ++ continue; ++ ++ ns = global_nvmpg_set->ns_tbl[r->ns_id]; ++ addr = bch_nvmpg_rec_to_ptr(r); ++ if (addr < ns->base_addr) { ++ pr_err("Invalid recorded address\n"); ++ return -EINVAL; ++ } ++ ++ /* init struct page: index/private */ ++ page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(page, r->order); ++ page->index = r->pgoff; ++ ++ reserve_nvmpg_pages(ns, r->pgoff, 1L << r->order); ++ used++; ++ } ++ ++ if (used != recs->used) { ++ pr_err("used %d doesn't match recs->used %d\n", ++ used, recs->used); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ + /* Namespace 0 contains all meta data of the nvmpg allocation set */ + static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_set_header *set_header; ++ struct bch_nvmpg_recs *sys_recs; ++ int i, j, used = 0, rc = 0; + + if (ns->ns_id != 0) { + pr_err("unexpected ns_id %u for first nvmpg namespace.\n", +@@ -93,9 +190,83 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + global_nvmpg_set->set_header = set_header; + global_nvmpg_set->heads_size = set_header->size; + global_nvmpg_set->heads_used = set_header->used; ++ ++ /* Reserve the used space from buddy allocator */ ++ reserve_nvmpg_pages(ns, 0, div_u64(ns->pages_offset, ns->page_size)); ++ ++ sys_recs = ns->base_addr + BCH_NVMPG_SYSRECS_OFFSET; ++ for (i = 0; i < set_header->size; i++) { ++ struct bch_nvmpg_head *head; ++ ++ head = &set_header->heads[i]; ++ if (head->state == BCH_NVMPG_HD_STAT_FREE) ++ continue; ++ ++ used++; ++ if (used > global_nvmpg_set->heads_size) { ++ pr_err("used heads %d > heads size %d.\n", ++ used, global_nvmpg_set->heads_size); ++ goto unlock; ++ } ++ ++ for (j = 0; j < BCH_NVMPG_NS_MAX; j++) { ++ struct bch_nvmpg_recs *recs; ++ ++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[j]); ++ ++ /* Iterate the recs list */ ++ while (recs) { ++ rc = validate_recs(j, head, recs); ++ if (rc < 0) ++ goto unlock; ++ ++ rc = reserve_nvmpg_recs(recs); ++ if (rc < 0) ++ goto unlock; ++ ++ bitmap_set(ns->recs_bitmap, recs - sys_recs, 1); ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++ } ++ } ++unlock: + 
mutex_unlock(&global_nvmpg_set->lock); ++ return rc; ++} + +- return 0; ++static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) ++{ ++ unsigned int start, end, pages; ++ int i; ++ struct page *page; ++ pgoff_t pgoff_start; ++ ++ bitmap_for_each_clear_region(ns->pages_bitmap, ++ start, end, 0, ns->pages_total) { ++ pgoff_start = start; ++ pages = end - start; ++ ++ while (pages) { ++ void *addr; ++ ++ for (i = BCH_MAX_ORDER - 1; i >= 0; i--) { ++ if ((pgoff_start % (1L << i) == 0) && ++ (pages >= (1L << i))) ++ break; ++ } ++ ++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start); ++ page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(page, i); ++ page->index = pgoff_start; ++ __SetPageBuddy(page); ++ list_add((struct list_head *)&page->zone_device_data, ++ &ns->free_area[i]); ++ ++ pgoff_start += 1L << i; ++ pages -= 1L << i; ++ } ++ } + } + + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) +@@ -200,7 +371,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + char buf[BDEVNAME_SIZE]; + struct block_device *bdev; + pgoff_t pgoff; +- int id, err; ++ int id, i, err; + char *path; + long dax_ret = 0; + +@@ -304,13 +475,48 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + + mutex_init(&ns->lock); + ++ /* ++ * parameters of bitmap_set/clear are unsigned int. ++ * Given currently size of nvm is far from exceeding this limit, ++ * so only add a WARN_ON message. ++ */ ++ WARN_ON(BITS_TO_LONGS(ns->pages_total) > UINT_MAX); ++ ns->pages_bitmap = kvcalloc(BITS_TO_LONGS(ns->pages_total), ++ sizeof(unsigned long), GFP_KERNEL); ++ if (!ns->pages_bitmap) { ++ err = -ENOMEM; ++ goto clear_ns_nr; ++ } ++ ++ if (ns->sb->this_ns == 0) { ++ ns->recs_bitmap = ++ bitmap_zalloc(BCH_MAX_PGALLOC_RECS, GFP_KERNEL); ++ if (ns->recs_bitmap == NULL) { ++ err = -ENOMEM; ++ goto free_pages_bitmap; ++ } ++ } ++ ++ for (i = 0; i < BCH_MAX_ORDER; i++) ++ INIT_LIST_HEAD(&ns->free_area[i]); ++ + err = init_nvmpg_set_header(ns); + if (err < 0) +- goto free_ns; ++ goto free_recs_bitmap; ++ ++ if (ns->sb->this_ns == 0) ++ /* init buddy allocator */ ++ bch_nvmpg_init_free_space(ns); + + kfree(path); + return ns; + ++free_recs_bitmap: ++ bitmap_free(ns->recs_bitmap); ++free_pages_bitmap: ++ kvfree(ns->pages_bitmap); ++clear_ns_nr: ++ global_nvmpg_set->ns_tbl[sb->this_ns] = NULL; + free_ns: + fs_put_dax(ns->dax_dev); + kfree(ns); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 698c890b2d15..55778d4db7da 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -11,6 +11,8 @@ + * Bcache NVDIMM in memory data structures + */ + ++#define BCH_MAX_ORDER 20 ++ + /* + * The following three structures in memory records which page(s) allocated + * to which owner. 
After reboot from power failure, they will be initialized +@@ -28,6 +30,11 @@ struct bch_nvmpg_ns { + unsigned long pages_total; + pfn_t start_pfn; + ++ unsigned long *pages_bitmap; ++ struct list_head free_area[BCH_MAX_ORDER]; ++ ++ unsigned long *recs_bitmap; ++ + struct dax_device *dax_dev; + struct block_device *bdev; + struct bch_nvmpg_set *set; +@@ -69,6 +76,11 @@ struct bch_nvmpg_set { + /* Indicate which field in bch_nvmpg_sb to be updated */ + #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ + ++#define BCH_MAX_PGALLOC_RECS \ ++ (min_t(unsigned int, 64, \ ++ (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ ++ sizeof(struct bch_nvmpg_recs))) ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset); + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch b/for-next/nvmpg-bcache-journaling-v13/old/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch new file mode 100644 index 0000000..94dc417 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch @@ -0,0 +1,308 @@ +From badd2b9151913efdc34e68b532ca0e6360d5ba1b Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Wed, 4 Aug 2021 22:41:20 +0800 +Subject: [PATCH 04/12] bcache: bch_nvmpg_alloc_pages() of the buddy + +This patch implements the bch_nvmpg_alloc_pages() of the nvm pages buddy +allocator. In terms of function, this func is like current +page-buddy-alloc. But the differences are: +a: it need owner_uuid as parameter which record owner info. And it +make those info persistence. +b: it don't need flags like GFP_*. All allocs are the equal. +c: it don't trigger other ops etc swap/recycle. 
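For illustration only (an editor's sketch, not part of this patch): given the interfaces added in this series, a requester identified by a 16-byte uuid uses the allocator roughly as below. owner_uuid is a hypothetical caller-owned uuid buffer; error handling is reduced to the minimum.

	/* Sketch: allocate 1 << 2 pages for owner_uuid, touch them, free them. */
	unsigned long nvmpg_offset;
	void *ptr;

	nvmpg_offset = bch_nvmpg_alloc_pages(2, owner_uuid);
	if (!nvmpg_offset)
		return -ENOMEM;				/* 0 means the allocation failed */

	ptr = bch_nvmpg_offset_to_ptr(nvmpg_offset);	/* nvmpg offset -> DAX-mapped address */
	memset(ptr, 0, 4 * PAGE_SIZE);			/* NVDIMM pages are directly addressable */

	bch_nvmpg_free_pages(nvmpg_offset, 2, owner_uuid);

The returned offset (namespace ID in the top bits, byte offset in the rest) is what gets recorded in the owner's allocation record list, so the same pages can be located again after a reboot.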
+ +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 221 ++++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 9 ++ + 2 files changed, 230 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 80e12e06f6d3..ca8ffcec9b2c 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -42,6 +42,11 @@ void *bch_nvmpg_offset_to_ptr(unsigned long offset) + return NULL; + } + ++static unsigned long bch_nvmpg_offset_to_pgoff(unsigned long nvmpg_offset) ++{ ++ return BCH_NVMPG_GET_OFFSET(nvmpg_offset) >> PAGE_SHIFT; ++} ++ + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) + { + int ns_id = ns->ns_id; +@@ -60,6 +65,15 @@ static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff) + return ns->base_addr + (pgoff << PAGE_SHIFT); + } + ++static unsigned long bch_nvmpg_pgoff_to_offset(struct bch_nvmpg_ns *ns, ++ pgoff_t pgoff) ++{ ++ int ns_id = ns->ns_id; ++ unsigned long offset = pgoff << PAGE_SHIFT; ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ + static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) + { + struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id]; +@@ -269,6 +283,213 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + } + } + ++ ++/* If not found, it will create if create == true */ ++static struct bch_nvmpg_head *find_nvmpg_head(const char *uuid, bool create) ++{ ++ struct bch_nvmpg_set_header *set_header = global_nvmpg_set->set_header; ++ struct bch_nvmpg_head *head = NULL; ++ int i; ++ ++ if (set_header == NULL) ++ goto out; ++ ++ for (i = 0; i < set_header->size; i++) { ++ struct bch_nvmpg_head *h = &set_header->heads[i]; ++ ++ if (h->state != BCH_NVMPG_HD_STAT_ALLOC) ++ continue; ++ ++ if (!memcmp(uuid, h->uuid, 16)) { ++ head = h; ++ break; ++ } ++ } ++ ++ if (!head && create) { ++ u32 used = set_header->used; ++ ++ if (set_header->size > used) { ++ head = &set_header->heads[used]; ++ memset(head, 0, sizeof(struct bch_nvmpg_head)); ++ head->state = BCH_NVMPG_HD_STAT_ALLOC; ++ memcpy(head->uuid, uuid, 16); ++ global_nvmpg_set->heads_used++; ++ set_header->used++; ++ } else ++ pr_info("No free bch_nvmpg_head\n"); ++ } ++ ++out: ++ return head; ++} ++ ++static struct bch_nvmpg_recs *find_empty_nvmpg_recs(void) ++{ ++ unsigned int start; ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[0]; ++ struct bch_nvmpg_recs *recs; ++ ++ start = bitmap_find_next_zero_area(ns->recs_bitmap, ++ BCH_MAX_PGALLOC_RECS, 0, 1, 0); ++ if (start > BCH_MAX_PGALLOC_RECS) { ++ pr_info("No free struct bch_nvmpg_recs\n"); ++ return NULL; ++ } ++ ++ bitmap_set(ns->recs_bitmap, start, 1); ++ recs = (struct bch_nvmpg_recs *) ++ bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET) ++ + start; ++ ++ memset(recs, 0, sizeof(struct bch_nvmpg_recs)); ++ return recs; ++} ++ ++ ++static struct bch_nvmpg_recs *find_nvmpg_recs(struct bch_nvmpg_ns *ns, ++ struct bch_nvmpg_head *head, ++ bool create) ++{ ++ int ns_id = ns->sb->this_ns; ++ struct bch_nvmpg_recs *prev_recs = NULL, *recs = NULL; ++ ++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[ns_id]); ++ ++ /* If create=false, we return recs[nr] */ ++ if (!create) ++ return recs; ++ ++ /* ++ * If create=true, it mean we need a empty struct 
bch_nvmpg_rec ++ * So we should find non-empty struct bch_nvmpg_recs or alloc ++ * new struct bch_nvmpg_recs. And return this bch_nvmpg_recs ++ */ ++ while (recs && (recs->used == recs->size)) { ++ prev_recs = recs; ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++ ++ /* Found empty struct bch_nvmpg_recs */ ++ if (recs) ++ return recs; ++ ++ /* Need alloc new struct bch_nvmpg_recs */ ++ recs = find_empty_nvmpg_recs(); ++ if (recs) { ++ unsigned long offset; ++ ++ recs->next_offset = 0; ++ recs->head_offset = bch_nvmpg_ptr_to_offset(ns, head); ++ memcpy(recs->magic, bch_nvmpg_recs_magic, 16); ++ memcpy(recs->uuid, head->uuid, 16); ++ recs->size = BCH_NVMPG_MAX_RECS; ++ recs->used = 0; ++ ++ offset = bch_nvmpg_ptr_to_offset(ns, recs); ++ if (prev_recs) ++ prev_recs->next_offset = offset; ++ else ++ head->recs_offset[ns_id] = offset; ++ } ++ ++ return recs; ++} ++ ++static void add_nvmpg_rec(struct bch_nvmpg_ns *ns, ++ struct bch_nvmpg_recs *recs, ++ unsigned long nvmpg_offset, ++ int order) ++{ ++ int i, ns_id; ++ unsigned long pgoff; ++ ++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset); ++ ns_id = ns->sb->this_ns; ++ ++ for (i = 0; i < recs->size; i++) { ++ if (recs->recs[i].pgoff == 0) { ++ recs->recs[i].pgoff = pgoff; ++ recs->recs[i].order = order; ++ recs->recs[i].ns_id = ns_id; ++ recs->used++; ++ break; ++ } ++ } ++ BUG_ON(i == recs->size); ++} ++ ++ ++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) ++{ ++ unsigned long nvmpg_offset = 0; ++ struct bch_nvmpg_head *head; ++ int n, o; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ head = find_nvmpg_head(uuid, true); ++ ++ if (!head) { ++ pr_err("Cannot find bch_nvmpg_recs by uuid.\n"); ++ goto unlock; ++ } ++ ++ for (n = 0; n < global_nvmpg_set->total_ns; n++) { ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[n]; ++ ++ if (!ns || (ns->free < (1L << order))) ++ continue; ++ ++ for (o = order; o < BCH_MAX_ORDER; o++) { ++ struct list_head *list; ++ struct page *page, *buddy_page; ++ ++ if (list_empty(&ns->free_area[o])) ++ continue; ++ ++ list = ns->free_area[o].next; ++ page = container_of((void *)list, struct page, ++ zone_device_data); ++ ++ list_del(list); ++ ++ while (o != order) { ++ void *addr; ++ pgoff_t pgoff; ++ ++ pgoff = page->index + (1L << (o - 1)); ++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++ buddy_page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(buddy_page, o - 1); ++ buddy_page->index = pgoff; ++ __SetPageBuddy(buddy_page); ++ list_add((struct list_head *)&buddy_page->zone_device_data, ++ &ns->free_area[o - 1]); ++ o--; ++ } ++ ++ set_page_private(page, order); ++ __ClearPageBuddy(page); ++ ns->free -= 1L << order; ++ nvmpg_offset = bch_nvmpg_pgoff_to_offset(ns, page->index); ++ break; ++ } ++ ++ if (o < BCH_MAX_ORDER) { ++ struct bch_nvmpg_recs *recs; ++ ++ recs = find_nvmpg_recs(ns, head, true); ++ /* ToDo: handle pgalloc_recs==NULL */ ++ add_nvmpg_rec(ns, recs, nvmpg_offset, order); ++ break; ++ } ++ } ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++ return nvmpg_offset; ++} ++ + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_sb *sb = ns->sb; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 55778d4db7da..d03f3241b45a 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -76,6 +76,9 @@ struct bch_nvmpg_set { + /* Indicate which field in bch_nvmpg_sb to be updated */ + #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ + ++#define BCH_PGOFF_TO_KVADDR(pgoff) \ ++ ((void *)((unsigned long)(pgoff) << 
PAGE_SHIFT)) ++ + #define BCH_MAX_PGALLOC_RECS \ + (min_t(unsigned int, 64, \ + (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ +@@ -89,6 +92,7 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); + int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); ++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + + #else + +@@ -104,6 +108,11 @@ static inline int bch_nvmpg_init(void) + + static inline void bch_nvmpg_exit(void) { } + ++static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) ++{ ++ return 0; ++} ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch b/for-next/nvmpg-bcache-journaling-v13/old/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch new file mode 100644 index 0000000..4ac1234 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch @@ -0,0 +1,251 @@ +From 7eac3b1797acdd2ff3c684c9fabd7fe12bd671c6 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 19:06:35 +0800 +Subject: [PATCH 05/12] bcache: bch_nvmpg_free_pages() of the buddy allocator + +This patch implements the bch_nvmpg_free_pages() of the buddy allocator. + +The difference between this and page-buddy-free: +it need owner_uuid to free owner allocated pages, and must +persistent after free. + +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 164 ++++++++++++++++++++++++++++++++++++-- + drivers/md/bcache/nvmpg.h | 3 + + 2 files changed, 160 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index ca8ffcec9b2c..9864436a45cc 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -248,6 +248,57 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + return rc; + } + ++static void __free_space(struct bch_nvmpg_ns *ns, unsigned long nvmpg_offset, ++ int order) ++{ ++ unsigned long add_pages = (1L << order); ++ pgoff_t pgoff; ++ struct page *page; ++ void *va; ++ ++ if (nvmpg_offset == 0) { ++ pr_err("free pages on offset 0\n"); ++ return; ++ } ++ ++ page = bch_nvmpg_va_to_pg(bch_nvmpg_offset_to_ptr(nvmpg_offset)); ++ WARN_ON((!page) || (page->private != order)); ++ pgoff = page->index; ++ ++ while (order < BCH_MAX_ORDER - 1) { ++ struct page *buddy_page; ++ ++ pgoff_t buddy_pgoff = pgoff ^ (1L << order); ++ pgoff_t parent_pgoff = pgoff & ~(1L << order); ++ ++ if ((parent_pgoff + (1L << (order + 1)) > ns->pages_total)) ++ break; ++ ++ va = bch_nvmpg_pgoff_to_ptr(ns, buddy_pgoff); ++ buddy_page = bch_nvmpg_va_to_pg(va); ++ WARN_ON(!buddy_page); ++ ++ if (PageBuddy(buddy_page) && (buddy_page->private == order)) { ++ list_del((struct list_head *)&buddy_page->zone_device_data); ++ __ClearPageBuddy(buddy_page); ++ pgoff = parent_pgoff; ++ order++; ++ continue; ++ } ++ break; ++ } ++ ++ va = bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++ page = bch_nvmpg_va_to_pg(va); ++ WARN_ON(!page); ++ list_add((struct list_head *)&page->zone_device_data, ++ &ns->free_area[order]); ++ 
page->index = pgoff; ++ set_page_private(page, order); ++ __SetPageBuddy(page); ++ ns->free += add_pages; ++} ++ + static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + { + unsigned int start, end, pages; +@@ -261,21 +312,19 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + pages = end - start; + + while (pages) { +- void *addr; +- + for (i = BCH_MAX_ORDER - 1; i >= 0; i--) { + if ((pgoff_start % (1L << i) == 0) && + (pages >= (1L << i))) + break; + } + +- addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start); +- page = bch_nvmpg_va_to_pg(addr); ++ page = bch_nvmpg_va_to_pg( ++ bch_nvmpg_pgoff_to_ptr(ns, pgoff_start)); + set_page_private(page, i); + page->index = pgoff_start; +- __SetPageBuddy(page); +- list_add((struct list_head *)&page->zone_device_data, +- &ns->free_area[i]); ++ ++ /* In order to update ns->free */ ++ __free_space(ns, pgoff_start, i); + + pgoff_start += 1L << i; + pages -= 1L << i; +@@ -490,6 +539,106 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return nvmpg_offset; + } + ++static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns) ++{ ++ return ns->base_addr + (ns->pages_total << PAGE_SHIFT); ++} ++ ++static inline bool in_nvmpg_ns_range(struct bch_nvmpg_ns *ns, ++ void *start_addr, void *end_addr) ++{ ++ return (start_addr >= ns->base_addr) && (end_addr < nvm_end_addr(ns)); ++} ++ ++static int remove_nvmpg_rec(struct bch_nvmpg_recs *recs, int ns_id, ++ unsigned long nvmpg_offset, int order) ++{ ++ struct bch_nvmpg_head *head; ++ struct bch_nvmpg_recs *prev_recs, *sys_recs; ++ struct bch_nvmpg_ns *ns; ++ unsigned long pgoff; ++ int i; ++ ++ ns = global_nvmpg_set->ns_tbl[0]; ++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset); ++ ++ head = bch_nvmpg_offset_to_ptr(recs->head_offset); ++ prev_recs = recs; ++ sys_recs = bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET); ++ while (recs) { ++ for (i = 0; i < recs->size; i++) { ++ struct bch_nvmpg_rec *rec = &(recs->recs[i]); ++ ++ if ((rec->pgoff == pgoff) && (rec->ns_id == ns_id)) { ++ WARN_ON(rec->order != order); ++ rec->_v = 0; ++ recs->used--; ++ ++ if (recs->used == 0) { ++ int recs_pos = recs - sys_recs; ++ ++ if (recs == prev_recs) ++ head->recs_offset[ns_id] = ++ recs->next_offset; ++ else ++ prev_recs->next_offset = ++ recs->next_offset; ++ ++ recs->next_offset = 0; ++ recs->head_offset = 0; ++ ++ bitmap_clear(ns->recs_bitmap, recs_pos, 1); ++ } ++ goto out; ++ } ++ } ++ prev_recs = recs; ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++out: ++ return (recs ? 
0 : -ENOENT); ++} ++ ++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, ++ const char *uuid) ++{ ++ struct bch_nvmpg_ns *ns; ++ struct bch_nvmpg_head *head; ++ struct bch_nvmpg_recs *recs; ++ int r; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ ++ ns = global_nvmpg_set->ns_tbl[BCH_NVMPG_GET_NS_ID(nvmpg_offset)]; ++ if (!ns) { ++ pr_err("can't find namespace by given kaddr from namespace\n"); ++ goto unlock; ++ } ++ ++ head = find_nvmpg_head(uuid, false); ++ if (!head) { ++ pr_err("can't found bch_nvmpg_head by uuid\n"); ++ goto unlock; ++ } ++ ++ recs = find_nvmpg_recs(ns, head, false); ++ if (!recs) { ++ pr_err("can't find bch_nvmpg_recs by uuid\n"); ++ goto unlock; ++ } ++ ++ r = remove_nvmpg_rec(recs, ns->sb->this_ns, nvmpg_offset, order); ++ if (r < 0) { ++ pr_err("can't find bch_nvmpg_rec\n"); ++ goto unlock; ++ } ++ ++ __free_space(ns, nvmpg_offset, order); ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++} ++ + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_sb *sb = ns->sb; +@@ -686,6 +835,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + ns->pages_offset = sb->pages_offset; + ns->pages_total = sb->pages_total; + ns->sb = sb; ++ /* increase by __free_space() */ + ns->free = 0; + ns->bdev = bdev; + ns->set = global_nvmpg_set; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index d03f3241b45a..e089936e7f13 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -93,6 +93,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); + int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); ++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + + #else + +@@ -113,6 +114,8 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return 0; + } + ++static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { } ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch b/for-next/nvmpg-bcache-journaling-v13/old/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch new file mode 100644 index 0000000..0a77f35 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch @@ -0,0 +1,66 @@ +From 3440789a920beb6e63493eecde279b6902ac0a1a Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 21:06:03 +0800 +Subject: [PATCH 06/12] bcache: get recs list head for allocated pages by + specific uuid + +This patch implements bch_get_nvmpg_head() of the buddy allocator +to be used to get recs list head for allocated pages by specific +uuid. Then the requester (owner) can find all previous allocated +nvdimm pages by iterating the recs list. 
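
As a rough illustration (not from the patch itself), a requester holding the head could walk its record list like this; the helper name count_requester_pages() is made up, locking on the global set is omitted, and the walk mirrors what find_journal_nvmpg_base() does in the journal initialization patch later in this series.

static unsigned long count_requester_pages(const char *uuid)
{
	struct bch_nvmpg_head *head = bch_get_nvmpg_head(uuid);
	unsigned long pages = 0;
	int i, j;

	if (!head)
		return 0;

	for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
		struct bch_nvmpg_recs *recs =
			bch_nvmpg_offset_to_ptr(head->recs_offset[i]);

		/* Follow the per-namespace chain of record tables */
		while (recs) {
			for (j = 0; j < recs->size; j++) {
				struct bch_nvmpg_rec *rec = &recs->recs[j];

				if (rec->_v)	/* non-zero: slot is in use */
					pages += 1UL << rec->order;
			}
			recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
		}
	}

	return pages;
}
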
+ +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 5 +++++ + drivers/md/bcache/nvmpg.h | 6 ++++++ + 2 files changed, 11 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 9864436a45cc..3c50cb09bb7a 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -539,6 +539,11 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return nvmpg_offset; + } + ++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) ++{ ++ return find_nvmpg_head(uuid, false); ++} ++ + static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns) + { + return ns->base_addr + (ns->pages_total << PAGE_SHIFT); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index e089936e7f13..2361cabf18be 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -94,6 +94,7 @@ int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); ++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); + + #else + +@@ -116,6 +117,11 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + + static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { } + ++static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) ++{ ++ return NULL; ++} ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch b/for-next/nvmpg-bcache-journaling-v13/old/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch new file mode 100644 index 0000000..f2880af --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch @@ -0,0 +1,48 @@ +From 80d34e8aba0591ad58f1c3336333b48c715e3a69 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 25 Jun 2021 00:17:02 +0800 +Subject: [PATCH 07/12] bcache: use bucket index to set GC_MARK_METADATA for + journal buckets in bch_btree_gc_finish() + +Currently the meta data bucket locations on cache device are reserved +after the meta data stored on NVDIMM pages, for the meta data layout +consistentcy temporarily. So these buckets are still marked as meta data +by SET_GC_MARK() in bch_btree_gc_finish(). + +When BCH_FEATURE_INCOMPAT_NVDIMM_META is set, the sb.d[] stores linear +address of NVDIMM pages and not bucket index anymore. Therefore we +should avoid to find bucket index from sb.d[], and directly use bucket +index from ca->sb.first_bucket to (ca->sb.first_bucket + +ca->sb.njournal_bucketsi) for setting the gc mark of journal bucket. 
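
For reference, the value kept in sb.d[] in that case is an nvmpg offset as described by nvmpg_format.h later in this series: the top 3 bits select a namespace and the low 61 bits are a byte offset inside that namespace's DAX mapping. The macro bodies below are a reconstruction for illustration only; the real BCH_NVMPG_OFFSET()/BCH_NVMPG_GET_*() definitions live in drivers/md/bcache/nvmpg.h and are not quoted on this page.

#define SKETCH_NVMPG_OFFSET_BITS	61

#define SKETCH_NVMPG_OFFSET(ns_id, off) \
	(((unsigned long)(ns_id) << SKETCH_NVMPG_OFFSET_BITS) | (off))
#define SKETCH_NVMPG_GET_NS_ID(o)	((o) >> SKETCH_NVMPG_OFFSET_BITS)
#define SKETCH_NVMPG_GET_OFFSET(o) \
	((o) & ((1UL << SKETCH_NVMPG_OFFSET_BITS) - 1))

/*
 * Decoding follows the steps documented in nvmpg_format.h: pick the
 * namespace from the high bits, then add the low bits to that
 * namespace's DAX mapping base address.
 */
static inline void *sketch_nvmpg_offset_to_ptr(struct bch_nvmpg_ns **ns_tbl,
					       unsigned long offset)
{
	struct bch_nvmpg_ns *ns = ns_tbl[SKETCH_NVMPG_GET_NS_ID(offset)];

	return ns ? ns->base_addr + SKETCH_NVMPG_GET_OFFSET(offset) : NULL;
}
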
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/btree.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 88c573eeb598..1a0ff117373f 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -1761,8 +1761,10 @@ static void bch_btree_gc_finish(struct cache_set *c) + ca = c->cache; + ca->invalidate_needs_gc = 0; + +- for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) +- SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); ++ /* Range [first_bucket, first_bucket + keys) is for journal buckets */ ++ for (i = ca->sb.first_bucket; ++ i < ca->sb.first_bucket + ca->sb.njournal_buckets; i++) ++ SET_GC_MARK(ca->buckets + i, GC_MARK_METADATA); + + for (k = ca->prio_buckets; + k < ca->prio_buckets + prio_buckets(ca) * 2; k++) +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch b/for-next/nvmpg-bcache-journaling-v13/old/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch new file mode 100644 index 0000000..30de10c --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch @@ -0,0 +1,60 @@ +From c006ab9655e4834a858bb399e1bcd8a51668d79c Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 25 Jun 2021 00:18:31 +0800 +Subject: [PATCH 08/12] bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into + incompat feature set + +This patch adds BCH_FEATURE_INCOMPAT_NVDIMM_META (value 0x0004) into the +incompat feature set. When this bit is set by bcache-tools, it indicates +bcache meta data should be stored on specific NVDIMM meta device. + +The bcache meta data mainly includes journal and btree nodes, when this +bit is set in incompat feature set, bcache will ask the nvm-pages +allocator for NVDIMM space to store the meta data. 
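
The reason the new bit is added to BCH_FEATURE_INCOMPAT_SUPP only when CONFIG_BCACHE_NVM_PAGES is enabled: registration refuses any super block carrying an incompat bit outside the supported mask, roughly as the existing helper in features.h does (paraphrased below). A kernel built without NVDIMM support therefore rejects such a cache device instead of silently ignoring NVDIMM-resident meta data.

static inline bool sketch_has_unknown_incompat_features(struct cache_sb *sb)
{
	/* Any incompat bit outside the supported mask is unknown */
	return (sb->feature_incompat & ~BCH_FEATURE_INCOMPAT_SUPP) != 0;
}
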
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/features.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h +index 09161b89c63e..fab92678be76 100644 +--- a/drivers/md/bcache/features.h ++++ b/drivers/md/bcache/features.h +@@ -18,11 +18,19 @@ + #define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET 0x0001 + /* real bucket size is (1 << bucket_size) */ + #define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE 0x0002 ++/* store bcache meta data on nvdimm */ ++#define BCH_FEATURE_INCOMPAT_NVDIMM_META 0x0004 + + #define BCH_FEATURE_COMPAT_SUPP 0 + #define BCH_FEATURE_RO_COMPAT_SUPP 0 ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++#define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ ++ BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE| \ ++ BCH_FEATURE_INCOMPAT_NVDIMM_META) ++#else + #define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ + BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE) ++#endif + + #define BCH_HAS_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_compat & (mask)) +@@ -90,6 +98,7 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \ + + BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET); + BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE); ++BCH_FEATURE_INCOMPAT_FUNCS(nvdimm_meta, NVDIMM_META); + + static inline bool bch_has_unknown_compat_features(struct cache_sb *sb) + { +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch b/for-next/nvmpg-bcache-journaling-v13/old/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch new file mode 100644 index 0000000..a56c25c --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch @@ -0,0 +1,255 @@ +From 09fdf9edf79edd718035e6d9afa75f80f1d3a330 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 21 Oct 2021 21:39:18 +0800 +Subject: [PATCH 09/12] bcache: initialize bcache journal for NVDIMM meta + device + +The nvm-pages allocator may store and index the NVDIMM pages allocated +for bcache journal. This patch adds the initialization to store bcache +journal space on NVDIMM pages if BCH_FEATURE_INCOMPAT_NVDIMM_META bit is +set by bcache-tools. + +If BCH_FEATURE_INCOMPAT_NVDIMM_META is set, get_nvdimm_journal_space() +will return the nvmpg_offset of NVDIMM pages for bcache journal, +- If there is previously allocated space, find it from nvm-pages owner + list and return to bch_journal_init(). +- If there is no previously allocated space, require a new NVDIMM range + from the nvm-pages allocator, and return it to bch_journal_init(). + +And in bch_journal_init(), keys in sb.d[] store the corresponding nvmpg +offset from NVDIMM into sb.d[i].ptr[0] where 'i' is the bucket index to +iterate all journal buckets. + +Later when bcache journaling code stores the journaling jset, the target +NVDIMM nvmpg offset stored (and updated) in sb.d[i].ptr[0] can be used +to calculate the linear address in memory copy from DRAM pages into +NVDIMM pages. 
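
A worked example of the allocation size, with assumed (not mandated) geometry of 4 KiB pages (PAGE_SECTORS == 8), 1 MiB buckets (bucket_size == 2048 sectors) and njournal_buckets == 256; the expression is the one used by get_journal_nvmpg_space() in this patch.

/*
 * order = ilog2((bucket_size * njournal_buckets) / PAGE_SECTORS)
 *       = ilog2((2048 * 256) / 8)
 *       = ilog2(65536) = 16
 *
 * so bch_nvmpg_alloc_pages(16, ca->sb.set_uuid) asks the buddy allocator
 * for one contiguous 2^16-page (256 MiB) range, which matches the
 * 256MB/512MB journal ranges mentioned in the cover letter of this series.
 */
order = ilog2((ca->sb.bucket_size * ca->sb.njournal_buckets) / PAGE_SECTORS);
nvmpg_offset = bch_nvmpg_alloc_pages(order, ca->sb.set_uuid);
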
+ +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 113 ++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/journal.h | 2 +- + drivers/md/bcache/nvmpg.c | 9 +++ + drivers/md/bcache/nvmpg.h | 1 + + drivers/md/bcache/super.c | 18 +++--- + 5 files changed, 132 insertions(+), 11 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 61bd79babf7a..d887557c718e 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -9,6 +9,8 @@ + #include "btree.h" + #include "debug.h" + #include "extents.h" ++#include "nvmpg.h" ++#include "features.h" + + #include <trace/events/bcache.h> + +@@ -982,3 +984,114 @@ int bch_journal_alloc(struct cache_set *c) + + return 0; + } ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static unsigned long find_journal_nvmpg_base(struct bch_nvmpg_head *nvmpg_head, ++ struct cache *ca) ++{ ++ unsigned long jnl_offset, jnl_pgoff, jnl_ns_id; ++ unsigned long ret_offset = 0; ++ int i; ++ ++ jnl_offset = (unsigned long)ca->sb.d[0]; ++ jnl_ns_id = BCH_NVMPG_GET_NS_ID(jnl_offset); ++ jnl_pgoff = BCH_NVMPG_GET_OFFSET(jnl_offset) >> PAGE_SHIFT; ++ ++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { ++ struct bch_nvmpg_recs *recs; ++ struct bch_nvmpg_rec *rec; ++ unsigned long recs_offset = 0; ++ int j; ++ ++ recs_offset = nvmpg_head->recs_offset[i]; ++ recs = bch_nvmpg_offset_to_ptr(recs_offset); ++ while (recs) { ++ for (j = 0; j < recs->size; j++) { ++ rec = &recs->recs[j]; ++ if ((rec->pgoff != jnl_pgoff) || ++ (rec->ns_id != jnl_ns_id)) ++ continue; ++ ++ ret_offset = jnl_offset; ++ goto out; ++ } ++ recs_offset = recs->next_offset; ++ recs = bch_nvmpg_offset_to_ptr(recs_offset); ++ } ++ } ++ ++out: ++ return ret_offset; ++} ++ ++static unsigned long get_journal_nvmpg_space(struct cache *ca) ++{ ++ struct bch_nvmpg_head *head = NULL; ++ unsigned long nvmpg_offset; ++ int order; ++ ++ head = bch_get_nvmpg_head(ca->sb.set_uuid); ++ if (head) { ++ nvmpg_offset = find_journal_nvmpg_base(head, ca); ++ if (nvmpg_offset) ++ goto found; ++ } ++ ++ order = ilog2((ca->sb.bucket_size * ++ ca->sb.njournal_buckets) / PAGE_SECTORS); ++ nvmpg_offset = bch_nvmpg_alloc_pages(order, ca->sb.set_uuid); ++ if (nvmpg_offset) ++ memset(bch_nvmpg_offset_to_ptr(nvmpg_offset), ++ 0, (1 << order) * PAGE_SIZE); ++found: ++ return nvmpg_offset; ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++static int __bch_journal_nvdimm_init(struct cache *ca) ++{ ++ int ret = -1; ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ int i; ++ unsigned long jnl_base = 0; ++ ++ jnl_base = get_journal_nvmpg_space(ca); ++ if (!jnl_base) { ++ pr_err("Failed to get journal space from nvdimm\n"); ++ goto out; ++ } ++ ++ /* Iniialized and reloaded from on-disk super block already */ ++ if (ca->sb.d[0] != 0) ++ goto out; ++ ++ for (i = 0; i < ca->sb.keys; i++) ++ ca->sb.d[i] = jnl_base + (bucket_bytes(ca) * i); ++ ++ ret = 0; ++out: ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++ return ret; ++} ++ ++ ++int bch_journal_init(struct cache_set *c) ++{ ++ int i, ret = 0; ++ struct cache *ca = c->cache; ++ ++ ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, ++ 2, SB_JOURNAL_BUCKETS); ++ ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) { ++ for (i = 0; i < ca->sb.keys; i++) ++ ca->sb.d[i] = ca->sb.first_bucket + i; ++ } else ++ ret = 
__bch_journal_nvdimm_init(ca); ++ ++ return ret; ++} +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index f2ea34d5f431..e3a7fa5a8fda 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -179,7 +179,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list); + void bch_journal_meta(struct cache_set *c, struct closure *cl); + int bch_journal_read(struct cache_set *c, struct list_head *list); + int bch_journal_replay(struct cache_set *c, struct list_head *list); +- ++int bch_journal_init(struct cache_set *c); + void bch_journal_free(struct cache_set *c); + int bch_journal_alloc(struct cache_set *c); + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 3c50cb09bb7a..2d0808a83f86 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -24,6 +24,15 @@ + + struct bch_nvmpg_set *global_nvmpg_set; + ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id) ++{ ++ if ((ns_id >= 0) && (ns_id < BCH_NVMPG_NS_MAX)) ++ return global_nvmpg_set->ns_tbl[ns_id]; ++ ++ pr_emerg("Invalid ns_id: %d\n", ns_id); ++ return NULL; ++} ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset) + { + int ns_id = BCH_NVMPG_GET_NS_ID(offset); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 2361cabf18be..f7b7177cced3 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -95,6 +95,7 @@ void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id); + + #else + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 74d51a0b806f..a27fa65d8832 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -147,9 +147,11 @@ static const char *read_super_common(struct cache_sb *sb, struct block_device * + goto err; + + err = "Journal buckets not sequential"; +- for (i = 0; i < sb->keys; i++) +- if (sb->d[i] != sb->first_bucket + i) +- goto err; ++ if (!bch_has_feature_nvdimm_meta(sb)) { ++ for (i = 0; i < sb->keys; i++) ++ if (sb->d[i] != sb->first_bucket + i) ++ goto err; ++ } + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) +@@ -2068,14 +2070,10 @@ static int run_cache_set(struct cache_set *c) + if (bch_journal_replay(c, &journal)) + goto err; + } else { +- unsigned int j; +- + pr_notice("invalidating existing data\n"); +- ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, +- 2, SB_JOURNAL_BUCKETS); +- +- for (j = 0; j < ca->sb.keys; j++) +- ca->sb.d[j] = ca->sb.first_bucket + j; ++ err = "error initializing journal"; ++ if (bch_journal_init(c)) ++ goto err; + + bch_initial_gc_finish(c); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch b/for-next/nvmpg-bcache-journaling-v13/old/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch new file mode 100644 index 0000000..99e53f3 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch @@ -0,0 +1,231 @@ +From ab08690b14942f881d545539e83762a6fa794131 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:45:23 +0800 +Subject: [PATCH 10/12] bcache: support storing bcache journal into NVDIMM meta + device + +This patch implements two methods to 
store bcache journal to, +1) __journal_write_unlocked() for block interface device + The latency method to compose bio and issue the jset bio to cache + device (e.g. SSD). c->journal.key.ptr[0] indicates the LBA on cache + device to store the journal jset. +2) __journal_nvdimm_write_unlocked() for memory interface NVDIMM + Use memory interface to access NVDIMM pages and store the jset by + memcpy_flushcache(). c->journal.key.ptr[0] indicates the linear + address from the NVDIMM pages to store the journal jset. + +For legacy configuration without NVDIMM meta device, journal I/O is +handled by __journal_write_unlocked() with existing code logic. If the +NVDIMM meta device is used (by bcache-tools), the journal I/O will +be handled by __journal_nvdimm_write_unlocked() and go into the NVDIMM +pages. + +And when NVDIMM meta device is used, sb.d[] stores the linear addresses +from NVDIMM pages (no more bucket index), in journal_reclaim() the +journaling location in c->journal.key.ptr[0] should also be updated by +linear address from NVDIMM pages (no more LBA combined by sectors offset +and bucket index). + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 120 +++++++++++++++++++++++++----------- + drivers/md/bcache/super.c | 3 +- + 2 files changed, 85 insertions(+), 38 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index d887557c718e..7d5c5ed18890 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -596,6 +596,8 @@ static void do_journal_discard(struct cache *ca) + return; + } + ++ BUG_ON(bch_has_feature_nvdimm_meta(&ca->sb)); ++ + switch (atomic_read(&ja->discard_in_flight)) { + case DISCARD_IN_FLIGHT: + return; +@@ -661,9 +663,16 @@ static void journal_reclaim(struct cache_set *c) + goto out; + + ja->cur_idx = next; +- k->ptr[0] = MAKE_PTR(0, +- bucket_to_sector(c, ca->sb.d[ja->cur_idx]), +- ca->sb.nr_this_dev); ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ k->ptr[0] = MAKE_PTR(0, ++ bucket_to_sector(c, ca->sb.d[ja->cur_idx]), ++ ca->sb.nr_this_dev); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ k->ptr[0] = (unsigned long)bch_nvmpg_offset_to_ptr( ++ ca->sb.d[ja->cur_idx]); ++#endif ++ + atomic_long_inc(&c->reclaimed_journal_buckets); + + bkey_init(k); +@@ -729,46 +738,21 @@ static void journal_write_unlock(struct closure *cl) + spin_unlock(&c->journal.lock); + } + +-static void journal_write_unlocked(struct closure *cl) ++ ++static void __journal_write_unlocked(struct cache_set *c) + __releases(c->journal.lock) + { +- struct cache_set *c = container_of(cl, struct cache_set, journal.io); +- struct cache *ca = c->cache; +- struct journal_write *w = c->journal.cur; + struct bkey *k = &c->journal.key; +- unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * +- ca->sb.block_size; +- ++ struct journal_write *w = c->journal.cur; ++ struct closure *cl = &c->journal.io; ++ struct cache *ca = c->cache; + struct bio *bio; + struct bio_list list; ++ unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * ++ ca->sb.block_size; + + bio_list_init(&list); + +- if (!w->need_write) { +- closure_return_with_destructor(cl, journal_write_unlock); +- return; +- } else if (journal_full(&c->journal)) { +- journal_reclaim(c); +- 
spin_unlock(&c->journal.lock); +- +- btree_flush_write(c); +- continue_at(cl, journal_write, bch_journal_wq); +- return; +- } +- +- c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); +- +- w->data->btree_level = c->root->level; +- +- bkey_copy(&w->data->btree_root, &c->root->key); +- bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); +- +- w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; +- w->data->magic = jset_magic(&ca->sb); +- w->data->version = BCACHE_JSET_VERSION; +- w->data->last_seq = last_seq(&c->journal); +- w->data->csum = csum_set(w->data); +- + for (i = 0; i < KEY_PTRS(k); i++) { + ca = c->cache; + bio = &ca->journal.bio; +@@ -793,7 +777,6 @@ static void journal_write_unlocked(struct closure *cl) + + ca->journal.seq[ca->journal.cur_idx] = w->data->seq; + } +- + /* If KEY_PTRS(k) == 0, this jset gets lost in air */ + BUG_ON(i == 0); + +@@ -805,6 +788,71 @@ static void journal_write_unlocked(struct closure *cl) + + while ((bio = bio_list_pop(&list))) + closure_bio_submit(c, bio, cl); ++} ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static void __journal_nvdimm_write_unlocked(struct cache_set *c) ++ __releases(c->journal.lock) ++{ ++ struct journal_write *w = c->journal.cur; ++ struct cache *ca = c->cache; ++ unsigned int sectors; ++ ++ sectors = set_blocks(w->data, block_bytes(ca)) * ca->sb.block_size; ++ atomic_long_add(sectors, &ca->meta_sectors_written); ++ ++ memcpy_flushcache((void *)c->journal.key.ptr[0], w->data, sectors << 9); ++ ++ c->journal.key.ptr[0] += sectors << 9; ++ ca->journal.seq[ca->journal.cur_idx] = w->data->seq; ++ ++ atomic_dec_bug(&fifo_back(&c->journal.pin)); ++ bch_journal_next(&c->journal); ++ journal_reclaim(c); ++ ++ spin_unlock(&c->journal.lock); ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++static void journal_write_unlocked(struct closure *cl) ++{ ++ struct cache_set *c = container_of(cl, struct cache_set, journal.io); ++ struct cache *ca = c->cache; ++ struct journal_write *w = c->journal.cur; ++ ++ if (!w->need_write) { ++ closure_return_with_destructor(cl, journal_write_unlock); ++ return; ++ } else if (journal_full(&c->journal)) { ++ journal_reclaim(c); ++ spin_unlock(&c->journal.lock); ++ ++ btree_flush_write(c); ++ continue_at(cl, journal_write, bch_journal_wq); ++ return; ++ } ++ ++ c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); ++ ++ w->data->btree_level = c->root->level; ++ ++ bkey_copy(&w->data->btree_root, &c->root->key); ++ bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); ++ ++ w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; ++ w->data->magic = jset_magic(&ca->sb); ++ w->data->version = BCACHE_JSET_VERSION; ++ w->data->last_seq = last_seq(&c->journal); ++ w->data->csum = csum_set(w->data); ++ ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ __journal_write_unlocked(c); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ __journal_nvdimm_write_unlocked(c); ++#endif + + continue_at(cl, journal_write_done, NULL); + } +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index a27fa65d8832..45b69ddc9cfa 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1679,7 +1679,7 @@ void bch_cache_set_release(struct kobject *kobj) + static void cache_set_free(struct closure *cl) + { + struct cache_set *c = container_of(cl, struct cache_set, cl); +- struct cache *ca; ++ struct cache *ca = c->cache; + + debugfs_remove(c->debug); + +@@ -1691,7 +1691,6 @@ static void cache_set_free(struct closure *cl) + bch_bset_sort_state_free(&c->sort); + 
free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb))); + +- ca = c->cache; + if (ca) { + ca->set = NULL; + c->cache = NULL; +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch b/for-next/nvmpg-bcache-journaling-v13/old/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch new file mode 100644 index 0000000..77a4ae4 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch @@ -0,0 +1,181 @@ +From 5b9accf31b16f6cc138754d8e77982092094a4ee Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:54:12 +0800 +Subject: [PATCH 11/12] bcache: read jset from NVDIMM pages for journal replay + +This patch implements two methods to read jset from media for journal +replay, +- __jnl_rd_bkt() for block device + This is the legacy method to read jset via block device interface. +- __jnl_rd_nvm_bkt() for NVDIMM + This is the method to read jset from NVDIMM memory interface, a.k.a + memcopy() from NVDIMM pages to DRAM pages. + +If BCH_FEATURE_INCOMPAT_NVDIMM_META is set in incompat feature set, +during running cache set, journal_read_bucket() will read the journal +content from NVDIMM by __jnl_rd_nvm_bkt(). The linear addresses of +NVDIMM pages to read jset are stored in sb.d[SB_JOURNAL_BUCKETS], which +were initialized and maintained in previous runs of the cache set. + +A thing should be noticed is, when bch_journal_read() is called, the +linear address of NVDIMM pages is not loaded and initialized yet, it +is necessary to call __bch_journal_nvdimm_init() before reading the jset +from NVDIMM pages. + +The code comments added in journal_read_bucket() is noticed by kernel +test robot and Dan Carpenter, it explains why it is safe to only check +!bch_has_feature_nvdimm_meta() condition in the if() statement when +CONFIG_BCACHE_NVM_PAGES is not configured. To avoid confusion from the +bogus warning message from static checking tool. 
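
Taken together with the previous patch, the two directions reduce to a pair of memory copies over the DAX mapping; only the store side needs memcpy_flushcache() so the jset reaches the persistence domain, while replay is a plain read. A simplified contrast, condensed from the code in this and the previous patch:

/* Store path, __journal_nvdimm_write_unlocked() (previous patch):
 * c->journal.key.ptr[0] already holds the linear NVDIMM address. */
memcpy_flushcache((void *)c->journal.key.ptr[0], w->data, sectors << 9);

/* Load path, __jnl_rd_nvm_bkt() (this patch): read the jset back into
 * the DRAM buffer journal.w[0].data for replay. */
memcpy(data, bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9),
       len << 9);
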
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reported-by: kernel test robot <lkp@intel.com> +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 88 ++++++++++++++++++++++++++++++------- + 1 file changed, 71 insertions(+), 17 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 7d5c5ed18890..902992be9191 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -34,18 +34,60 @@ static void journal_read_endio(struct bio *bio) + closure_put(cl); + } + ++static struct jset *__jnl_rd_bkt(struct cache *ca, unsigned int bkt_idx, ++ unsigned int len, unsigned int offset, ++ struct closure *cl) ++{ ++ sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bkt_idx]); ++ struct bio *bio = &ca->journal.bio; ++ struct jset *data = ca->set->journal.w[0].data; ++ ++ bio_reset(bio); ++ bio->bi_iter.bi_sector = bucket + offset; ++ bio_set_dev(bio, ca->bdev); ++ bio->bi_iter.bi_size = len << 9; ++ ++ bio->bi_end_io = journal_read_endio; ++ bio->bi_private = cl; ++ bio_set_op_attrs(bio, REQ_OP_READ, 0); ++ bch_bio_map(bio, data); ++ ++ closure_bio_submit(ca->set, bio, cl); ++ closure_sync(cl); ++ ++ /* Indeed journal.w[0].data */ ++ return data; ++} ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static struct jset *__jnl_rd_nvm_bkt(struct cache *ca, unsigned int bkt_idx, ++ unsigned int len, unsigned int offset) ++{ ++ void *jset_addr; ++ struct jset *data; ++ ++ jset_addr = bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9); ++ data = ca->set->journal.w[0].data; ++ ++ memcpy(data, jset_addr, len << 9); ++ ++ /* Indeed journal.w[0].data */ ++ return data; ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ + static int journal_read_bucket(struct cache *ca, struct list_head *list, + unsigned int bucket_index) + { + struct journal_device *ja = &ca->journal; +- struct bio *bio = &ja->bio; + + struct journal_replay *i; +- struct jset *j, *data = ca->set->journal.w[0].data; ++ struct jset *j; + struct closure cl; + unsigned int len, left, offset = 0; + int ret = 0; +- sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); + + closure_init_stack(&cl); + +@@ -55,26 +97,27 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, + reread: left = ca->sb.bucket_size - offset; + len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); + +- bio_reset(bio); +- bio->bi_iter.bi_sector = bucket + offset; +- bio_set_dev(bio, ca->bdev); +- bio->bi_iter.bi_size = len << 9; +- +- bio->bi_end_io = journal_read_endio; +- bio->bi_private = &cl; +- bio_set_op_attrs(bio, REQ_OP_READ, 0); +- bch_bio_map(bio, data); +- +- closure_bio_submit(ca->set, bio, &cl); +- closure_sync(&cl); ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ j = __jnl_rd_bkt(ca, bucket_index, len, offset, &cl); ++ /* ++ * If CONFIG_BCACHE_NVM_PAGES is not defined, the feature bit ++ * BCH_FEATURE_INCOMPAT_NVDIMM_META won't in incompatible ++ * support feature set, a cache device format with feature bit ++ * BCH_FEATURE_INCOMPAT_NVDIMM_META will fail much earlier in ++ * read_super() by bch_has_unknown_incompat_features(). ++ * Therefore when CONFIG_BCACHE_NVM_PAGES is not define, it is ++ * safe to ignore the bch_has_feature_nvdimm_meta() condition. 
++ */ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ j = __jnl_rd_nvm_bkt(ca, bucket_index, len, offset); ++#endif + + /* This function could be simpler now since we no longer write + * journal entries that overlap bucket boundaries; this means + * the start of a bucket will always have a valid journal entry + * if it has any journal entries at all. + */ +- +- j = data; + while (len) { + struct list_head *where; + size_t blocks, bytes = set_bytes(j); +@@ -170,6 +213,8 @@ reread: left = ca->sb.bucket_size - offset; + return ret; + } + ++static int __bch_journal_nvdimm_init(struct cache *ca); ++ + int bch_journal_read(struct cache_set *c, struct list_head *list) + { + #define read_bucket(b) \ +@@ -188,6 +233,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) + unsigned int i, l, r, m; + uint64_t seq; + ++ /* ++ * Linear addresses of NVDIMM pages for journaling is not ++ * initialized yet, do it before read jset from NVDIMM pages. ++ */ ++ if (bch_has_feature_nvdimm_meta(&ca->sb)) { ++ if (__bch_journal_nvdimm_init(ca) < 0) ++ return -ENXIO; ++ } ++ + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch b/for-next/nvmpg-bcache-journaling-v13/old/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch new file mode 100644 index 0000000..0ffc9a7 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch @@ -0,0 +1,84 @@ +From 55b8876f5fc3a3f097bca7f2b518e0dccd112905 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:55:25 +0800 +Subject: [PATCH 12/12] bcache: add sysfs interface register_nvdimm_meta to + register NVDIMM meta device + +This patch adds a sysfs interface register_nvdimm_meta to register +NVDIMM meta device. The sysfs interface file only shows up when +CONFIG_BCACHE_NVM_PAGES=y. 
Then a NVDIMM name space formatted by +bcache-tools can be registered into bcache by e.g., + echo /dev/pmem0 > /sys/fs/bcache/register_nvdimm_meta + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/super.c | 29 +++++++++++++++++++++++++++++ + 1 file changed, 29 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 45b69ddc9cfa..2b9cde44879b 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2405,10 +2405,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, size_t size); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++static ssize_t register_nvdimm_meta(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, size_t size); ++#endif + + kobj_attribute_write(register, register_bcache); + kobj_attribute_write(register_quiet, register_bcache); + kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++kobj_attribute_write(register_nvdimm_meta, register_nvdimm_meta); ++#endif + + static bool bch_is_open_backing(dev_t dev) + { +@@ -2522,6 +2530,24 @@ static void register_device_async(struct async_reg_args *args) + queue_delayed_work(system_wq, &args->reg_work, 10); + } + ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++static ssize_t register_nvdimm_meta(struct kobject *k, struct kobj_attribute *attr, ++ const char *buffer, size_t size) ++{ ++ ssize_t ret = size; ++ ++ struct bch_nvmpg_ns *ns = bch_register_namespace(buffer); ++ ++ if (IS_ERR(ns)) { ++ pr_err("register nvdimm namespace %s for meta device failed.\n", ++ buffer); ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++#endif ++ + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size) + { +@@ -2864,6 +2890,9 @@ static int __init bcache_init(void) + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ &ksysfs_register_nvdimm_meta.attr, ++#endif + &ksysfs_pendings_cleanup.attr, + NULL + }; +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0000-cover-letter.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0000-cover-letter.patch new file mode 100644 index 0000000..fa696e6 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0000-cover-letter.patch @@ -0,0 +1,125 @@ +From e1f37c78f682ca8d7d0dee51ee8a0ee884f92df5 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 12 Dec 2021 23:13:09 +0800 +Subject: [PATCH v13 00/12] bcache for 5.17: enable NVDIMM for bcache journal + +Hi Jens, + +This is the v12 effort the enabling NVDIMM for bcache journal, the code +is under testing for months and quite stable now. Please consider to +take them for Linux v5.17 merge window. + +All current code logic and on-media format are consistent with previous +v12 series. The major difference from v12 series include, +- more typos in code comments and commit logs are fixed. +- add kernel message to indicate only first range is used currently if + the NVDIMM namespace has multiple mapping ranges. 
+- do not export the nvm-pages allocator APIs; it is unnecessary since
+  currently only bcache uses them.
+
+Now all previously bcache related UAPI headers are moved into the bcache
+private code directory; no global headers are exported to either kernel
+or user space source code.
+
+Bcache uses the nvm-pages allocator to allocate pages from a NVDIMM
+namespace for its journaling space. The nvm-pages allocator is a
+buddy-like allocator, which allocates power-of-2 sized page ranges from
+the NVDIMM namespace. The user space tool 'bcache' has a newly added
+'-M' option to format a NVDIMM namespace and register it via the sysfs
+interface as a bcache meta device. The nvm-pages allocator code does a
+DAX mapping to map the whole namespace into the system's memory address
+range, and allocates the pages to requesters like a typical buddy
+allocator does. The major difference is that the nvm-pages allocator
+maintains the pages allocated to each requester in an allocation list
+which is stored on NVDIMM too. The allocation list of each requester is
+tracked by a pre-defined UUID; all the pages tracked in all allocation
+lists are treated as allocated busy pages and won't be released back
+into the buddy system after the system reboots.
+
+The bcache journal code may request a block of power-of-2 size pages
+from the nvm-pages allocator, normally a contiguous range of 256MB or
+512MB. During meta data journaling, the in-memory jsets are copied into
+the calculated NVDIMM pages location by the kernel memcpy routine. So
+the journaling I/Os won't go to the block device (e.g. SSD) anymore;
+the writes and reads of journal jsets happen on NVDIMM.
+
+Intel developers Jianpeng Ma and Qiaowei Ren composed the initial code
+of the nvm-pages allocator, the related patches are,
+- bcache: initialize the nvm-pages allocator
+- bcache: initialization of the buddy
+- bcache: bch_nvmpg_alloc_pages() of the buddy
+- bcache: bch_nvmpg_free_pages() of the buddy allocator
+- bcache: get recs list head for allocated pages by specific uuid
+All the code depends on the Linux libnvdimm and dax drivers; the bcache
+nvm-pages allocator can be treated as a user of these two drivers.
+
+I modify the bcache code to recognize the nvm meta device feature,
+initialize the journal on NVDIMM, and do journal I/Os on NVDIMM in the
+following patches,
+- bcache: add initial data structures for nvm pages
+- bcache: use bucket index to set GC_MARK_METADATA for journal buckets
+  in bch_btree_gc_finish()
+- bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into incompat feature set
+- bcache: initialize bcache journal for NVDIMM meta device
+- bcache: support storing bcache journal into NVDIMM meta device
+- bcache: read jset from NVDIMM pages for journal replay
+- bcache: add sysfs interface register_nvdimm_meta to register NVDIMM
+  meta device
+
+All the code is EXPERIMENTAL; it won't be enabled by default until we
+feel the NVDIMM support is complete and stable. The current code has
+been tested internally for months, and we don't observe any issue during
+all tests with or without the configuration enabled.
+
+Please consider picking this series for the Linux v5.17 merge window.
+If any issue is detected, we will respond in time and fix it ASAP.
+
+Thank you in advance.
+ +Coly Li + +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Ying Huang <ying.huang@intel.com> +--- + +Coly Li (7): + bcache: add initial data structures for nvm pages + bcache: use bucket index to set GC_MARK_METADATA for journal buckets + in bch_btree_gc_finish() + bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into incompat feature set + bcache: initialize bcache journal for NVDIMM meta device + bcache: support storing bcache journal into NVDIMM meta device + bcache: read jset from NVDIMM pages for journal replay + bcache: add sysfs interface register_nvdimm_meta to register NVDIMM + meta device + +Jianpeng Ma (5): + bcache: initialize the nvm pages allocator + bcache: initialization of the buddy + bcache: bch_nvmpg_alloc_pages() of the buddy + bcache: bch_nvmpg_free_pages() of the buddy allocator + bcache: get recs list head for allocated pages by specific uuid + + drivers/md/bcache/Kconfig | 10 + + drivers/md/bcache/Makefile | 1 + + drivers/md/bcache/btree.c | 6 +- + drivers/md/bcache/features.h | 9 + + drivers/md/bcache/journal.c | 321 +++++++++-- + drivers/md/bcache/journal.h | 2 +- + drivers/md/bcache/nvmpg.c | 931 +++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 128 +++++ + drivers/md/bcache/nvmpg_format.h | 253 +++++++++ + drivers/md/bcache/super.c | 53 +- + 10 files changed, 1646 insertions(+), 68 deletions(-) + create mode 100644 drivers/md/bcache/nvmpg.c + create mode 100644 drivers/md/bcache/nvmpg.h + create mode 100644 drivers/md/bcache/nvmpg_format.h + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0001-bcache-add-initial-data-structures-for-nvm-pages.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0001-bcache-add-initial-data-structures-for-nvm-pages.patch new file mode 100644 index 0000000..14b3695 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0001-bcache-add-initial-data-structures-for-nvm-pages.patch @@ -0,0 +1,343 @@ +From 0ecd02239e1e7fc12115fda644810ee88bf26dff Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 26 Jul 2021 00:26:28 +0800 +Subject: [PATCH v13 01/12] bcache: add initial data structures for nvm pages + +This patch initializes the prototype data structures for nvm pages +allocator, + +- struct bch_nvmpg_sb + This is the super block allocated on each nvdimm namespace for the nvm +pages allocator. A nvdimm pages allocator set may have multiple name- +spaces, bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this +namespace belongs to. + +- struct bch_nvmpg_header + This is a table for all heads of all allocation record lists. An allo- +cation record list traces all page(s) allocated from nvdimm namespace(s) +to a specific requester (identified by uuid). After system reboot, a +requester can retrieve all previously allocated nvdimm pages from its +record list by a pre-defined uuid. + +- struct bch_nvmpg_head + This is a head of an allocation record list. Each nvdimm pages +requester (typically it's a driver) has and only has one allocation +record list, and an allocated nvdimm page only belongs to a specific +allocation record list. Member uuid[] will be set as the requester's +uuid, e.g. for bcache it is the cache set uuid. Member label is not +mandatory, it is a human-readable string for debug purpose. 
The nvm +offset format pointers recs_offset[] point to the location of actual +allocator record lists on each namespace of the nvdimm pages allocator +set. Each per namespace record list is represented by the following +struct bch_nvmpg_recs. + +- struct bch_nvmpg_recs + This structure represents a requester's allocation record list. Member +uuid is same value as the uuid of its corresponding struct +bch_nvmpg_head. Member recs[] is a table of struct bch_pgalloc_rec +objects to trace all allocated nvmdimm pages. If the table recs[] is +full, the nvmpg format offset is a pointer points to the next struct +bch_nvmpg_recs object, nvm pages allocator will look for available free +allocation record there. All the linked struct bch_nvmpg_recs objects +compose a requester's allocation record list which is headed by the +above struct bch_nvmpg_head. + +- struct bch_nvmpg_rec + This structure records a range of allocated nvdimm pages. Member pgoff +is offset in unit of page size of this allocation range. Member order +indicates size of the allocation range by (1 << order) in unit of page +size. Because the nvdimm pages allocator set may have multiple nvdimm +namespaces, member ns_id is used to identify which namespace the pgoff +belongs to. + - Bits 0 - 51: pgoff - is pages offset of the allocated pages. + - Bits 52 - 57: order - allocated size in page_size * order-of-2 + - Bits 58 - 60: ns_id - identify which namespace the pages stays on + - Bits 61 - 63: reserved. +Since each of the allocated nvm pages are power of 2, using 6 bits to +represent allocated size can have (1<<(1<<64) - 1) * PAGE_SIZE maximum +value. It can be a 76 bits width range size in byte for 4KB page size, +which is large enough currently. + +All the structure members having _offset suffix are in a special format. +E.g. bch_nvmpg_sb.{sb_offset, pages_offset, set_header_offset}, +bch_nvmpg_head.recs_offset, bch_nvmpg_recs.{head_offset, next_offset}, +the offset value is 64bit, the most significant 3 bits are used to +identify which namespace this offset belongs to, and the rested 61 bits +are actual offset inside the namespace. Following patches will have +helper routines to do the conversion between memory pointer and offset. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Ying Huang <ying.huang@intel.com> +--- + drivers/md/bcache/nvmpg_format.h | 253 +++++++++++++++++++++++++++++++ + 1 file changed, 253 insertions(+) + create mode 100644 drivers/md/bcache/nvmpg_format.h + +diff --git a/drivers/md/bcache/nvmpg_format.h b/drivers/md/bcache/nvmpg_format.h +new file mode 100644 +index 000000000000..414bcafa31ee +--- /dev/null ++++ b/drivers/md/bcache/nvmpg_format.h +@@ -0,0 +1,253 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++ ++#ifndef _NVMPG_FORMAT_H ++#define _NVMPG_FORMAT_H ++ ++/* ++ * Bcache on NVDIMM data structures ++ */ ++ ++/* ++ * - struct bch_nvmpg_sb ++ * This is the super block allocated on each nvdimm namespace for the nvm ++ * pages allocator. A nvdimm pages allocator set may have multiple namespaces, ++ * bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this name space ++ * belongs to. ++ * ++ * - struct bch_nvmpg_header ++ * This is a table for all heads of all allocation record lists. 
An allo- ++ * cation record list traces all page(s) allocated from nvdimm namespace(s) to ++ * a specific requester (identified by uuid). After system reboot, a requester ++ * can retrieve all previously allocated nvdimm pages from its record list by a ++ * pre-defined uuid. ++ * ++ * - struct bch_nvmpg_head ++ * This is a head of an allocation record list. Each nvdimm pages requester ++ * (typically it's a driver) has and only has one allocation record list, and ++ * an allocated nvdimm page only bedlones to a specific allocation record list. ++ * Member uuid[] will be set as the requester's uuid, e.g. for bcache it is the ++ * cache set uuid. Member label is not mandatory, it is a human-readable string ++ * for debug purpose. The nvm offset format pointers recs_offset[] point to the ++ * location of actual allocator record lists on each name space of the nvdimm ++ * pages allocator set. Each per name space record list is represented by the ++ * following struct bch_nvmpg_recs. ++ * ++ * - struct bch_nvmpg_recs ++ * This structure represents a requester's allocation record list. Member uuid ++ * is same value as the uuid of its corresponding struct bch_nvmpg_head. Member ++ * recs[] is a table of struct bch_pgalloc_rec objects to trace all allocated ++ * nvmdimm pages. If the table recs[] is full, the nvmpg format offset is a ++ * pointer points to the next struct bch_nvmpg_recs object, nvm pages allocator ++ * will look for available free allocation record there. All the linked ++ * struct bch_nvmpg_recs objects compose a requester's allocation record list ++ * which is headed by the above struct bch_nvmpg_head. ++ * ++ * - struct bch_nvmpg_rec ++ * This structure records a range of allocated nvdimm pages. Member pgoff is ++ * offset in unit of page size of this allocation range. Member order indicates ++ * size of the allocation range by (1 << order) in unit of page size. Because ++ * the nvdimm pages allocator set may have multiple nvdimm name spaces, member ++ * ns_id is used to identify which name space the pgoff belongs to. ++ * ++ * All allocation record lists are stored on the first initialized nvdimm name- ++ * space (ns_id 0). The meta data default layout of nvm pages allocator on ++ * namespace 0 is, ++ * ++ * 0 +---------------------------------+ ++ * | | ++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET ++ * | bch_nvmpg_sb | ++ * 8KB +---------------------------------+ <-- BCH_NVMPG_RECLIST_HEAD_OFFSET ++ * | bch_nvmpg_header | ++ * | | ++ * 16KB +---------------------------------+ <-- BCH_NVMPG_SYSRECS_OFFSET ++ * | bch_nvmpg_recs | ++ * | (nvm pages internal usage) | ++ * 24KB +---------------------------------+ ++ * | | ++ * | | ++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START ++ * | allocable nvm pages | ++ * | for buddy allocator | ++ * end +---------------------------------+ ++ * ++ * ++ * ++ * Meta data default layout on rested nvdimm namespaces, ++ * ++ * 0 +---------------------------------+ ++ * | | ++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET ++ * | bch_nvmpg_sb | ++ * 8KB +---------------------------------+ ++ * | | ++ * | | ++ * | | ++ * | | ++ * | | ++ * | | ++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START ++ * | allocable nvm pages | ++ * | for buddy allocator | ++ * end +---------------------------------+ ++ * ++ * ++ * - The nvmpg offset format pointer ++ * All member names ending with _offset in this header are nvmpg offset ++ * format pointer. 
The offset format is, ++ * [highest 3 bits: ns_id] ++ * [rested 61 bits: offset in No. ns_id namespace] ++ * ++ * The above offset is byte unit, the procedure to reference a nvmpg offset ++ * format pointer is, ++ * 1) Identify the namespace related in-memory structure by ns_id from the ++ * highest 3 bits of offset value. ++ * 2) Get the DAX mapping base address from the in-memory structure. ++ * 3) Calculate the actual memory address on nvdimm by plusing the DAX base ++ * address with offset value in rested low 61 bits. ++ * All related in-memory structure and conversion routines don't belong to ++ * user space api, they are defined by nvm-pages allocator code in ++ * drivers/md/bcache/nvm-pages.{c,h} ++ * ++ */ ++ ++#include <linux/types.h> ++ ++/* In sectors */ ++#define BCH_NVMPG_SB_OFFSET 4096 ++#define BCH_NVMPG_START (16 << 20) ++ ++#define BCH_NVMPG_LBL_SIZE 32 ++#define BCH_NVMPG_NS_MAX 8 ++ ++#define BCH_NVMPG_RECLIST_HEAD_OFFSET (8<<10) ++#define BCH_NVMPG_SYSRECS_OFFSET (16<<10) ++ ++#define BCH_NVMPG_SB_VERSION 0 ++#define BCH_NVMPG_SB_VERSION_MAX 0 ++ ++static const __u8 bch_nvmpg_magic[] = { ++ 0x17, 0xbd, 0x53, 0x7f, 0x1b, 0x23, 0xd6, 0x83, ++ 0x46, 0xa4, 0xf8, 0x28, 0x17, 0xda, 0xec, 0xa9 }; ++static const __u8 bch_nvmpg_recs_magic[] = { ++ 0x39, 0x25, 0x3f, 0xf7, 0x27, 0x17, 0xd0, 0xb9, ++ 0x10, 0xe6, 0xd2, 0xda, 0x38, 0x68, 0x26, 0xae }; ++ ++/* takes 64bit width */ ++struct bch_nvmpg_rec { ++ union { ++ struct { ++ __u64 pgoff:52; ++ __u64 order:6; ++ __u64 ns_id:3; ++ __u64 reserved:3; ++ }; ++ __u64 _v; ++ }; ++}; ++ ++struct bch_nvmpg_recs { ++ union { ++ struct { ++ /* ++ * A nvmpg offset format pointer to ++ * struct bch_nvmpg_head ++ */ ++ __u64 head_offset; ++ /* ++ * A nvmpg offset format pointer to ++ * struct bch_nvm_pgalloc_recs which contains ++ * the next recs[] array. ++ */ ++ __u64 next_offset; ++ __u8 magic[16]; ++ __u8 uuid[16]; ++ __u32 size; ++ __u32 used; ++ __u64 _pad[4]; ++ struct bch_nvmpg_rec recs[]; ++ }; ++ __u8 pad[8192]; ++ }; ++}; ++ ++#define BCH_NVMPG_MAX_RECS \ ++ ((sizeof(struct bch_nvmpg_recs) - \ ++ offsetof(struct bch_nvmpg_recs, recs)) / \ ++ sizeof(struct bch_nvmpg_rec)) ++ ++#define BCH_NVMPG_HD_STAT_FREE 0x0 ++#define BCH_NVMPG_HD_STAT_ALLOC 0x1 ++struct bch_nvmpg_head { ++ __u8 uuid[16]; ++ __u8 label[BCH_NVMPG_LBL_SIZE]; ++ __u32 state; ++ __u32 flags; ++ /* ++ * Array of offset values from the nvmpg offset format ++ * pointers, each of the pointer points to a per-namespace ++ * struct bch_nvmpg_recs. 
++ */ ++ __u64 recs_offset[BCH_NVMPG_NS_MAX]; ++}; ++ ++/* heads[0] is always for nvm_pages internal usage */ ++struct bch_nvmpg_set_header { ++ union { ++ struct { ++ __u32 size; ++ __u32 used; ++ __u64 _pad[4]; ++ struct bch_nvmpg_head heads[]; ++ }; ++ __u8 pad[8192]; ++ }; ++}; ++ ++#define BCH_NVMPG_MAX_HEADS \ ++ ((sizeof(struct bch_nvmpg_set_header) - \ ++ offsetof(struct bch_nvmpg_set_header, heads)) / \ ++ sizeof(struct bch_nvmpg_head)) ++ ++/* The on-media bit order is local CPU order */ ++struct bch_nvmpg_sb { ++ __u64 csum; ++ __u64 sb_offset; ++ __u64 ns_start; ++ __u64 version; ++ __u8 magic[16]; ++ __u8 uuid[16]; ++ __u32 page_size; ++ __u32 total_ns; ++ __u32 this_ns; ++ union { ++ __u8 set_uuid[16]; ++ __u64 set_magic; ++ }; ++ ++ __u64 flags; ++ __u64 seq; ++ ++ __u64 feature_compat; ++ __u64 feature_incompat; ++ __u64 feature_ro_compat; ++ ++ /* For allocable nvm pages from buddy systems */ ++ __u64 pages_offset; ++ __u64 pages_total; ++ ++ __u64 pad[8]; ++ ++ /* ++ * A nvmpg offset format pointer, it points ++ * to struct bch_nvmpg_set_header which is ++ * stored only on the first name space. ++ */ ++ __u64 set_header_offset; ++ ++ /* Just for csum_set() */ ++ __u32 keys; ++ __u64 d[0]; ++}; ++ ++#endif /* _NVMPG_FORMAT_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0002-bcache-initialize-the-nvm-pages-allocator.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0002-bcache-initialize-the-nvm-pages-allocator.patch new file mode 100644 index 0000000..54243a6 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0002-bcache-initialize-the-nvm-pages-allocator.patch @@ -0,0 +1,542 @@ +From e75f8de4ca87db06507e173d795f42d1c98468d4 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Mon, 26 Jul 2021 10:33:30 +0800 +Subject: [PATCH v13 02/12] bcache: initialize the nvm pages allocator + +This patch define the prototype data structures in memory and +initializes the nvm pages allocator. + +The nvm address space which is managed by this allocator can consist of +many nvm namespaces, and some namespaces can compose into one nvm set, +like cache set. For this initial implementation, only one set can be +supported. + +The users of this nvm pages allocator need to call register_namespace() +to register the nvdimm device (like /dev/pmemX) into this allocator as +the instance of struct nvm_namespace. + +Reported-by: Randy Dunlap <rdunlap@infradead.org> +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/Kconfig | 10 ++ + drivers/md/bcache/Makefile | 1 + + drivers/md/bcache/nvmpg.c | 340 +++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 97 +++++++++++ + drivers/md/bcache/super.c | 3 + + 5 files changed, 451 insertions(+) + create mode 100644 drivers/md/bcache/nvmpg.c + create mode 100644 drivers/md/bcache/nvmpg.h + +diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig +index cf3e8096942a..4a7c13e882bb 100644 +--- a/drivers/md/bcache/Kconfig ++++ b/drivers/md/bcache/Kconfig +@@ -36,3 +36,13 @@ config BCACHE_ASYNC_REGISTRATION + device path into this file will returns immediately and the real + registration work is handled in kernel work queue in asynchronous + way. 
++ ++config BCACHE_NVM_PAGES ++ bool "NVDIMM support for bcache (EXPERIMENTAL)" ++ depends on BCACHE ++ depends on 64BIT ++ depends on LIBNVDIMM ++ depends on DAX ++ help ++ Allocate/release NV-memory pages for bcache and provide allocated pages ++ for each requestor after system reboot. +diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile +index 5b87e59676b8..276b33be5ad5 100644 +--- a/drivers/md/bcache/Makefile ++++ b/drivers/md/bcache/Makefile +@@ -5,3 +5,4 @@ obj-$(CONFIG_BCACHE) += bcache.o + bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ + io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o features.o ++bcache-$(CONFIG_BCACHE_NVM_PAGES) += nvmpg.o +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +new file mode 100644 +index 000000000000..b654bbbda03e +--- /dev/null ++++ b/drivers/md/bcache/nvmpg.c +@@ -0,0 +1,340 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Nvdimm page-buddy allocator ++ * ++ * Copyright (c) 2021, Intel Corporation. ++ * Copyright (c) 2021, Qiaowei Ren <qiaowei.ren@intel.com>. ++ * Copyright (c) 2021, Jianpeng Ma <jianpeng.ma@intel.com>. ++ */ ++ ++#include "bcache.h" ++#include "nvmpg.h" ++ ++#include <linux/slab.h> ++#include <linux/list.h> ++#include <linux/mutex.h> ++#include <linux/dax.h> ++#include <linux/pfn_t.h> ++#include <linux/libnvdimm.h> ++#include <linux/mm_types.h> ++#include <linux/err.h> ++#include <linux/pagemap.h> ++#include <linux/bitmap.h> ++#include <linux/blkdev.h> ++ ++struct bch_nvmpg_set *global_nvmpg_set; ++ ++void *bch_nvmpg_offset_to_ptr(unsigned long offset) ++{ ++ int ns_id = BCH_NVMPG_GET_NS_ID(offset); ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[ns_id]; ++ ++ if (offset == 0) ++ return NULL; ++ ++ ns_id = BCH_NVMPG_GET_NS_ID(offset); ++ ns = global_nvmpg_set->ns_tbl[ns_id]; ++ ++ if (ns) ++ return (void *)(ns->base_addr + BCH_NVMPG_GET_OFFSET(offset)); ++ ++ pr_err("Invalid ns_id %u\n", ns_id); ++ return NULL; ++} ++ ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) ++{ ++ int ns_id = ns->ns_id; ++ unsigned long offset = (unsigned long)(ptr - ns->base_addr); ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ ++static void release_ns_tbl(struct bch_nvmpg_set *set) ++{ ++ int i; ++ struct bch_nvmpg_ns *ns; ++ ++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { ++ ns = set->ns_tbl[i]; ++ if (ns) { ++ fs_put_dax(ns->dax_dev); ++ blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); ++ set->ns_tbl[i] = NULL; ++ set->attached_ns--; ++ kfree(ns); ++ } ++ } ++ ++ if (set->attached_ns) ++ pr_err("unexpected attached_ns: %u\n", set->attached_ns); ++} ++ ++static void release_nvmpg_set(struct bch_nvmpg_set *set) ++{ ++ release_ns_tbl(set); ++ kfree(set); ++} ++ ++/* Namespace 0 contains all meta data of the nvmpg allocation set */ ++static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) ++{ ++ struct bch_nvmpg_set_header *set_header; ++ ++ if (ns->ns_id != 0) { ++ pr_err("unexpected ns_id %u for first nvmpg namespace.\n", ++ ns->ns_id); ++ return -EINVAL; ++ } ++ ++ set_header = bch_nvmpg_offset_to_ptr(ns->sb->set_header_offset); ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ global_nvmpg_set->set_header = set_header; ++ global_nvmpg_set->heads_size = set_header->size; ++ global_nvmpg_set->heads_used = set_header->used; ++ mutex_unlock(&global_nvmpg_set->lock); ++ ++ return 0; ++} ++ ++static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) ++{ ++ struct bch_nvmpg_sb *sb = ns->sb; ++ int rc = 0; ++ ++ 
mutex_lock(&global_nvmpg_set->lock); ++ ++ if (global_nvmpg_set->ns_tbl[sb->this_ns]) { ++ pr_err("ns_id %u already attached.\n", ns->ns_id); ++ rc = -EEXIST; ++ goto unlock; ++ } ++ ++ if (ns->ns_id != 0) { ++ pr_err("unexpected ns_id %u for first namespace.\n", ns->ns_id); ++ rc = -EINVAL; ++ goto unlock; ++ } ++ ++ if (global_nvmpg_set->attached_ns > 0) { ++ pr_err("multiple namespace attaching not supported yet\n"); ++ rc = -EOPNOTSUPP; ++ goto unlock; ++ } ++ ++ if ((global_nvmpg_set->attached_ns + 1) > sb->total_ns) { ++ pr_err("namespace counters error: attached %u > total %u\n", ++ global_nvmpg_set->attached_ns, ++ global_nvmpg_set->total_ns); ++ rc = -EINVAL; ++ goto unlock; ++ } ++ ++ memcpy(global_nvmpg_set->set_uuid, sb->set_uuid, 16); ++ global_nvmpg_set->ns_tbl[sb->this_ns] = ns; ++ global_nvmpg_set->attached_ns++; ++ global_nvmpg_set->total_ns = sb->total_ns; ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++ return rc; ++} ++ ++static int read_nvdimm_meta_super(struct block_device *bdev, ++ struct bch_nvmpg_ns *ns) ++{ ++ struct page *page; ++ struct bch_nvmpg_sb *sb; ++ uint64_t expected_csum = 0; ++ int r; ++ ++ page = read_cache_page_gfp(bdev->bd_inode->i_mapping, ++ BCH_NVMPG_SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); ++ ++ if (IS_ERR(page)) ++ return -EIO; ++ ++ sb = (struct bch_nvmpg_sb *) ++ (page_address(page) + offset_in_page(BCH_NVMPG_SB_OFFSET)); ++ ++ r = -EINVAL; ++ expected_csum = csum_set(sb); ++ if (expected_csum != sb->csum) { ++ pr_info("csum is not match with expected one\n"); ++ goto put_page; ++ } ++ ++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) { ++ pr_info("invalid bch_nvmpg_magic\n"); ++ goto put_page; ++ } ++ ++ if (sb->sb_offset != ++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_SB_OFFSET)) { ++ pr_info("invalid superblock offset 0x%llx\n", sb->sb_offset); ++ goto put_page; ++ } ++ ++ r = -EOPNOTSUPP; ++ if (sb->total_ns != 1) { ++ pr_info("multiple name space not supported yet.\n"); ++ goto put_page; ++ } ++ ++ ++ r = 0; ++ /* Necessary for DAX mapping */ ++ ns->page_size = sb->page_size; ++ ns->pages_total = sb->pages_total; ++ ++put_page: ++ put_page(page); ++ return r; ++} ++ ++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) ++{ ++ struct bch_nvmpg_ns *ns = NULL; ++ struct bch_nvmpg_sb *sb = NULL; ++ char buf[BDEVNAME_SIZE]; ++ struct block_device *bdev; ++ pgoff_t pgoff; ++ int id, err; ++ char *path; ++ long dax_ret = 0; ++ ++ path = kstrndup(dev_path, 512, GFP_KERNEL); ++ if (!path) { ++ pr_err("kstrndup failed\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ bdev = blkdev_get_by_path(strim(path), ++ FMODE_READ|FMODE_WRITE|FMODE_EXCL, ++ global_nvmpg_set); ++ if (IS_ERR(bdev)) { ++ pr_err("get %s error: %ld\n", dev_path, PTR_ERR(bdev)); ++ kfree(path); ++ return ERR_PTR(PTR_ERR(bdev)); ++ } ++ ++ err = -ENOMEM; ++ ns = kzalloc(sizeof(struct bch_nvmpg_ns), GFP_KERNEL); ++ if (!ns) ++ goto bdput; ++ ++ err = -EIO; ++ if (read_nvdimm_meta_super(bdev, ns)) { ++ pr_err("%s read nvdimm meta super block failed.\n", ++ bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EOPNOTSUPP; ++ ns->dax_dev = fs_dax_get_by_bdev(bdev); ++ if (!ns->dax_dev) { ++ pr_err("can't get dax device by %s\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ if (!dax_supported(ns->dax_dev, bdev, ns->page_size, 0, ++ bdev_nr_sectors(bdev))) { ++ pr_err("%s don't support DAX\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EINVAL; ++ if (bdev_dax_pgoff(bdev, 0, ns->page_size, &pgoff)) { ++ pr_err("invalid offset of %s\n", bdevname(bdev, buf)); 
++ goto free_ns; ++ } ++ ++ err = -EINVAL; ++ id = dax_read_lock(); ++ dax_ret = dax_direct_access(ns->dax_dev, pgoff, ns->pages_total, ++ &ns->base_addr, &ns->start_pfn); ++ if (dax_ret <= 0) { ++ pr_err("dax_direct_access error\n"); ++ dax_read_unlock(id); ++ goto free_ns; ++ } ++ ++ if (dax_ret < ns->pages_total) { ++ pr_warn("currently first %ld pages (from %lu in total) are used\n", ++ dax_ret, ns->pages_total); ++ } ++ dax_read_unlock(id); ++ ++ sb = (struct bch_nvmpg_sb *)(ns->base_addr + BCH_NVMPG_SB_OFFSET); ++ ++ err = -EINVAL; ++ /* Check magic again to make sure DAX mapping is correct */ ++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) { ++ pr_err("invalid bch_nvmpg_magic after DAX mapping\n"); ++ goto free_ns; ++ } ++ ++ if ((global_nvmpg_set->attached_ns > 0) && ++ memcmp(sb->set_uuid, global_nvmpg_set->set_uuid, 16)) { ++ pr_err("set uuid does not match with ns_id %u\n", ns->ns_id); ++ goto free_ns; ++ } ++ ++ if (sb->set_header_offset != ++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_RECLIST_HEAD_OFFSET)) { ++ pr_err("Invalid header offset: this_ns %u, ns_id %llu, offset 0x%llx\n", ++ sb->this_ns, ++ BCH_NVMPG_GET_NS_ID(sb->set_header_offset), ++ BCH_NVMPG_GET_OFFSET(sb->set_header_offset)); ++ goto free_ns; ++ } ++ ++ ns->page_size = sb->page_size; ++ ns->pages_offset = sb->pages_offset; ++ ns->pages_total = sb->pages_total; ++ ns->sb = sb; ++ ns->free = 0; ++ ns->bdev = bdev; ++ ns->set = global_nvmpg_set; ++ ++ err = attach_nvmpg_set(ns); ++ if (err < 0) ++ goto free_ns; ++ ++ mutex_init(&ns->lock); ++ ++ err = init_nvmpg_set_header(ns); ++ if (err < 0) ++ goto free_ns; ++ ++ kfree(path); ++ return ns; ++ ++free_ns: ++ fs_put_dax(ns->dax_dev); ++ kfree(ns); ++bdput: ++ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); ++ kfree(path); ++ return ERR_PTR(err); ++} ++ ++int __init bch_nvmpg_init(void) ++{ ++ global_nvmpg_set = kzalloc(sizeof(*global_nvmpg_set), GFP_KERNEL); ++ if (!global_nvmpg_set) ++ return -ENOMEM; ++ ++ global_nvmpg_set->total_ns = 0; ++ mutex_init(&global_nvmpg_set->lock); ++ ++ pr_info("bcache nvm init\n"); ++ return 0; ++} ++ ++void bch_nvmpg_exit(void) ++{ ++ release_nvmpg_set(global_nvmpg_set); ++ pr_info("bcache nvm exit\n"); ++} +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +new file mode 100644 +index 000000000000..698c890b2d15 +--- /dev/null ++++ b/drivers/md/bcache/nvmpg.h +@@ -0,0 +1,97 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _BCACHE_NVM_PAGES_H ++#define _BCACHE_NVM_PAGES_H ++ ++#include <linux/libnvdimm.h> ++ ++#include "nvmpg_format.h" ++ ++/* ++ * Bcache NVDIMM in memory data structures ++ */ ++ ++/* ++ * The following three structures in memory records which page(s) allocated ++ * to which owner. After reboot from power failure, they will be initialized ++ * based on nvm pages superblock in NVDIMM device. ++ */ ++struct bch_nvmpg_ns { ++ struct bch_nvmpg_sb *sb; ++ void *base_addr; ++ ++ unsigned char uuid[16]; ++ int ns_id; ++ unsigned int page_size; ++ unsigned long free; ++ unsigned long pages_offset; ++ unsigned long pages_total; ++ pfn_t start_pfn; ++ ++ struct dax_device *dax_dev; ++ struct block_device *bdev; ++ struct bch_nvmpg_set *set; ++ ++ struct mutex lock; ++}; ++ ++/* ++ * A set of namespaces. Currently only one set can be supported. 
++ */ ++struct bch_nvmpg_set { ++ unsigned char set_uuid[16]; ++ ++ int heads_size; ++ int heads_used; ++ struct bch_nvmpg_set_header *set_header; ++ ++ struct bch_nvmpg_ns *ns_tbl[BCH_NVMPG_NS_MAX]; ++ int total_ns; ++ int attached_ns; ++ ++ struct mutex lock; ++}; ++ ++#define BCH_NVMPG_NS_ID_BITS 3 ++#define BCH_NVMPG_OFFSET_BITS 61 ++#define BCH_NVMPG_NS_ID_MASK ((1UL<<BCH_NVMPG_NS_ID_BITS) - 1) ++#define BCH_NVMPG_OFFSET_MASK ((1UL<<BCH_NVMPG_OFFSET_BITS) - 1) ++ ++#define BCH_NVMPG_GET_NS_ID(offset) \ ++ (((offset) >> BCH_NVMPG_OFFSET_BITS) & BCH_NVMPG_NS_ID_MASK) ++ ++#define BCH_NVMPG_GET_OFFSET(offset) ((offset) & BCH_NVMPG_OFFSET_MASK) ++ ++#define BCH_NVMPG_OFFSET(ns_id, offset) \ ++ ((((ns_id) & BCH_NVMPG_NS_ID_MASK) << BCH_NVMPG_OFFSET_BITS) | \ ++ ((offset) & BCH_NVMPG_OFFSET_MASK)) ++ ++/* Indicate which field in bch_nvmpg_sb to be updated */ ++#define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ ++ ++void *bch_nvmpg_offset_to_ptr(unsigned long offset); ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); ++int bch_nvmpg_init(void); ++void bch_nvmpg_exit(void); ++ ++#else ++ ++static inline struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) ++{ ++ return NULL; ++} ++ ++static inline int bch_nvmpg_init(void) ++{ ++ return 0; ++} ++ ++static inline void bch_nvmpg_exit(void) { } ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++#endif /* _BCACHE_NVM_PAGES_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 86b9e355c583..74d51a0b806f 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -14,6 +14,7 @@ + #include "request.h" + #include "writeback.h" + #include "features.h" ++#include "nvmpg.h" + + #include <linux/blkdev.h> + #include <linux/pagemap.h> +@@ -2818,6 +2819,7 @@ static void bcache_exit(void) + { + bch_debug_exit(); + bch_request_exit(); ++ bch_nvmpg_exit(); + if (bcache_kobj) + kobject_put(bcache_kobj); + if (bcache_wq) +@@ -2916,6 +2918,7 @@ static int __init bcache_init(void) + + bch_debug_init(); + closure_debug_init(); ++ bch_nvmpg_init(); + + bcache_is_reboot = false; + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0003-bcache-initialization-of-the-buddy.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0003-bcache-initialization-of-the-buddy.patch new file mode 100644 index 0000000..9adcb46 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0003-bcache-initialization-of-the-buddy.patch @@ -0,0 +1,359 @@ +From ef9ee14f2d7b1dd38f8aebf190e9ed1527f688c2 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 19:45:57 +0800 +Subject: [PATCH v13 03/12] bcache: initialization of the buddy + +This nvm pages allocator will implement the simple buddy allocator to +anage the nvm address space. This patch initializes this buddy allocator +for new namespace. + +the unit of alloc/free of the buddy allocator is page. DAX device has +their struct page(in dram or PMEM). + + struct { /* ZONE_DEVICE pages */ + /** @pgmap: Points to the hosting device page map. */ + struct dev_pagemap *pgmap; + void *zone_device_data; + /* + * ZONE_DEVICE private pages are counted as being + * mapped so the next 3 words hold the mapping, index, + * and private fields from the source anonymous or + * page cache page while the page is migrated to device + * private memory. 
+ * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also + * use the mapping, index, and private fields when + * pmem backed DAX files are mapped. + */ + }; + +ZONE_DEVICE pages only use pgmap. Other 4 words[16/32 bytes] don't use. +So the second/third word will be used as 'struct list_head ' which list +in buddy. The fourth word(that is normal struct page::index) store pgoff +which the page-offset in the dax device. And the fifth word (that is +normal struct page::private) store order of buddy. page_type will be used +to store buddy flags. + +Reported-by: kernel test robot <lkp@intel.com> +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 212 +++++++++++++++++++++++++++++++++++++- + drivers/md/bcache/nvmpg.h | 12 +++ + 2 files changed, 221 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index b654bbbda03e..2b70ee4a6028 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -50,6 +50,36 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) + return BCH_NVMPG_OFFSET(ns_id, offset); + } + ++static struct page *bch_nvmpg_va_to_pg(void *addr) ++{ ++ return virt_to_page(addr); ++} ++ ++static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff) ++{ ++ return ns->base_addr + (pgoff << PAGE_SHIFT); ++} ++ ++static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) ++{ ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id]; ++ pgoff_t pgoff = r->pgoff; ++ ++ return bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++} ++ ++static inline void reserve_nvmpg_pages(struct bch_nvmpg_ns *ns, ++ pgoff_t pgoff, u64 nr) ++{ ++ while (nr > 0) { ++ unsigned int num = nr > UINT_MAX ? 
UINT_MAX : nr; ++ ++ bitmap_set(ns->pages_bitmap, pgoff, num); ++ nr -= num; ++ pgoff += num; ++ } ++} ++ + static void release_ns_tbl(struct bch_nvmpg_set *set) + { + int i; +@@ -58,6 +88,10 @@ static void release_ns_tbl(struct bch_nvmpg_set *set) + for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { + ns = set->ns_tbl[i]; + if (ns) { ++ kvfree(ns->pages_bitmap); ++ if (ns->recs_bitmap) ++ bitmap_free(ns->recs_bitmap); ++ + fs_put_dax(ns->dax_dev); + blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + set->ns_tbl[i] = NULL; +@@ -76,10 +110,73 @@ static void release_nvmpg_set(struct bch_nvmpg_set *set) + kfree(set); + } + ++static int validate_recs(int ns_id, ++ struct bch_nvmpg_head *head, ++ struct bch_nvmpg_recs *recs) ++{ ++ if (memcmp(recs->magic, bch_nvmpg_recs_magic, 16)) { ++ pr_err("Invalid bch_nvmpg_recs magic\n"); ++ return -EINVAL; ++ } ++ ++ if (memcmp(recs->uuid, head->uuid, 16)) { ++ pr_err("Invalid bch_nvmpg_recs uuid\n"); ++ return -EINVAL; ++ } ++ ++ if (recs->head_offset != ++ bch_nvmpg_ptr_to_offset(global_nvmpg_set->ns_tbl[ns_id], head)) { ++ pr_err("Invalid recs head_offset\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int reserve_nvmpg_recs(struct bch_nvmpg_recs *recs) ++{ ++ int i, used = 0; ++ ++ for (i = 0; i < recs->size; i++) { ++ struct bch_nvmpg_rec *r = &recs->recs[i]; ++ struct bch_nvmpg_ns *ns; ++ struct page *page; ++ void *addr; ++ ++ if (r->pgoff == 0) ++ continue; ++ ++ ns = global_nvmpg_set->ns_tbl[r->ns_id]; ++ addr = bch_nvmpg_rec_to_ptr(r); ++ if (addr < ns->base_addr) { ++ pr_err("Invalid recorded address\n"); ++ return -EINVAL; ++ } ++ ++ /* init struct page: index/private */ ++ page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(page, r->order); ++ page->index = r->pgoff; ++ ++ reserve_nvmpg_pages(ns, r->pgoff, 1L << r->order); ++ used++; ++ } ++ ++ if (used != recs->used) { ++ pr_err("used %d doesn't match recs->used %d\n", ++ used, recs->used); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ + /* Namespace 0 contains all meta data of the nvmpg allocation set */ + static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_set_header *set_header; ++ struct bch_nvmpg_recs *sys_recs; ++ int i, j, used = 0, rc = 0; + + if (ns->ns_id != 0) { + pr_err("unexpected ns_id %u for first nvmpg namespace.\n", +@@ -93,9 +190,83 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + global_nvmpg_set->set_header = set_header; + global_nvmpg_set->heads_size = set_header->size; + global_nvmpg_set->heads_used = set_header->used; ++ ++ /* Reserve the used space from buddy allocator */ ++ reserve_nvmpg_pages(ns, 0, div_u64(ns->pages_offset, ns->page_size)); ++ ++ sys_recs = ns->base_addr + BCH_NVMPG_SYSRECS_OFFSET; ++ for (i = 0; i < set_header->size; i++) { ++ struct bch_nvmpg_head *head; ++ ++ head = &set_header->heads[i]; ++ if (head->state == BCH_NVMPG_HD_STAT_FREE) ++ continue; ++ ++ used++; ++ if (used > global_nvmpg_set->heads_size) { ++ pr_err("used heads %d > heads size %d.\n", ++ used, global_nvmpg_set->heads_size); ++ goto unlock; ++ } ++ ++ for (j = 0; j < BCH_NVMPG_NS_MAX; j++) { ++ struct bch_nvmpg_recs *recs; ++ ++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[j]); ++ ++ /* Iterate the recs list */ ++ while (recs) { ++ rc = validate_recs(j, head, recs); ++ if (rc < 0) ++ goto unlock; ++ ++ rc = reserve_nvmpg_recs(recs); ++ if (rc < 0) ++ goto unlock; ++ ++ bitmap_set(ns->recs_bitmap, recs - sys_recs, 1); ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++ } ++ } ++unlock: + 
mutex_unlock(&global_nvmpg_set->lock); ++ return rc; ++} + +- return 0; ++static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) ++{ ++ unsigned int start, end, pages; ++ int i; ++ struct page *page; ++ pgoff_t pgoff_start; ++ ++ bitmap_for_each_clear_region(ns->pages_bitmap, ++ start, end, 0, ns->pages_total) { ++ pgoff_start = start; ++ pages = end - start; ++ ++ while (pages) { ++ void *addr; ++ ++ for (i = BCH_MAX_ORDER - 1; i >= 0; i--) { ++ if ((pgoff_start % (1L << i) == 0) && ++ (pages >= (1L << i))) ++ break; ++ } ++ ++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start); ++ page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(page, i); ++ page->index = pgoff_start; ++ __SetPageBuddy(page); ++ list_add((struct list_head *)&page->zone_device_data, ++ &ns->free_area[i]); ++ ++ pgoff_start += 1L << i; ++ pages -= 1L << i; ++ } ++ } + } + + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) +@@ -200,7 +371,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + char buf[BDEVNAME_SIZE]; + struct block_device *bdev; + pgoff_t pgoff; +- int id, err; ++ int id, i, err; + char *path; + long dax_ret = 0; + +@@ -304,13 +475,48 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + + mutex_init(&ns->lock); + ++ /* ++ * parameters of bitmap_set/clear are unsigned int. ++ * Given currently size of nvm is far from exceeding this limit, ++ * so only add a WARN_ON message. ++ */ ++ WARN_ON(BITS_TO_LONGS(ns->pages_total) > UINT_MAX); ++ ns->pages_bitmap = kvcalloc(BITS_TO_LONGS(ns->pages_total), ++ sizeof(unsigned long), GFP_KERNEL); ++ if (!ns->pages_bitmap) { ++ err = -ENOMEM; ++ goto clear_ns_nr; ++ } ++ ++ if (ns->sb->this_ns == 0) { ++ ns->recs_bitmap = ++ bitmap_zalloc(BCH_MAX_PGALLOC_RECS, GFP_KERNEL); ++ if (ns->recs_bitmap == NULL) { ++ err = -ENOMEM; ++ goto free_pages_bitmap; ++ } ++ } ++ ++ for (i = 0; i < BCH_MAX_ORDER; i++) ++ INIT_LIST_HEAD(&ns->free_area[i]); ++ + err = init_nvmpg_set_header(ns); + if (err < 0) +- goto free_ns; ++ goto free_recs_bitmap; ++ ++ if (ns->sb->this_ns == 0) ++ /* init buddy allocator */ ++ bch_nvmpg_init_free_space(ns); + + kfree(path); + return ns; + ++free_recs_bitmap: ++ bitmap_free(ns->recs_bitmap); ++free_pages_bitmap: ++ kvfree(ns->pages_bitmap); ++clear_ns_nr: ++ global_nvmpg_set->ns_tbl[sb->this_ns] = NULL; + free_ns: + fs_put_dax(ns->dax_dev); + kfree(ns); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 698c890b2d15..55778d4db7da 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -11,6 +11,8 @@ + * Bcache NVDIMM in memory data structures + */ + ++#define BCH_MAX_ORDER 20 ++ + /* + * The following three structures in memory records which page(s) allocated + * to which owner. 
After reboot from power failure, they will be initialized +@@ -28,6 +30,11 @@ struct bch_nvmpg_ns { + unsigned long pages_total; + pfn_t start_pfn; + ++ unsigned long *pages_bitmap; ++ struct list_head free_area[BCH_MAX_ORDER]; ++ ++ unsigned long *recs_bitmap; ++ + struct dax_device *dax_dev; + struct block_device *bdev; + struct bch_nvmpg_set *set; +@@ -69,6 +76,11 @@ struct bch_nvmpg_set { + /* Indicate which field in bch_nvmpg_sb to be updated */ + #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ + ++#define BCH_MAX_PGALLOC_RECS \ ++ (min_t(unsigned int, 64, \ ++ (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ ++ sizeof(struct bch_nvmpg_recs))) ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset); + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch new file mode 100644 index 0000000..ef13f6e --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch @@ -0,0 +1,308 @@ +From b09e24d84a7ae11be4bd7255648ebd5006678029 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Wed, 4 Aug 2021 22:41:20 +0800 +Subject: [PATCH v13 04/12] bcache: bch_nvmpg_alloc_pages() of the buddy + +This patch implements the bch_nvmpg_alloc_pages() of the nvm pages buddy +allocator. In terms of function, this func is like current +page-buddy-alloc. But the differences are: +a: it need owner_uuid as parameter which record owner info. And it +make those info persistence. +b: it don't need flags like GFP_*. All allocs are the equal. +c: it don't trigger other ops etc swap/recycle. 
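For illustration only (not part of this patch), a minimal usage sketch of the allocator described above, using the interfaces declared in nvmpg.h: example_uuid and example_nvmpg_alloc() are hypothetical names, and bch_nvmpg_free_pages() is only introduced by the next patch in this series.

static const char example_uuid[16] = "nvmpg-example-0";

static int example_nvmpg_alloc(void)
{
        unsigned long nvmpg_offset;
        void *addr;

        /* ask for 1 << 2 = 4 contiguous NVDIMM pages; no GFP_* flags */
        nvmpg_offset = bch_nvmpg_alloc_pages(2, example_uuid);
        if (!nvmpg_offset)
                return -ENOMEM;

        /* map the persistent nvmpg offset to a kernel virtual address */
        addr = bch_nvmpg_offset_to_ptr(nvmpg_offset);
        memset(addr, 0, 4 * PAGE_SIZE);

        /* the allocation stays recorded under example_uuid until freed */
        bch_nvmpg_free_pages(nvmpg_offset, 2, example_uuid);
        return 0;
}

The returned value is an nvmpg offset (namespace id plus byte offset), not a kernel pointer, which is why it can be stored persistently and re-resolved after a reboot.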
+ +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 221 ++++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 9 ++ + 2 files changed, 230 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 2b70ee4a6028..a920779eb548 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -42,6 +42,11 @@ void *bch_nvmpg_offset_to_ptr(unsigned long offset) + return NULL; + } + ++static unsigned long bch_nvmpg_offset_to_pgoff(unsigned long nvmpg_offset) ++{ ++ return BCH_NVMPG_GET_OFFSET(nvmpg_offset) >> PAGE_SHIFT; ++} ++ + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) + { + int ns_id = ns->ns_id; +@@ -60,6 +65,15 @@ static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff) + return ns->base_addr + (pgoff << PAGE_SHIFT); + } + ++static unsigned long bch_nvmpg_pgoff_to_offset(struct bch_nvmpg_ns *ns, ++ pgoff_t pgoff) ++{ ++ int ns_id = ns->ns_id; ++ unsigned long offset = pgoff << PAGE_SHIFT; ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ + static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) + { + struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id]; +@@ -269,6 +283,213 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + } + } + ++ ++/* If not found, it will create if create == true */ ++static struct bch_nvmpg_head *find_nvmpg_head(const char *uuid, bool create) ++{ ++ struct bch_nvmpg_set_header *set_header = global_nvmpg_set->set_header; ++ struct bch_nvmpg_head *head = NULL; ++ int i; ++ ++ if (set_header == NULL) ++ goto out; ++ ++ for (i = 0; i < set_header->size; i++) { ++ struct bch_nvmpg_head *h = &set_header->heads[i]; ++ ++ if (h->state != BCH_NVMPG_HD_STAT_ALLOC) ++ continue; ++ ++ if (!memcmp(uuid, h->uuid, 16)) { ++ head = h; ++ break; ++ } ++ } ++ ++ if (!head && create) { ++ u32 used = set_header->used; ++ ++ if (set_header->size > used) { ++ head = &set_header->heads[used]; ++ memset(head, 0, sizeof(struct bch_nvmpg_head)); ++ head->state = BCH_NVMPG_HD_STAT_ALLOC; ++ memcpy(head->uuid, uuid, 16); ++ global_nvmpg_set->heads_used++; ++ set_header->used++; ++ } else ++ pr_info("No free bch_nvmpg_head\n"); ++ } ++ ++out: ++ return head; ++} ++ ++static struct bch_nvmpg_recs *find_empty_nvmpg_recs(void) ++{ ++ unsigned int start; ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[0]; ++ struct bch_nvmpg_recs *recs; ++ ++ start = bitmap_find_next_zero_area(ns->recs_bitmap, ++ BCH_MAX_PGALLOC_RECS, 0, 1, 0); ++ if (start > BCH_MAX_PGALLOC_RECS) { ++ pr_info("No free struct bch_nvmpg_recs\n"); ++ return NULL; ++ } ++ ++ bitmap_set(ns->recs_bitmap, start, 1); ++ recs = (struct bch_nvmpg_recs *) ++ bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET) ++ + start; ++ ++ memset(recs, 0, sizeof(struct bch_nvmpg_recs)); ++ return recs; ++} ++ ++ ++static struct bch_nvmpg_recs *find_nvmpg_recs(struct bch_nvmpg_ns *ns, ++ struct bch_nvmpg_head *head, ++ bool create) ++{ ++ int ns_id = ns->sb->this_ns; ++ struct bch_nvmpg_recs *prev_recs = NULL, *recs = NULL; ++ ++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[ns_id]); ++ ++ /* If create=false, we return recs[nr] */ ++ if (!create) ++ return recs; ++ ++ /* ++ * If create=true, it mean we need a empty struct 
bch_nvmpg_rec ++ * So we should find non-empty struct bch_nvmpg_recs or alloc ++ * new struct bch_nvmpg_recs. And return this bch_nvmpg_recs ++ */ ++ while (recs && (recs->used == recs->size)) { ++ prev_recs = recs; ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++ ++ /* Found empty struct bch_nvmpg_recs */ ++ if (recs) ++ return recs; ++ ++ /* Need alloc new struct bch_nvmpg_recs */ ++ recs = find_empty_nvmpg_recs(); ++ if (recs) { ++ unsigned long offset; ++ ++ recs->next_offset = 0; ++ recs->head_offset = bch_nvmpg_ptr_to_offset(ns, head); ++ memcpy(recs->magic, bch_nvmpg_recs_magic, 16); ++ memcpy(recs->uuid, head->uuid, 16); ++ recs->size = BCH_NVMPG_MAX_RECS; ++ recs->used = 0; ++ ++ offset = bch_nvmpg_ptr_to_offset(ns, recs); ++ if (prev_recs) ++ prev_recs->next_offset = offset; ++ else ++ head->recs_offset[ns_id] = offset; ++ } ++ ++ return recs; ++} ++ ++static void add_nvmpg_rec(struct bch_nvmpg_ns *ns, ++ struct bch_nvmpg_recs *recs, ++ unsigned long nvmpg_offset, ++ int order) ++{ ++ int i, ns_id; ++ unsigned long pgoff; ++ ++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset); ++ ns_id = ns->sb->this_ns; ++ ++ for (i = 0; i < recs->size; i++) { ++ if (recs->recs[i].pgoff == 0) { ++ recs->recs[i].pgoff = pgoff; ++ recs->recs[i].order = order; ++ recs->recs[i].ns_id = ns_id; ++ recs->used++; ++ break; ++ } ++ } ++ BUG_ON(i == recs->size); ++} ++ ++ ++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) ++{ ++ unsigned long nvmpg_offset = 0; ++ struct bch_nvmpg_head *head; ++ int n, o; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ head = find_nvmpg_head(uuid, true); ++ ++ if (!head) { ++ pr_err("Cannot find bch_nvmpg_recs by uuid.\n"); ++ goto unlock; ++ } ++ ++ for (n = 0; n < global_nvmpg_set->total_ns; n++) { ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[n]; ++ ++ if (!ns || (ns->free < (1L << order))) ++ continue; ++ ++ for (o = order; o < BCH_MAX_ORDER; o++) { ++ struct list_head *list; ++ struct page *page, *buddy_page; ++ ++ if (list_empty(&ns->free_area[o])) ++ continue; ++ ++ list = ns->free_area[o].next; ++ page = container_of((void *)list, struct page, ++ zone_device_data); ++ ++ list_del(list); ++ ++ while (o != order) { ++ void *addr; ++ pgoff_t pgoff; ++ ++ pgoff = page->index + (1L << (o - 1)); ++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++ buddy_page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(buddy_page, o - 1); ++ buddy_page->index = pgoff; ++ __SetPageBuddy(buddy_page); ++ list_add((struct list_head *)&buddy_page->zone_device_data, ++ &ns->free_area[o - 1]); ++ o--; ++ } ++ ++ set_page_private(page, order); ++ __ClearPageBuddy(page); ++ ns->free -= 1L << order; ++ nvmpg_offset = bch_nvmpg_pgoff_to_offset(ns, page->index); ++ break; ++ } ++ ++ if (o < BCH_MAX_ORDER) { ++ struct bch_nvmpg_recs *recs; ++ ++ recs = find_nvmpg_recs(ns, head, true); ++ /* ToDo: handle pgalloc_recs==NULL */ ++ add_nvmpg_rec(ns, recs, nvmpg_offset, order); ++ break; ++ } ++ } ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++ return nvmpg_offset; ++} ++ + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_sb *sb = ns->sb; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 55778d4db7da..d03f3241b45a 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -76,6 +76,9 @@ struct bch_nvmpg_set { + /* Indicate which field in bch_nvmpg_sb to be updated */ + #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ + ++#define BCH_PGOFF_TO_KVADDR(pgoff) \ ++ ((void *)((unsigned long)(pgoff) << 
PAGE_SHIFT)) ++ + #define BCH_MAX_PGALLOC_RECS \ + (min_t(unsigned int, 64, \ + (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ +@@ -89,6 +92,7 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); + int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); ++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + + #else + +@@ -104,6 +108,11 @@ static inline int bch_nvmpg_init(void) + + static inline void bch_nvmpg_exit(void) { } + ++static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) ++{ ++ return 0; ++} ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocat.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocat.patch new file mode 100644 index 0000000..fd631ae --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocat.patch @@ -0,0 +1,252 @@ +From 1f1fd2517b0a3520ab3a78cabe737cfb1f628d2e Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 19:06:35 +0800 +Subject: [PATCH v13 05/12] bcache: bch_nvmpg_free_pages() of the buddy + allocator + +This patch implements the bch_nvmpg_free_pages() of the buddy allocator. + +The difference between this and page-buddy-free: +it need owner_uuid to free owner allocated pages, and must +persistent after free. + +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 164 ++++++++++++++++++++++++++++++++++++-- + drivers/md/bcache/nvmpg.h | 3 + + 2 files changed, 160 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index a920779eb548..8ce0c4389b42 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -248,6 +248,57 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + return rc; + } + ++static void __free_space(struct bch_nvmpg_ns *ns, unsigned long nvmpg_offset, ++ int order) ++{ ++ unsigned long add_pages = (1L << order); ++ pgoff_t pgoff; ++ struct page *page; ++ void *va; ++ ++ if (nvmpg_offset == 0) { ++ pr_err("free pages on offset 0\n"); ++ return; ++ } ++ ++ page = bch_nvmpg_va_to_pg(bch_nvmpg_offset_to_ptr(nvmpg_offset)); ++ WARN_ON((!page) || (page->private != order)); ++ pgoff = page->index; ++ ++ while (order < BCH_MAX_ORDER - 1) { ++ struct page *buddy_page; ++ ++ pgoff_t buddy_pgoff = pgoff ^ (1L << order); ++ pgoff_t parent_pgoff = pgoff & ~(1L << order); ++ ++ if ((parent_pgoff + (1L << (order + 1)) > ns->pages_total)) ++ break; ++ ++ va = bch_nvmpg_pgoff_to_ptr(ns, buddy_pgoff); ++ buddy_page = bch_nvmpg_va_to_pg(va); ++ WARN_ON(!buddy_page); ++ ++ if (PageBuddy(buddy_page) && (buddy_page->private == order)) { ++ list_del((struct list_head *)&buddy_page->zone_device_data); ++ __ClearPageBuddy(buddy_page); ++ pgoff = parent_pgoff; ++ order++; ++ continue; ++ } ++ break; ++ } ++ ++ va = bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++ page = bch_nvmpg_va_to_pg(va); ++ WARN_ON(!page); ++ list_add((struct list_head *)&page->zone_device_data, ++ &ns->free_area[order]); ++ 
page->index = pgoff; ++ set_page_private(page, order); ++ __SetPageBuddy(page); ++ ns->free += add_pages; ++} ++ + static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + { + unsigned int start, end, pages; +@@ -261,21 +312,19 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + pages = end - start; + + while (pages) { +- void *addr; +- + for (i = BCH_MAX_ORDER - 1; i >= 0; i--) { + if ((pgoff_start % (1L << i) == 0) && + (pages >= (1L << i))) + break; + } + +- addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start); +- page = bch_nvmpg_va_to_pg(addr); ++ page = bch_nvmpg_va_to_pg( ++ bch_nvmpg_pgoff_to_ptr(ns, pgoff_start)); + set_page_private(page, i); + page->index = pgoff_start; +- __SetPageBuddy(page); +- list_add((struct list_head *)&page->zone_device_data, +- &ns->free_area[i]); ++ ++ /* In order to update ns->free */ ++ __free_space(ns, pgoff_start, i); + + pgoff_start += 1L << i; + pages -= 1L << i; +@@ -490,6 +539,106 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return nvmpg_offset; + } + ++static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns) ++{ ++ return ns->base_addr + (ns->pages_total << PAGE_SHIFT); ++} ++ ++static inline bool in_nvmpg_ns_range(struct bch_nvmpg_ns *ns, ++ void *start_addr, void *end_addr) ++{ ++ return (start_addr >= ns->base_addr) && (end_addr < nvm_end_addr(ns)); ++} ++ ++static int remove_nvmpg_rec(struct bch_nvmpg_recs *recs, int ns_id, ++ unsigned long nvmpg_offset, int order) ++{ ++ struct bch_nvmpg_head *head; ++ struct bch_nvmpg_recs *prev_recs, *sys_recs; ++ struct bch_nvmpg_ns *ns; ++ unsigned long pgoff; ++ int i; ++ ++ ns = global_nvmpg_set->ns_tbl[0]; ++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset); ++ ++ head = bch_nvmpg_offset_to_ptr(recs->head_offset); ++ prev_recs = recs; ++ sys_recs = bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET); ++ while (recs) { ++ for (i = 0; i < recs->size; i++) { ++ struct bch_nvmpg_rec *rec = &(recs->recs[i]); ++ ++ if ((rec->pgoff == pgoff) && (rec->ns_id == ns_id)) { ++ WARN_ON(rec->order != order); ++ rec->_v = 0; ++ recs->used--; ++ ++ if (recs->used == 0) { ++ int recs_pos = recs - sys_recs; ++ ++ if (recs == prev_recs) ++ head->recs_offset[ns_id] = ++ recs->next_offset; ++ else ++ prev_recs->next_offset = ++ recs->next_offset; ++ ++ recs->next_offset = 0; ++ recs->head_offset = 0; ++ ++ bitmap_clear(ns->recs_bitmap, recs_pos, 1); ++ } ++ goto out; ++ } ++ } ++ prev_recs = recs; ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++out: ++ return (recs ? 
0 : -ENOENT); ++} ++ ++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, ++ const char *uuid) ++{ ++ struct bch_nvmpg_ns *ns; ++ struct bch_nvmpg_head *head; ++ struct bch_nvmpg_recs *recs; ++ int r; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ ++ ns = global_nvmpg_set->ns_tbl[BCH_NVMPG_GET_NS_ID(nvmpg_offset)]; ++ if (!ns) { ++ pr_err("can't find namespace by given kaddr from namespace\n"); ++ goto unlock; ++ } ++ ++ head = find_nvmpg_head(uuid, false); ++ if (!head) { ++ pr_err("can't found bch_nvmpg_head by uuid\n"); ++ goto unlock; ++ } ++ ++ recs = find_nvmpg_recs(ns, head, false); ++ if (!recs) { ++ pr_err("can't find bch_nvmpg_recs by uuid\n"); ++ goto unlock; ++ } ++ ++ r = remove_nvmpg_rec(recs, ns->sb->this_ns, nvmpg_offset, order); ++ if (r < 0) { ++ pr_err("can't find bch_nvmpg_rec\n"); ++ goto unlock; ++ } ++ ++ __free_space(ns, nvmpg_offset, order); ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++} ++ + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_sb *sb = ns->sb; +@@ -686,6 +835,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + ns->pages_offset = sb->pages_offset; + ns->pages_total = sb->pages_total; + ns->sb = sb; ++ /* increase by __free_space() */ + ns->free = 0; + ns->bdev = bdev; + ns->set = global_nvmpg_set; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index d03f3241b45a..e089936e7f13 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -93,6 +93,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); + int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); ++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + + #else + +@@ -113,6 +114,8 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return 0; + } + ++static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { } ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0006-bcache-get-recs-list-head-for-allocated-pages-by.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0006-bcache-get-recs-list-head-for-allocated-pages-by.patch new file mode 100644 index 0000000..f055b17 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0006-bcache-get-recs-list-head-for-allocated-pages-by.patch @@ -0,0 +1,66 @@ +From 953f817e496a1a74b9a8403800bf1d7f0f5b4aeb Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 21:06:03 +0800 +Subject: [PATCH v13 06/12] bcache: get recs list head for allocated pages by + specific uuid + +This patch implements bch_get_nvmpg_head() of the buddy allocator +to be used to get recs list head for allocated pages by specific +uuid. Then the requester (owner) can find all previous allocated +nvdimm pages by iterating the recs list. 
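For illustration only (not part of this patch), a sketch of how an owner could walk its recs list after obtaining the head: example_walk_recs() is a hypothetical helper, only namespace 0 is visited because a single namespace is supported so far, and locking and error handling are omitted.

static void example_walk_recs(const char *uuid)
{
        struct bch_nvmpg_head *head = bch_get_nvmpg_head(uuid);
        struct bch_nvmpg_recs *recs;
        int i;

        if (!head)
                return;

        /* recs_offset[] is indexed by namespace id; 0 is the only one yet */
        recs = bch_nvmpg_offset_to_ptr(head->recs_offset[0]);
        while (recs) {
                for (i = 0; i < recs->size; i++) {
                        struct bch_nvmpg_rec *rec = &recs->recs[i];

                        if (rec->pgoff == 0)    /* unused slot */
                                continue;
                        pr_info("pgoff %lu, order %d, ns_id %d\n",
                                (unsigned long)rec->pgoff,
                                (int)rec->order, (int)rec->ns_id);
                }
                recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
        }
}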
+ +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 5 +++++ + drivers/md/bcache/nvmpg.h | 6 ++++++ + 2 files changed, 11 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 8ce0c4389b42..e26c7b578a62 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -539,6 +539,11 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return nvmpg_offset; + } + ++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) ++{ ++ return find_nvmpg_head(uuid, false); ++} ++ + static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns) + { + return ns->base_addr + (ns->pages_total << PAGE_SHIFT); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index e089936e7f13..2361cabf18be 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -94,6 +94,7 @@ int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); ++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); + + #else + +@@ -116,6 +117,11 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + + static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { } + ++static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) ++{ ++ return NULL; ++} ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-.patch new file mode 100644 index 0000000..4ae5f06 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-.patch @@ -0,0 +1,48 @@ +From 566cc2016c7e817b8306db96d97c3e4cdbc254df Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 25 Jun 2021 00:17:02 +0800 +Subject: [PATCH v13 07/12] bcache: use bucket index to set GC_MARK_METADATA + for journal buckets in bch_btree_gc_finish() + +Currently the meta data bucket locations on cache device are reserved +after the meta data stored on NVDIMM pages, for the meta data layout +consistentcy temporarily. So these buckets are still marked as meta data +by SET_GC_MARK() in bch_btree_gc_finish(). + +When BCH_FEATURE_INCOMPAT_NVDIMM_META is set, the sb.d[] stores linear +address of NVDIMM pages and not bucket index anymore. Therefore we +should avoid to find bucket index from sb.d[], and directly use bucket +index from ca->sb.first_bucket to (ca->sb.first_bucket + +ca->sb.njournal_bucketsi) for setting the gc mark of journal bucket. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/btree.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 88c573eeb598..1a0ff117373f 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -1761,8 +1761,10 @@ static void bch_btree_gc_finish(struct cache_set *c) + ca = c->cache; + ca->invalidate_needs_gc = 0; + +- for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) +- SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); ++ /* Range [first_bucket, first_bucket + keys) is for journal buckets */ ++ for (i = ca->sb.first_bucket; ++ i < ca->sb.first_bucket + ca->sb.njournal_buckets; i++) ++ SET_GC_MARK(ca->buckets + i, GC_MARK_METADATA); + + for (k = ca->prio_buckets; + k < ca->prio_buckets + prio_buckets(ca) * 2; k++) +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into.patch new file mode 100644 index 0000000..1e0fb3b --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into.patch @@ -0,0 +1,60 @@ +From 5da7b9cfe8c6344a6a4271bf3878d22ba87f4398 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 25 Jun 2021 00:18:31 +0800 +Subject: [PATCH v13 08/12] bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into + incompat feature set + +This patch adds BCH_FEATURE_INCOMPAT_NVDIMM_META (value 0x0004) into the +incompat feature set. When this bit is set by bcache-tools, it indicates +bcache meta data should be stored on specific NVDIMM meta device. + +The bcache meta data mainly includes journal and btree nodes, when this +bit is set in incompat feature set, bcache will ask the nvm-pages +allocator for NVDIMM space to store the meta data. 
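For illustration only (not part of this patch), later patches in this series test the new bit through the bch_has_feature_nvdimm_meta() accessor generated from the existing BCH_FEATURE_INCOMPAT_FUNCS() macro (see the diff below); journal_space_on_nvdimm() is a hypothetical helper name.

static bool journal_space_on_nvdimm(struct cache *ca)
{
        /*
         * The bit is set by bcache-tools at format time. When present,
         * sb.d[] holds nvmpg offsets into NVDIMM pages; otherwise it
         * holds journal bucket indexes on the cache device.
         */
        return bch_has_feature_nvdimm_meta(&ca->sb);
}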
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/features.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h +index 09161b89c63e..fab92678be76 100644 +--- a/drivers/md/bcache/features.h ++++ b/drivers/md/bcache/features.h +@@ -18,11 +18,19 @@ + #define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET 0x0001 + /* real bucket size is (1 << bucket_size) */ + #define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE 0x0002 ++/* store bcache meta data on nvdimm */ ++#define BCH_FEATURE_INCOMPAT_NVDIMM_META 0x0004 + + #define BCH_FEATURE_COMPAT_SUPP 0 + #define BCH_FEATURE_RO_COMPAT_SUPP 0 ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++#define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ ++ BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE| \ ++ BCH_FEATURE_INCOMPAT_NVDIMM_META) ++#else + #define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ + BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE) ++#endif + + #define BCH_HAS_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_compat & (mask)) +@@ -90,6 +98,7 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \ + + BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET); + BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE); ++BCH_FEATURE_INCOMPAT_FUNCS(nvdimm_meta, NVDIMM_META); + + static inline bool bch_has_unknown_compat_features(struct cache_sb *sb) + { +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0009-bcache-initialize-bcache-journal-for-NVDIMM-meta.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0009-bcache-initialize-bcache-journal-for-NVDIMM-meta.patch new file mode 100644 index 0000000..3e63f08 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0009-bcache-initialize-bcache-journal-for-NVDIMM-meta.patch @@ -0,0 +1,255 @@ +From 6795c385696ab16a78e7b9cce7310a50a2522af5 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 21 Oct 2021 21:39:18 +0800 +Subject: [PATCH v13 09/12] bcache: initialize bcache journal for NVDIMM meta + device + +The nvm-pages allocator may store and index the NVDIMM pages allocated +for bcache journal. This patch adds the initialization to store bcache +journal space on NVDIMM pages if BCH_FEATURE_INCOMPAT_NVDIMM_META bit is +set by bcache-tools. + +If BCH_FEATURE_INCOMPAT_NVDIMM_META is set, get_nvdimm_journal_space() +will return the nvmpg_offset of NVDIMM pages for bcache journal, +- If there is previously allocated space, find it from nvm-pages owner + list and return to bch_journal_init(). +- If there is no previously allocated space, require a new NVDIMM range + from the nvm-pages allocator, and return it to bch_journal_init(). + +And in bch_journal_init(), keys in sb.d[] store the corresponding nvmpg +offset from NVDIMM into sb.d[i].ptr[0] where 'i' is the bucket index to +iterate all journal buckets. + +Later when bcache journaling code stores the journaling jset, the target +NVDIMM nvmpg offset stored (and updated) in sb.d[i].ptr[0] can be used +to calculate the linear address in memory copy from DRAM pages into +NVDIMM pages. 
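For illustration only (not part of this patch), how a journal slot decodes once sb.d[i] holds an nvmpg offset; example_decode_journal_slot() is a hypothetical helper built on the BCH_NVMPG_GET_NS_ID()/BCH_NVMPG_GET_OFFSET() macros from nvmpg.h.

static void example_decode_journal_slot(struct cache *ca, int i)
{
        /* sb.d[] entries are 64-bit; BCACHE_NVM_PAGES depends on 64BIT */
        unsigned long off = (unsigned long)ca->sb.d[i];

        /* high bits select the namespace, low bits are a byte offset */
        pr_info("journal slot %d: ns_id %lu, byte offset 0x%lx\n",
                i, BCH_NVMPG_GET_NS_ID(off), BCH_NVMPG_GET_OFFSET(off));
}

With the initialization in the diff below, slot i simply lives bucket_bytes(ca) * i bytes past the base returned by get_journal_nvmpg_space().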
+ +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 113 ++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/journal.h | 2 +- + drivers/md/bcache/nvmpg.c | 9 +++ + drivers/md/bcache/nvmpg.h | 1 + + drivers/md/bcache/super.c | 18 +++--- + 5 files changed, 132 insertions(+), 11 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 61bd79babf7a..d887557c718e 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -9,6 +9,8 @@ + #include "btree.h" + #include "debug.h" + #include "extents.h" ++#include "nvmpg.h" ++#include "features.h" + + #include <trace/events/bcache.h> + +@@ -982,3 +984,114 @@ int bch_journal_alloc(struct cache_set *c) + + return 0; + } ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static unsigned long find_journal_nvmpg_base(struct bch_nvmpg_head *nvmpg_head, ++ struct cache *ca) ++{ ++ unsigned long jnl_offset, jnl_pgoff, jnl_ns_id; ++ unsigned long ret_offset = 0; ++ int i; ++ ++ jnl_offset = (unsigned long)ca->sb.d[0]; ++ jnl_ns_id = BCH_NVMPG_GET_NS_ID(jnl_offset); ++ jnl_pgoff = BCH_NVMPG_GET_OFFSET(jnl_offset) >> PAGE_SHIFT; ++ ++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { ++ struct bch_nvmpg_recs *recs; ++ struct bch_nvmpg_rec *rec; ++ unsigned long recs_offset = 0; ++ int j; ++ ++ recs_offset = nvmpg_head->recs_offset[i]; ++ recs = bch_nvmpg_offset_to_ptr(recs_offset); ++ while (recs) { ++ for (j = 0; j < recs->size; j++) { ++ rec = &recs->recs[j]; ++ if ((rec->pgoff != jnl_pgoff) || ++ (rec->ns_id != jnl_ns_id)) ++ continue; ++ ++ ret_offset = jnl_offset; ++ goto out; ++ } ++ recs_offset = recs->next_offset; ++ recs = bch_nvmpg_offset_to_ptr(recs_offset); ++ } ++ } ++ ++out: ++ return ret_offset; ++} ++ ++static unsigned long get_journal_nvmpg_space(struct cache *ca) ++{ ++ struct bch_nvmpg_head *head = NULL; ++ unsigned long nvmpg_offset; ++ int order; ++ ++ head = bch_get_nvmpg_head(ca->sb.set_uuid); ++ if (head) { ++ nvmpg_offset = find_journal_nvmpg_base(head, ca); ++ if (nvmpg_offset) ++ goto found; ++ } ++ ++ order = ilog2((ca->sb.bucket_size * ++ ca->sb.njournal_buckets) / PAGE_SECTORS); ++ nvmpg_offset = bch_nvmpg_alloc_pages(order, ca->sb.set_uuid); ++ if (nvmpg_offset) ++ memset(bch_nvmpg_offset_to_ptr(nvmpg_offset), ++ 0, (1 << order) * PAGE_SIZE); ++found: ++ return nvmpg_offset; ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++static int __bch_journal_nvdimm_init(struct cache *ca) ++{ ++ int ret = -1; ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ int i; ++ unsigned long jnl_base = 0; ++ ++ jnl_base = get_journal_nvmpg_space(ca); ++ if (!jnl_base) { ++ pr_err("Failed to get journal space from nvdimm\n"); ++ goto out; ++ } ++ ++ /* Iniialized and reloaded from on-disk super block already */ ++ if (ca->sb.d[0] != 0) ++ goto out; ++ ++ for (i = 0; i < ca->sb.keys; i++) ++ ca->sb.d[i] = jnl_base + (bucket_bytes(ca) * i); ++ ++ ret = 0; ++out: ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++ return ret; ++} ++ ++ ++int bch_journal_init(struct cache_set *c) ++{ ++ int i, ret = 0; ++ struct cache *ca = c->cache; ++ ++ ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, ++ 2, SB_JOURNAL_BUCKETS); ++ ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) { ++ for (i = 0; i < ca->sb.keys; i++) ++ ca->sb.d[i] = ca->sb.first_bucket + i; ++ } else ++ ret = 
__bch_journal_nvdimm_init(ca); ++ ++ return ret; ++} +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index f2ea34d5f431..e3a7fa5a8fda 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -179,7 +179,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list); + void bch_journal_meta(struct cache_set *c, struct closure *cl); + int bch_journal_read(struct cache_set *c, struct list_head *list); + int bch_journal_replay(struct cache_set *c, struct list_head *list); +- ++int bch_journal_init(struct cache_set *c); + void bch_journal_free(struct cache_set *c); + int bch_journal_alloc(struct cache_set *c); + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index e26c7b578a62..1a3c6327b091 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -24,6 +24,15 @@ + + struct bch_nvmpg_set *global_nvmpg_set; + ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id) ++{ ++ if ((ns_id >= 0) && (ns_id < BCH_NVMPG_NS_MAX)) ++ return global_nvmpg_set->ns_tbl[ns_id]; ++ ++ pr_emerg("Invalid ns_id: %d\n", ns_id); ++ return NULL; ++} ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset) + { + int ns_id = BCH_NVMPG_GET_NS_ID(offset); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 2361cabf18be..f7b7177cced3 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -95,6 +95,7 @@ void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id); + + #else + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 74d51a0b806f..a27fa65d8832 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -147,9 +147,11 @@ static const char *read_super_common(struct cache_sb *sb, struct block_device * + goto err; + + err = "Journal buckets not sequential"; +- for (i = 0; i < sb->keys; i++) +- if (sb->d[i] != sb->first_bucket + i) +- goto err; ++ if (!bch_has_feature_nvdimm_meta(sb)) { ++ for (i = 0; i < sb->keys; i++) ++ if (sb->d[i] != sb->first_bucket + i) ++ goto err; ++ } + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) +@@ -2068,14 +2070,10 @@ static int run_cache_set(struct cache_set *c) + if (bch_journal_replay(c, &journal)) + goto err; + } else { +- unsigned int j; +- + pr_notice("invalidating existing data\n"); +- ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, +- 2, SB_JOURNAL_BUCKETS); +- +- for (j = 0; j < ca->sb.keys; j++) +- ca->sb.d[j] = ca->sb.first_bucket + j; ++ err = "error initializing journal"; ++ if (bch_journal_init(c)) ++ goto err; + + bch_initial_gc_finish(c); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0010-bcache-support-storing-bcache-journal-into-NVDIM.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0010-bcache-support-storing-bcache-journal-into-NVDIM.patch new file mode 100644 index 0000000..977fff6 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0010-bcache-support-storing-bcache-journal-into-NVDIM.patch @@ -0,0 +1,231 @@ +From 04919917230c65aa07f65a57a136f7994b017faf Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:45:23 +0800 +Subject: [PATCH v13 10/12] bcache: support storing bcache journal into NVDIMM + meta device + +This patch implements two methods to store 
bcache journal to, +1) __journal_write_unlocked() for block interface device + The latency method to compose bio and issue the jset bio to cache + device (e.g. SSD). c->journal.key.ptr[0] indicates the LBA on cache + device to store the journal jset. +2) __journal_nvdimm_write_unlocked() for memory interface NVDIMM + Use memory interface to access NVDIMM pages and store the jset by + memcpy_flushcache(). c->journal.key.ptr[0] indicates the linear + address from the NVDIMM pages to store the journal jset. + +For legacy configuration without NVDIMM meta device, journal I/O is +handled by __journal_write_unlocked() with existing code logic. If the +NVDIMM meta device is used (by bcache-tools), the journal I/O will +be handled by __journal_nvdimm_write_unlocked() and go into the NVDIMM +pages. + +And when NVDIMM meta device is used, sb.d[] stores the linear addresses +from NVDIMM pages (no more bucket index), in journal_reclaim() the +journaling location in c->journal.key.ptr[0] should also be updated by +linear address from NVDIMM pages (no more LBA combined by sectors offset +and bucket index). + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 120 +++++++++++++++++++++++++----------- + drivers/md/bcache/super.c | 3 +- + 2 files changed, 85 insertions(+), 38 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index d887557c718e..7d5c5ed18890 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -596,6 +596,8 @@ static void do_journal_discard(struct cache *ca) + return; + } + ++ BUG_ON(bch_has_feature_nvdimm_meta(&ca->sb)); ++ + switch (atomic_read(&ja->discard_in_flight)) { + case DISCARD_IN_FLIGHT: + return; +@@ -661,9 +663,16 @@ static void journal_reclaim(struct cache_set *c) + goto out; + + ja->cur_idx = next; +- k->ptr[0] = MAKE_PTR(0, +- bucket_to_sector(c, ca->sb.d[ja->cur_idx]), +- ca->sb.nr_this_dev); ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ k->ptr[0] = MAKE_PTR(0, ++ bucket_to_sector(c, ca->sb.d[ja->cur_idx]), ++ ca->sb.nr_this_dev); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ k->ptr[0] = (unsigned long)bch_nvmpg_offset_to_ptr( ++ ca->sb.d[ja->cur_idx]); ++#endif ++ + atomic_long_inc(&c->reclaimed_journal_buckets); + + bkey_init(k); +@@ -729,46 +738,21 @@ static void journal_write_unlock(struct closure *cl) + spin_unlock(&c->journal.lock); + } + +-static void journal_write_unlocked(struct closure *cl) ++ ++static void __journal_write_unlocked(struct cache_set *c) + __releases(c->journal.lock) + { +- struct cache_set *c = container_of(cl, struct cache_set, journal.io); +- struct cache *ca = c->cache; +- struct journal_write *w = c->journal.cur; + struct bkey *k = &c->journal.key; +- unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * +- ca->sb.block_size; +- ++ struct journal_write *w = c->journal.cur; ++ struct closure *cl = &c->journal.io; ++ struct cache *ca = c->cache; + struct bio *bio; + struct bio_list list; ++ unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * ++ ca->sb.block_size; + + bio_list_init(&list); + +- if (!w->need_write) { +- closure_return_with_destructor(cl, journal_write_unlock); +- return; +- } else if (journal_full(&c->journal)) { +- journal_reclaim(c); +- spin_unlock(&c->journal.lock); +- 
+- btree_flush_write(c); +- continue_at(cl, journal_write, bch_journal_wq); +- return; +- } +- +- c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); +- +- w->data->btree_level = c->root->level; +- +- bkey_copy(&w->data->btree_root, &c->root->key); +- bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); +- +- w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; +- w->data->magic = jset_magic(&ca->sb); +- w->data->version = BCACHE_JSET_VERSION; +- w->data->last_seq = last_seq(&c->journal); +- w->data->csum = csum_set(w->data); +- + for (i = 0; i < KEY_PTRS(k); i++) { + ca = c->cache; + bio = &ca->journal.bio; +@@ -793,7 +777,6 @@ static void journal_write_unlocked(struct closure *cl) + + ca->journal.seq[ca->journal.cur_idx] = w->data->seq; + } +- + /* If KEY_PTRS(k) == 0, this jset gets lost in air */ + BUG_ON(i == 0); + +@@ -805,6 +788,71 @@ static void journal_write_unlocked(struct closure *cl) + + while ((bio = bio_list_pop(&list))) + closure_bio_submit(c, bio, cl); ++} ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static void __journal_nvdimm_write_unlocked(struct cache_set *c) ++ __releases(c->journal.lock) ++{ ++ struct journal_write *w = c->journal.cur; ++ struct cache *ca = c->cache; ++ unsigned int sectors; ++ ++ sectors = set_blocks(w->data, block_bytes(ca)) * ca->sb.block_size; ++ atomic_long_add(sectors, &ca->meta_sectors_written); ++ ++ memcpy_flushcache((void *)c->journal.key.ptr[0], w->data, sectors << 9); ++ ++ c->journal.key.ptr[0] += sectors << 9; ++ ca->journal.seq[ca->journal.cur_idx] = w->data->seq; ++ ++ atomic_dec_bug(&fifo_back(&c->journal.pin)); ++ bch_journal_next(&c->journal); ++ journal_reclaim(c); ++ ++ spin_unlock(&c->journal.lock); ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++static void journal_write_unlocked(struct closure *cl) ++{ ++ struct cache_set *c = container_of(cl, struct cache_set, journal.io); ++ struct cache *ca = c->cache; ++ struct journal_write *w = c->journal.cur; ++ ++ if (!w->need_write) { ++ closure_return_with_destructor(cl, journal_write_unlock); ++ return; ++ } else if (journal_full(&c->journal)) { ++ journal_reclaim(c); ++ spin_unlock(&c->journal.lock); ++ ++ btree_flush_write(c); ++ continue_at(cl, journal_write, bch_journal_wq); ++ return; ++ } ++ ++ c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); ++ ++ w->data->btree_level = c->root->level; ++ ++ bkey_copy(&w->data->btree_root, &c->root->key); ++ bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); ++ ++ w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; ++ w->data->magic = jset_magic(&ca->sb); ++ w->data->version = BCACHE_JSET_VERSION; ++ w->data->last_seq = last_seq(&c->journal); ++ w->data->csum = csum_set(w->data); ++ ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ __journal_write_unlocked(c); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ __journal_nvdimm_write_unlocked(c); ++#endif + + continue_at(cl, journal_write_done, NULL); + } +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index a27fa65d8832..45b69ddc9cfa 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1679,7 +1679,7 @@ void bch_cache_set_release(struct kobject *kobj) + static void cache_set_free(struct closure *cl) + { + struct cache_set *c = container_of(cl, struct cache_set, cl); +- struct cache *ca; ++ struct cache *ca = c->cache; + + debugfs_remove(c->debug); + +@@ -1691,7 +1691,6 @@ static void cache_set_free(struct closure *cl) + bch_bset_sort_state_free(&c->sort); + free_pages((unsigned long) c->uuids, 
ilog2(meta_bucket_pages(&c->cache->sb))); + +- ca = c->cache; + if (ca) { + ca->set = NULL; + c->cache = NULL; +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0011-bcache-read-jset-from-NVDIMM-pages-for-journal-r.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0011-bcache-read-jset-from-NVDIMM-pages-for-journal-r.patch new file mode 100644 index 0000000..77ca2b5 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0011-bcache-read-jset-from-NVDIMM-pages-for-journal-r.patch @@ -0,0 +1,182 @@ +From 2e1f37377d63412b139e8aa55a8731bf95c91767 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:54:12 +0800 +Subject: [PATCH v13 11/12] bcache: read jset from NVDIMM pages for journal + replay + +This patch implements two methods to read jset from media for journal +replay, +- __jnl_rd_bkt() for block device + This is the legacy method to read jset via block device interface. +- __jnl_rd_nvm_bkt() for NVDIMM + This is the method to read jset from NVDIMM memory interface, a.k.a + memcopy() from NVDIMM pages to DRAM pages. + +If BCH_FEATURE_INCOMPAT_NVDIMM_META is set in incompat feature set, +during running cache set, journal_read_bucket() will read the journal +content from NVDIMM by __jnl_rd_nvm_bkt(). The linear addresses of +NVDIMM pages to read jset are stored in sb.d[SB_JOURNAL_BUCKETS], which +were initialized and maintained in previous runs of the cache set. + +A thing should be noticed is, when bch_journal_read() is called, the +linear address of NVDIMM pages is not loaded and initialized yet, it +is necessary to call __bch_journal_nvdimm_init() before reading the jset +from NVDIMM pages. + +The code comments added in journal_read_bucket() is noticed by kernel +test robot and Dan Carpenter, it explains why it is safe to only check +!bch_has_feature_nvdimm_meta() condition in the if() statement when +CONFIG_BCACHE_NVM_PAGES is not configured. To avoid confusion from the +bogus warning message from static checking tool. 
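As a stand-alone illustration of the unit handling described above, the
following user-space sketch mimics the arithmetic __jnl_rd_nvm_bkt() does:
sb.d[bkt_idx] holds an nvm-pages offset, while the offset and len arguments
are counted in 512-byte sectors, so both are shifted left by 9 to become
byte quantities. Here nvmpg_offset_to_ptr() is only a stub standing in for
bch_nvmpg_offset_to_ptr(), and all buffer sizes and example values are made
up for the sketch.

  #include <stdio.h>
  #include <string.h>
  #include <stdint.h>

  static unsigned char pmem[1 << 20];        /* pretend NVDIMM mapping */
  static unsigned char jset_buf[8 << 9];     /* stand-in for journal.w[0].data */

  /* stub: the real helper maps an nvm-pages offset to a kernel linear address */
  static void *nvmpg_offset_to_ptr(uint64_t nvmpg_offset)
  {
          return pmem + nvmpg_offset;
  }

  int main(void)
  {
          uint64_t sb_d_entry = 64 << 9;  /* what sb.d[bkt_idx] might hold   */
          unsigned int offset = 16;       /* sectors already read in bucket  */
          unsigned int len = 8;           /* sectors of jset to copy         */
          void *jset_addr;

          /* same shape as: bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9) */
          jset_addr = (unsigned char *)nvmpg_offset_to_ptr(sb_d_entry) +
                      ((size_t)offset << 9);

          /* same shape as: memcpy(data, jset_addr, len << 9) */
          memcpy(jset_buf, jset_addr, (size_t)len << 9);

          printf("copied %u sectors (%u bytes), %u sectors into the bucket\n",
                 len, len << 9, offset);
          return 0;
  }

Running the sketch simply reports copying 8 sectors (4096 bytes); the point
is that every sector count in the NVDIMM read path turns into a byte count
with the same << 9 shift that the bio path applies to bi_size.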
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reported-by: kernel test robot <lkp@intel.com> +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 88 ++++++++++++++++++++++++++++++------- + 1 file changed, 71 insertions(+), 17 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 7d5c5ed18890..902992be9191 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -34,18 +34,60 @@ static void journal_read_endio(struct bio *bio) + closure_put(cl); + } + ++static struct jset *__jnl_rd_bkt(struct cache *ca, unsigned int bkt_idx, ++ unsigned int len, unsigned int offset, ++ struct closure *cl) ++{ ++ sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bkt_idx]); ++ struct bio *bio = &ca->journal.bio; ++ struct jset *data = ca->set->journal.w[0].data; ++ ++ bio_reset(bio); ++ bio->bi_iter.bi_sector = bucket + offset; ++ bio_set_dev(bio, ca->bdev); ++ bio->bi_iter.bi_size = len << 9; ++ ++ bio->bi_end_io = journal_read_endio; ++ bio->bi_private = cl; ++ bio_set_op_attrs(bio, REQ_OP_READ, 0); ++ bch_bio_map(bio, data); ++ ++ closure_bio_submit(ca->set, bio, cl); ++ closure_sync(cl); ++ ++ /* Indeed journal.w[0].data */ ++ return data; ++} ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static struct jset *__jnl_rd_nvm_bkt(struct cache *ca, unsigned int bkt_idx, ++ unsigned int len, unsigned int offset) ++{ ++ void *jset_addr; ++ struct jset *data; ++ ++ jset_addr = bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9); ++ data = ca->set->journal.w[0].data; ++ ++ memcpy(data, jset_addr, len << 9); ++ ++ /* Indeed journal.w[0].data */ ++ return data; ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ + static int journal_read_bucket(struct cache *ca, struct list_head *list, + unsigned int bucket_index) + { + struct journal_device *ja = &ca->journal; +- struct bio *bio = &ja->bio; + + struct journal_replay *i; +- struct jset *j, *data = ca->set->journal.w[0].data; ++ struct jset *j; + struct closure cl; + unsigned int len, left, offset = 0; + int ret = 0; +- sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); + + closure_init_stack(&cl); + +@@ -55,26 +97,27 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, + reread: left = ca->sb.bucket_size - offset; + len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); + +- bio_reset(bio); +- bio->bi_iter.bi_sector = bucket + offset; +- bio_set_dev(bio, ca->bdev); +- bio->bi_iter.bi_size = len << 9; +- +- bio->bi_end_io = journal_read_endio; +- bio->bi_private = &cl; +- bio_set_op_attrs(bio, REQ_OP_READ, 0); +- bch_bio_map(bio, data); +- +- closure_bio_submit(ca->set, bio, &cl); +- closure_sync(&cl); ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ j = __jnl_rd_bkt(ca, bucket_index, len, offset, &cl); ++ /* ++ * If CONFIG_BCACHE_NVM_PAGES is not defined, the feature bit ++ * BCH_FEATURE_INCOMPAT_NVDIMM_META won't in incompatible ++ * support feature set, a cache device format with feature bit ++ * BCH_FEATURE_INCOMPAT_NVDIMM_META will fail much earlier in ++ * read_super() by bch_has_unknown_incompat_features(). ++ * Therefore when CONFIG_BCACHE_NVM_PAGES is not define, it is ++ * safe to ignore the bch_has_feature_nvdimm_meta() condition. 
++ */ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ j = __jnl_rd_nvm_bkt(ca, bucket_index, len, offset); ++#endif + + /* This function could be simpler now since we no longer write + * journal entries that overlap bucket boundaries; this means + * the start of a bucket will always have a valid journal entry + * if it has any journal entries at all. + */ +- +- j = data; + while (len) { + struct list_head *where; + size_t blocks, bytes = set_bytes(j); +@@ -170,6 +213,8 @@ reread: left = ca->sb.bucket_size - offset; + return ret; + } + ++static int __bch_journal_nvdimm_init(struct cache *ca); ++ + int bch_journal_read(struct cache_set *c, struct list_head *list) + { + #define read_bucket(b) \ +@@ -188,6 +233,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) + unsigned int i, l, r, m; + uint64_t seq; + ++ /* ++ * Linear addresses of NVDIMM pages for journaling is not ++ * initialized yet, do it before read jset from NVDIMM pages. ++ */ ++ if (bch_has_feature_nvdimm_meta(&ca->sb)) { ++ if (__bch_journal_nvdimm_init(ca) < 0) ++ return -ENXIO; ++ } ++ + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0012-bcache-add-sysfs-interface-register_nvdimm_meta-.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0012-bcache-add-sysfs-interface-register_nvdimm_meta-.patch new file mode 100644 index 0000000..b2f0330 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0012-bcache-add-sysfs-interface-register_nvdimm_meta-.patch @@ -0,0 +1,84 @@ +From e1f37c78f682ca8d7d0dee51ee8a0ee884f92df5 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:55:25 +0800 +Subject: [PATCH v13 12/12] bcache: add sysfs interface register_nvdimm_meta to + register NVDIMM meta device + +This patch adds a sysfs interface register_nvdimm_meta to register +NVDIMM meta device. The sysfs interface file only shows up when +CONFIG_BCACHE_NVM_PAGES=y. 
Then a NVDIMM name space formatted by +bcache-tools can be registered into bcache by e.g., + echo /dev/pmem0 > /sys/fs/bcache/register_nvdimm_meta + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/super.c | 29 +++++++++++++++++++++++++++++ + 1 file changed, 29 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 45b69ddc9cfa..2b9cde44879b 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2405,10 +2405,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, size_t size); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++static ssize_t register_nvdimm_meta(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, size_t size); ++#endif + + kobj_attribute_write(register, register_bcache); + kobj_attribute_write(register_quiet, register_bcache); + kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++kobj_attribute_write(register_nvdimm_meta, register_nvdimm_meta); ++#endif + + static bool bch_is_open_backing(dev_t dev) + { +@@ -2522,6 +2530,24 @@ static void register_device_async(struct async_reg_args *args) + queue_delayed_work(system_wq, &args->reg_work, 10); + } + ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++static ssize_t register_nvdimm_meta(struct kobject *k, struct kobj_attribute *attr, ++ const char *buffer, size_t size) ++{ ++ ssize_t ret = size; ++ ++ struct bch_nvmpg_ns *ns = bch_register_namespace(buffer); ++ ++ if (IS_ERR(ns)) { ++ pr_err("register nvdimm namespace %s for meta device failed.\n", ++ buffer); ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++#endif ++ + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size) + { +@@ -2864,6 +2890,9 @@ static int __init bcache_init(void) + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ &ksysfs_register_nvdimm_meta.attr, ++#endif + &ksysfs_pendings_cleanup.attr, + NULL + }; +-- +2.31.1 + diff --git a/for-test/0001-bcache-avoid-unnecessary-soft-lockup-in-kworker-upda.patch b/for-test/0001-bcache-avoid-unnecessary-soft-lockup-in-kworker-upda.patch new file mode 100644 index 0000000..d2727ea --- /dev/null +++ b/for-test/0001-bcache-avoid-unnecessary-soft-lockup-in-kworker-upda.patch @@ -0,0 +1,166 @@ +From 8ddc4c14ecef71ebc56d86ad0fd6721d348898d0 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 29 Mar 2022 00:08:49 +0800 +Subject: [PATCH] bcache: avoid unnecessary soft lockup in kworker + update_writeback_rate() + +The kworker routine update_writeback_rate() is schedued to update the +writeback rate in every 5 seconds by default. Before calling +__update_writeback_rate() to do real job, semaphore dc->writeback_lock +should be held by the kworker routine. + +At the same time, bcache writeback thread routine bch_writeback_thread() +also needs to hold dc->writeback_lock before flushing dirty data back +into the backing device. 
If the dirty data set is large, it might be +very long time for bch_writeback_thread() to scan all dirty buckets and +releases dc->writeback_lock. In such case update_writeback_rate() can be +starved for long enough time so that kernel reports a soft lockup warn- +ing started like: + watchdog: BUG: soft lockup - CPU#246 stuck for 23s! [kworker/246:31:179713] + +Such soft lockup condition is unnecessary, because after the writeback +thread finishes its job and releases dc->writeback_lock, the kworker +update_writeback_rate() may continue to work and everything is fine +indeed. + +This patch avoids the unnecessary soft lockup by the following method, +- Add new members to struct cached_dev + - dc->retry_nr (0 by default) + - dc->retry_max (6 by default) +- In update_writeback_rate() call down_read_trylock(&dc->writeback_lock) + firstly, if it fails then lock contention happens. If dc->retry_nr is + smaller than dc->retry_max, increase 1 to dc->retry_nr, and reschedule + the kworker to retry after a bit long time. +- If lock contention happens and dc->retry_nr is equal to dc->retry_max, + no retry anymore and call down_read(&dc->writeback_lock) to wait for the + lock. + +By the above method, at worst case update_writeback_rate() may retry for +2+ minutes before blocking on dc->writeback_lock by calling down_read(). +For a 4TB cache device with 1TB dirty data, 90%+ of the unnecessary soft +lockup warning message can be avoided. + +When retrying to acquire dc->writeback_lock in update_writeback_rate(), +of course the writeback rate cannot be updated. It is fair, because when +the kworker is blocked on the lock contention of dc->writeback_lock, the +writeback rate cannot be updated neither. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 7 +++++ + drivers/md/bcache/writeback.c | 49 +++++++++++++++++++++++++++++++---- + 2 files changed, 51 insertions(+), 5 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 9ed9c955add7..82b86b874294 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -395,6 +395,13 @@ struct cached_dev { + atomic_t io_errors; + unsigned int error_limit; + unsigned int offline_seconds; ++ ++ /* ++ * Retry to update writeback_rate if contention happens for ++ * down_read(dc->writeback_lock) in update_writeback_rate() ++ */ ++ unsigned int retry_nr; ++ unsigned int retry_max; + }; + + enum alloc_reserve { +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 9ee0005874cd..dbe90b9b2940 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -214,6 +214,7 @@ static void update_writeback_rate(struct work_struct *work) + struct cached_dev, + writeback_rate_update); + struct cache_set *c = dc->disk.c; ++ bool contention = false; + + /* + * should check BCACHE_DEV_RATE_DW_RUNNING before calling +@@ -235,6 +236,7 @@ static void update_writeback_rate(struct work_struct *work) + return; + } + ++ + if (atomic_read(&dc->has_dirty) && dc->writeback_percent) { + /* + * If the whole cache set is idle, set_at_max_writeback_rate() +@@ -243,13 +245,44 @@ static void update_writeback_rate(struct work_struct *work) + * in maximum writeback rate number(s). 
+ */ + if (!set_at_max_writeback_rate(c, dc)) { +- down_read(&dc->writeback_lock); +- __update_writeback_rate(dc); +- update_gc_after_writeback(c); +- up_read(&dc->writeback_lock); ++ /* ++ * When contention happens on dc->writeback_lock with ++ * the writeback thread, this kwork may be blocked for ++ * very long time if there are too many dirty data to ++ * writeback, and kerne message will complain a (bogus) ++ * software lockup kernel message. To avoid potential ++ * starving, if down_read_trylock() fails, writeback ++ * rate updating will be skipped for dc->retry_max times ++ * at most while delay this worker a bit longer time. ++ * If dc->retry_max times are tried and the trylock ++ * still fails, then call down_read() to wait for ++ * dc->writeback_lock. ++ */ ++ if (!down_read_trylock((&dc->writeback_lock))) { ++ contention = true; ++ ++ if (dc->retry_nr < dc->retry_max) { ++ dc->retry_nr++; ++ } else { ++ down_read(&dc->writeback_lock); ++ dc->retry_nr = 0; ++ } ++ } ++ ++ if (!dc->retry_nr) { ++ __update_writeback_rate(dc); ++ update_gc_after_writeback(c); ++ up_read(&dc->writeback_lock); ++ } + } + } + ++ /* ++ * In case no lock contention on dc->writeback_lock happens since ++ * last retry, e.g. cache is clean or I/O idle for a while. ++ */ ++ if (!contention && dc->retry_nr) ++ dc->retry_nr = 0; + + /* + * CACHE_SET_IO_DISABLE might be set via sysfs interface, +@@ -257,8 +290,10 @@ static void update_writeback_rate(struct work_struct *work) + */ + if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && + !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { ++ unsigned int scale = 1 + dc->retry_nr; ++ + schedule_delayed_work(&dc->writeback_rate_update, +- dc->writeback_rate_update_seconds * HZ); ++ dc->writeback_rate_update_seconds * scale * HZ); + } + + /* +@@ -1032,6 +1067,10 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) + dc->writeback_rate_fp_term_high = 1000; + dc->writeback_rate_i_term_inverse = 10000; + ++ /* For dc->writeback_lock contention in update_writeback_rate() */ ++ dc->retry_nr = 0; ++ dc->retry_max = 6; ++ + WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); + } +-- +2.34.1 + diff --git a/for-test/badblocks/v4/backup/0001-badblocks-add-more-helper-structure-and-routines-in-.patch b/for-test/badblocks/v4/backup/0001-badblocks-add-more-helper-structure-and-routines-in-.patch new file mode 100644 index 0000000..b1b4bae --- /dev/null +++ b/for-test/badblocks/v4/backup/0001-badblocks-add-more-helper-structure-and-routines-in-.patch @@ -0,0 +1,92 @@ +From db29a2e95f4ad4ec1ba58a71203a60ebd867d8c9 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 10:57:26 +0800 +Subject: [PATCH 1/6] badblocks: add more helper structure and routines in + badblocks.h + +This patch adds the following helper structure and routines into +badblocks.h, +- struct badblocks_context + This structure is used in improved badblocks code for bad table + iteration. +- BB_END() + The macro to culculate end LBA of a bad range record from bad + table. +- badblocks_full() and badblocks_empty() + The inline routines to check whether bad table is full or empty. +- set_changed() and clear_changed() + The inline routines to set and clear 'changed' tag from struct + badblocks. + +These new helper structure and routines can help to make the code more +clear, they will be used in the improved badblocks code in following +patches. 
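To make the encoding these helpers operate on concrete: each bad table entry
is a single u64 that packs the start sector, the length minus one and the
acknowledged bit (the top bit), exactly as BB_MAKE() composes it. The small
stand-alone sketch below re-expresses BB_OFFSET()/BB_LEN()/BB_ACK() with
explicit shifts so it compiles without the kernel mask constants; the bit
layout (9-bit length, ack in bit 63) is assumed from BB_MAKE() and the
BB_MAX_LEN limit, and the sample values are arbitrary.

  #include <stdio.h>
  #include <stdint.h>

  typedef uint64_t u64;

  /* same composition as the kernel's BB_MAKE(a, l, ack) */
  #define BB_MAKE(a, l, ack)  (((u64)(a) << 9) | ((l) - 1) | ((u64)(!!(ack)) << 63))

  /* assumed-equivalent decoders: bits 0-8 length-1, bits 9-62 offset, bit 63 ack */
  #define BB_OFFSET(x)        (((x) & 0x7fffffffffffffffULL) >> 9)
  #define BB_LEN(x)           (((x) & 0x1ffULL) + 1)
  #define BB_ACK(x)           (!!((x) >> 63))
  #define BB_END(x)           (BB_OFFSET(x) + BB_LEN(x))

  int main(void)
  {
          /* an acknowledged bad range starting at sector 32, 12 sectors long */
          u64 e = BB_MAKE(32, 12, 1);

          printf("offset=%llu len=%llu ack=%d end=%llu\n",
                 (unsigned long long)BB_OFFSET(e),
                 (unsigned long long)BB_LEN(e),
                 (int)BB_ACK(e),
                 (unsigned long long)BB_END(e));  /* offset=32 len=12 ack=1 end=44 */
          return 0;
  }

BB_END() is simply the first sector after the range, which is why the later
merge and overlap helpers can compare it directly against the start sector
of a neighbouring entry in the bad table.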
+ +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + include/linux/badblocks.h | 32 ++++++++++++++++++++++++++++++++ + 1 file changed, 32 insertions(+) + +diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h +index 2426276b9bd3..e1a06bacb2a2 100644 +--- a/include/linux/badblocks.h ++++ b/include/linux/badblocks.h +@@ -15,6 +15,7 @@ + #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) + #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) + #define BB_ACK(x) (!!((x) & BB_ACK_MASK)) ++#define BB_END(x) (BB_OFFSET(x) + BB_LEN(x)) + #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) + + /* Bad block numbers are stored sorted in a single page. +@@ -41,6 +42,14 @@ struct badblocks { + sector_t size; /* in sectors */ + }; + ++struct badblocks_context { ++ sector_t start; ++ sector_t len; ++ int ack; ++ sector_t orig_start; ++ sector_t orig_len; ++}; ++ + int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors); + int badblocks_set(struct badblocks *bb, sector_t s, int sectors, +@@ -63,4 +72,27 @@ static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb) + } + badblocks_exit(bb); + } ++ ++static inline int badblocks_full(struct badblocks *bb) ++{ ++ return (bb->count >= MAX_BADBLOCKS); ++} ++ ++static inline int badblocks_empty(struct badblocks *bb) ++{ ++ return (bb->count == 0); ++} ++ ++static inline void set_changed(struct badblocks *bb) ++{ ++ if (bb->changed != 1) ++ bb->changed = 1; ++} ++ ++static inline void clear_changed(struct badblocks *bb) ++{ ++ if (bb->changed != 0) ++ bb->changed = 0; ++} ++ + #endif +-- +2.31.1 + diff --git a/for-test/badblocks/v4/backup/0002-badblocks-add-helper-routines-for-badblock-ranges-ha.patch b/for-test/badblocks/v4/backup/0002-badblocks-add-helper-routines-for-badblock-ranges-ha.patch new file mode 100644 index 0000000..62198ee --- /dev/null +++ b/for-test/badblocks/v4/backup/0002-badblocks-add-helper-routines-for-badblock-ranges-ha.patch @@ -0,0 +1,456 @@ +From d24ea1527077d06b0b579bbf7d1128d94af15d70 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 17:16:57 +0800 +Subject: [PATCH 2/6] badblocks: add helper routines for badblock ranges + handling + +This patch adds several helper routines to improve badblock ranges +handling. These helper routines will be used later in the improved +version of badblocks_set()/badblocks_clear()/badblocks_check(). + +- Helpers prev_by_hint() and prev_badblocks() are used to find the bad + range from bad table which the searching range starts at or after. + +- The following helpers are to decide the relative layout between the + manipulating range and existing bad block range from bad table. + - can_merge_behind() + Return 'true' if the manipulating range can backward merge with the + bad block range. + - can_merge_front() + Return 'true' if the manipulating range can forward merge with the + bad block range. + - can_combine_front() + Return 'true' if two adjacent bad block ranges before the + manipulating range can be merged. + - overlap_front() + Return 'true' if the manipulating range exactly overlaps with the + bad block range in front of its range. + - overlap_behind() + Return 'true' if the manipulating range exactly overlaps with the + bad block range behind its range. 
+ - can_front_overwrite() + Return 'true' if the manipulating range can forward overwrite the + bad block range in front of its range. + +- The following helpers are to add the manipulating range into the bad + block table. Different routine is called with the specific relative + layout between the maniplating range and other bad block range in the + bad block table. + - behind_merge() + Merge the maniplating range with the bad block range behind its + range, and return the number of merged length in unit of sector. + - front_merge() + Merge the maniplating range with the bad block range in front of + its range, and return the number of merged length in unit of sector. + - front_combine() + Combine the two adjacent bad block ranges before the manipulating + range into a larger one. + - front_overwrite() + Overwrite partial of whole bad block range which is in front of the + manipulating range. The overwrite may split existing bad block range + and generate more bad block ranges into the bad block table. + - insert_at() + Insert the manipulating range at a specific location in the bad + block table. + +All the above helpers are used in later patches to improve the bad block +ranges handling for badblocks_set()/badblocks_clear()/badblocks_check(). + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 374 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 374 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index d39056630d9c..e85a7cd23aad 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -16,6 +16,380 @@ + #include <linux/types.h> + #include <linux/slab.h> + ++/* ++ * Find the range starts at-or-before 's' from bad table. The search ++ * starts from index 'hint' and stops at index 'hint_end' from the bad ++ * table. ++ */ ++static int prev_by_hint(struct badblocks *bb, sector_t s, int hint) ++{ ++ int hint_end = hint + 2; ++ u64 *p = bb->page; ++ int ret = -1; ++ ++ while ((hint < hint_end) && ((hint + 1) <= bb->count) && ++ (BB_OFFSET(p[hint]) <= s)) { ++ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) { ++ ret = hint; ++ break; ++ } ++ hint++; ++ } ++ ++ return ret; ++} ++ ++/* ++ * Find the range starts at-or-before bad->start. If 'hint' is provided ++ * (hint >= 0) then search in the bad table from hint firstly. It is ++ * very probably the wanted bad range can be found from the hint index, ++ * then the unnecessary while-loop iteration can be avoided. ++ */ ++static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad, ++ int hint) ++{ ++ sector_t s = bad->start; ++ int ret = -1; ++ int lo, hi; ++ u64 *p; ++ ++ if (!bb->count) ++ goto out; ++ ++ if (hint >= 0) { ++ ret = prev_by_hint(bb, s, hint); ++ if (ret >= 0) ++ goto out; ++ } ++ ++ lo = 0; ++ hi = bb->count; ++ p = bb->page; ++ ++ while (hi - lo > 1) { ++ int mid = (lo + hi)/2; ++ sector_t a = BB_OFFSET(p[mid]); ++ ++ if (a <= s) ++ lo = mid; ++ else ++ hi = mid; ++ } ++ ++ if (BB_OFFSET(p[lo]) <= s) ++ ret = lo; ++out: ++ return ret; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be backward merged ++ * with the bad range (from the bad table) index by 'behind'. 
++ */ ++static bool can_merge_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ ++ if ((s <= BB_OFFSET(p[behind])) && ++ ((s + sectors) >= BB_OFFSET(p[behind])) && ++ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) && ++ BB_ACK(p[behind]) == ack) ++ return true; ++ return false; ++} ++ ++/* ++ * Do backward merge for range indicated by 'bad' and the bad range ++ * (from the bad table) indexed by 'behind'. The return value is merged ++ * sectors from bad->len. ++ */ ++static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_OFFSET(p[behind])); ++ WARN_ON((s + sectors) < BB_OFFSET(p[behind])); ++ ++ if (s < BB_OFFSET(p[behind])) { ++ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); ++ ++ merged = min_t(sector_t, sectors, BB_OFFSET(p[behind]) - s); ++ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, ack); ++ } else { ++ merged = min_t(sector_t, sectors, BB_LEN(p[behind])); ++ } ++ ++ WARN_ON(merged == 0); ++ ++ return merged; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be forward ++ * merged with the bad range (from the bad table) indexed by 'prev'. ++ */ ++static bool can_merge_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ ++ if (BB_ACK(p[prev]) == ack && ++ (s < BB_END(p[prev]) || ++ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN)))) ++ return true; ++ return false; ++} ++ ++/* ++ * Do forward merge for range indicated by 'bad' and the bad range ++ * (from bad table) indexed by 'prev'. The return value is sectors ++ * merged from bad->len. ++ */ ++static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_END(p[prev])); ++ ++ if (s < BB_END(p[prev])) { ++ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s); ++ } else { ++ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev])); ++ if ((prev + 1) < bb->count && ++ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) { ++ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]); ++ } ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + merged, ack); ++ } ++ ++ return merged; ++} ++ ++/* ++ * 'Combine' is a special case which can_merge_front() is not able to ++ * handle: If a bad range (indexed by 'prev' from bad table) exactly ++ * starts as bad->start, and the bad range ahead of 'prev' (indexed by ++ * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and ++ * the sum of their lengths does not exceed BB_MAX_LEN limitation, then ++ * these two bad range (from bad table) can be combined. ++ * ++ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad ++ * table can be combined. 
++ */ ++static bool can_combine_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if ((prev > 0) && ++ (BB_OFFSET(p[prev]) == bad->start) && ++ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) && ++ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) && ++ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev]))) ++ return true; ++ return false; ++} ++ ++/* ++ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad ++ * table) into one larger bad range, and the new range is indexed by ++ * 'prev - 1'. ++ */ ++static void front_combine(struct badblocks *bb, int prev) ++{ ++ u64 *p = bb->page; ++ ++ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]), ++ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]), ++ BB_ACK(p[prev])); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8); ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly forward ++ * overlapped with the bad range (from bad table) indexed by 'front'. ++ * Exactly forward overlap means the bad range (from bad table) indexed ++ * by 'prev' does not cover the whole range indicated by 'bad'. ++ */ ++static bool overlap_front(struct badblocks *bb, int front, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start >= BB_OFFSET(p[front]) && ++ bad->start < BB_END(p[front])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly backward ++ * overlapped with the bad range (from bad table) indexed by 'behind'. ++ */ ++static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start < BB_OFFSET(p[behind]) && ++ (bad->start + bad->len) > BB_OFFSET(p[behind])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can overwrite the bad ++ * range (from bad table) indexed by 'prev'. ++ * ++ * The range indicated by 'bad' can overwrite the bad range indexed by ++ * 'prev' when, ++ * 1) The whole range indicated by 'bad' can cover partial or whole bad ++ * range (from bad table) indexed by 'prev'. ++ * 2) The ack value of 'bad' is larger or equal to the ack value of bad ++ * range 'prev'. ++ * ++ * If the overwriting doesn't cover the whole bad range (from bad table) ++ * indexed by 'prev', new range might be split from existing bad range, ++ * 1) The overwrite covers head or tail part of existing bad range, 1 ++ * extra bad range will be split and added into the bad table. ++ * 2) The overwrite covers middle of existing bad range, 2 extra bad ++ * ranges will be split (ahead and after the overwritten range) and ++ * added into the bad table. ++ * The number of extra split ranges of the overwriting is stored in ++ * 'extra' and returned for the caller. ++ */ ++static bool can_front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *extra) ++{ ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(!overlap_front(bb, prev, bad)); ++ ++ if (BB_ACK(p[prev]) >= bad->ack) ++ return false; ++ ++ if (BB_END(p[prev]) <= (bad->start + bad->len)) { ++ len = BB_END(p[prev]) - bad->start; ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 0; ++ else ++ *extra = 1; ++ ++ bad->len = len; ++ } else { ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 1; ++ else ++ /* ++ * prev range will be split into two, beside the overwritten ++ * one, an extra slot needed from bad table. 
++ */ ++ *extra = 2; ++ } ++ ++ if ((bb->count + (*extra)) >= MAX_BADBLOCKS) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Do the overwrite from the range indicated by 'bad' to the bad range ++ * (from bad table) indexed by 'prev'. ++ * The previously called can_front_overwrite() will provide how many ++ * extra bad range(s) might be split and added into the bad table. All ++ * the splitting cases in the bad table will be handled here. ++ */ ++static int front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int extra) ++{ ++ u64 *p = bb->page; ++ sector_t orig_end = BB_END(p[prev]); ++ int orig_ack = BB_ACK(p[prev]); ++ int n = extra; ++ ++ switch (extra) { ++ case 0: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]), ++ bad->ack); ++ break; ++ case 1: ++ if (BB_OFFSET(p[prev]) == bad->start) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->len, bad->ack); ++ memmove(p + prev + 2, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start + bad->len, ++ orig_end - BB_END(p[prev]), ++ orig_ack); ++ } else { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ memmove(p + prev + 1 + n, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ } ++ break; ++ case 2: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ memmove(p + prev + 1 + n, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]), ++ orig_end - BB_END(p[prev + 1]), ++ BB_ACK(p[prev])); ++ break; ++ default: ++ break; ++ } ++ ++ return bad->len; ++} ++ ++/* ++ * Explicitly insert a range indicated by 'bad' to the bad table, where ++ * the location is indexed by 'at'. ++ */ ++static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(badblocks_full(bb)); ++ ++ len = min_t(sector_t, sectors, BB_MAX_LEN); ++ if (at < bb->count) ++ memmove(p + at + 1, p + at, (bb->count - at) * 8); ++ p[at] = BB_MAKE(s, len, ack); ++ ++ return len; ++} ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v4/backup/0003-badblocks-improvement-badblocks_set-for-multiple-ran.patch b/for-test/badblocks/v4/backup/0003-badblocks-improvement-badblocks_set-for-multiple-ran.patch new file mode 100644 index 0000000..31a7639 --- /dev/null +++ b/for-test/badblocks/v4/backup/0003-badblocks-improvement-badblocks_set-for-multiple-ran.patch @@ -0,0 +1,662 @@ +From b3bbd59d07b131df82410b615ed13a7c439bbd32 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 18:36:09 +0800 +Subject: [PATCH 3/6] badblocks: improvement badblocks_set() for multiple + ranges handling + +Recently I received a bug report that current badblocks code does not +properly handle multiple ranges. For example, + badblocks_set(bb, 32, 1, true); + badblocks_set(bb, 34, 1, true); + badblocks_set(bb, 36, 1, true); + badblocks_set(bb, 32, 12, true); +Then indeed badblocks_show() reports, + 32 3 + 36 1 +But the expected bad blocks table should be, + 32 12 +Obviously only the first 2 ranges are merged and badblocks_set() returns +and ignores the rest setting range. 
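The expected behaviour can be modelled in a few lines of user-space C. The
model_set() helper below is purely illustrative (it ignores acknowledgement
state, BB_MAX_LEN and the table-full case); it only demonstrates that
overlapping and adjacent set requests should collapse into one range:

  #include <stdio.h>

  static unsigned long long bb_start[16], bb_len[16];
  static int bb_count;

  /* naive model: absorb every stored range that overlaps or touches [s, s+l) */
  static void model_set(unsigned long long s, unsigned long long l)
  {
          unsigned long long e = s + l;
          int i, j;

          for (i = 0; i < bb_count; ) {
                  if (bb_start[i] <= e && bb_start[i] + bb_len[i] >= s) {
                          if (bb_start[i] < s)
                                  s = bb_start[i];
                          if (bb_start[i] + bb_len[i] > e)
                                  e = bb_start[i] + bb_len[i];
                          for (j = i; j < bb_count - 1; j++) {
                                  bb_start[j] = bb_start[j + 1];
                                  bb_len[j] = bb_len[j + 1];
                          }
                          bb_count--;
                  } else {
                          i++;
                  }
          }
          bb_start[bb_count] = s;
          bb_len[bb_count] = e - s;
          bb_count++;
  }

  int main(void)
  {
          int i;

          model_set(32, 1);
          model_set(34, 1);
          model_set(36, 1);
          model_set(32, 12);

          for (i = 0; i < bb_count; i++)
                  printf("%llu %llu\n", bb_start[i], bb_len[i]);
          return 0;
  }

The model prints the single line "32 12", which is what badblocks_show()
should report after the four calls above, while the current kernel code
stops after merging only the first two ranges.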
+ +This behavior is improper, if the caller of badblocks_set() wants to set +a range of blocks into bad blocks table, all of the blocks in the range +should be handled even the previous part encountering failure. + +The desired way to set bad blocks range by badblocks_set() is, +- Set as many as blocks in the setting range into bad blocks table. +- Merge the bad blocks ranges and occupy as less as slots in the bad + blocks table. +- Fast. + +Indeed the above proposal is complicated, especially with the following +restrictions, +- The setting bad blocks range can be ackknowledged or not acknowledged. +- The bad blocks table size is limited. +- Memory allocation should be avoided. + +The basic idea of the patch is to categorize all possible bad blocks +range setting combinationsinto to much less simplified and more less +special conditions. Inside badblocks_set() there is an implicit loop +composed by jumping between labels 're_insert' and 'update_sectors'. No +matter how large the setting bad blocks range is, in every loop just a +minimized range from the head is handled by a pre-defined behavior from +one of the categorized conditions. The logic is simple and code flow is +manageable. + +The different relative layout between the setting range and existing bad +block range are checked and handled (merge, combine, overwrite, insert) +by the helpers in previous patch. This patch is to make all the helpers +work together with the above idea. + +This patch only has the algorithm improvement for badblocks_set(). There +are following patches contain improvement for badblocks_clear() and +badblocks_check(). But the algorithm in badblocks_set() is fundamental +and typical, other improvement in clear and check routines are based on +all the helpers and ideas in this patch. + +In order to make the change to be more clear for code review, this patch +does not directly modify existing badblocks_set(), and just add a new +one named _badblocks_set(). Later patch will remove current existing +badblocks_set() code and make it as a wrapper of _badblocks_set(). So +the new added change won't be mixed with deleted code, the code review +can be easier. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 561 ++++++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 541 insertions(+), 20 deletions(-) + +diff --git a/block/badblocks.c b/block/badblocks.c +index e85a7cd23aad..95dceed0da3c 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -16,6 +16,322 @@ + #include <linux/types.h> + #include <linux/slab.h> + ++/* ++ * The purpose of badblocks set/clear is to manage bad blocks ranges which are ++ * identified by LBA addresses. ++ * ++ * When the caller of badblocks_set() wants to set a range of bad blocks, the ++ * setting range can be acked or unacked. And the setting range may merge, ++ * overwrite, skip the overlaypped already set range, depends on who they are ++ * overlapped or adjacent, and the acknowledgment type of the ranges. It can be ++ * more complicated when the setting range covers multiple already set bad block ++ * ranges, with restritctions of maximum length of each bad range and the bad ++ * table space limitation. 
++ * ++ * It is difficut and unnecessary to take care of all the possible situations, ++ * for setting a large range of bad blocks, we can handle it by dividing the ++ * large range into smaller ones when encounter overlap, max range length or ++ * bad table full conditions. Every time only a smaller piece of the bad range ++ * is handled with a limited number of conditions how it is interacted with ++ * possible overlapped or adjacent already set bad block ranges. Then the hard ++ * complicated problem can be much simpler to habndle in proper way. ++ * ++ * When setting a range of bad blocks to the bad table, the simplified situations ++ * to be considered are, (The already set bad blocks ranges are naming with ++ * prefix E, and the setting bad blocks range is naming with prefix S) ++ * ++ * 1) A setting range is not overlapped or adjacent to any other already set bad ++ * block range. ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ +-------------+ ++ * | E1 | | E2 | ++ * +-------------+ +-------------+ ++ * For this situation if the bad blocks table is not full, just allocate a ++ * free slot from the bad blocks table to mark the setting range S. The ++ * result is, ++ * +-------------+ +--------+ +-------------+ ++ * | E1 | | S | | E2 | ++ * +-------------+ +--------+ +-------------+ ++ * 2) A setting range starts exactly at a start LBA of an already set bad blocks ++ * range. ++ * 2.1) The setting range size < already set range size ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.1.2) If S is uncked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.3) If S is acked setting and E is unacked, range S can overwirte on E. ++ * An extra slot from the bad blocks table will be allocated for S, and head ++ * of E will move to end of the inserted range E. The result is, ++ * +--------+----+ ++ * | S | E | ++ * +--------+----+ ++ * 2.2) The setting range size == already set range size ++ * 2.2.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.2.2) If S is uncked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.2.3) If S is acked setting and E is unacked, range S can overwirte all of ++ bad blocks range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.3) The setting range size > already set range size ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For such situation, the setting range S can be treated as two parts, the ++ * first part (S1) is as same size as the already set range E, the second ++ * part (S2) is the rest of setting range. 
++ * +-------------+-----+ +-------------+ +-----+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +-------------+-----+ ===> +-------------+ +-----+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now we only focus on how to handle the setting range S1 and already set ++ * range E, which are already explained in 1.2), for the rest S2 it will be ++ * handled later in next loop. ++ * 3) A setting range starts before the start LBA of an already set bad blocks ++ * range. ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation, the setting range S can be divided into two parts, the ++ * first (S1) ends at the start LBA of already set range E, the second part ++ * (S2) starts exactly at a start LBA of the already set range E. ++ * +----+---------+ +----+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +----+---------+ ===> +----+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now only the first part S1 should be handled in this loop, which is in ++ * similar condition as 1). The rest part S2 has exact same start LBA address ++ * of the already set range E, they will be handled in next loop in one of ++ * situations in 2). ++ * 4) A setting range starts after the start LBA of an already set bad blocks ++ * range. ++ * 4.1) If the setting range S exactly matches the tail part of already set bad ++ * blocks range E, like the following chart shows, ++ * +---------+ ++ * | S | ++ * +---------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.1) If range S and E have same ackknowledg value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 4.1.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may ++ * overwrite the overlapped range of E, the result is, ++ * +---+---------+ ++ * | E | S | ++ * +---+---------+ ++ * 4.2) If the setting range S stays in middle of an already set range E, like ++ * the following chart shows, ++ * +----+ ++ * | S | ++ * +----+ ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.1) If range S and E have same ackknowledg value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 4.2.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is also, ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will ++ * inserted into middle of E and split previous range E into twp parts (E1 ++ * and E2), the result is, ++ * +----+----+----+ ++ * | E1 | S | E2 | ++ * +----+----+----+ ++ * 4.3) If the setting bad blocks range S is overlapped with an already set bad ++ * blocks range E. The range S starts after the start LBA of range E, and ++ * ends after the end LBA of range E, as the following chart shows, ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation the range S can be divided into two parts, the first ++ * part (S1) ends at end range E, and the second part (S2) has rest range of ++ * origin S. 
++ * +---------+---------+ +---------+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +---------+---------+ ===> +---------+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now in this loop the setting range S1 and already set range E can be ++ * handled as the situations 4), the rest range S2 will be handled in next ++ * loop and ignored in this loop. ++ * 5) A setting bad blocks range S is adjacent to one or more already set bad ++ * blocks range(s), and they are all acked or unacked range. ++ * 5.1) Front merge: If the already set bad blocks range E is before setting ++ * range S and they are adjacent, ++ * +------+ ++ * | S | ++ * +------+ ++ * +-------+ ++ * | E | ++ * +-------+ ++ * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge ++ * values are same, the setting range S can front merges into range E. The ++ * result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting ++ * range S right after already set range E into the bad blocks table. The ++ * result is, ++ * +--------+------+ ++ * | E | S | ++ * +--------+------+ ++ * 6) Special cases which above conditions cannot handle ++ * 6.1) Multiple already set ranges may merge into less ones in a full bad table ++ * +-------------------------------------------------------+ ++ * | S | ++ * +-------------------------------------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+ +-----+ +-----+ ++ * | E1 | | E2 | | E3 | ++ * +-----+ +-----+ +-----+ ++ * In the above example, when the bad blocks table is full, inserting the ++ * first part of setting range S will fail because no more available slot ++ * can be allocated from bad blocks table. In this situation a proper ++ * setting method should be go though all the setting bad blocks range and ++ * look for chance to merge already set ranges into less ones. When there ++ * is available slot from bad blocks table, re-try again to handle more ++ * setting bad blocks ranges as many as possible. ++ * +------------------------+ ++ * | S3 | ++ * +------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+-----+-----+---+-----+--+ ++ * | S1 | S2 | ++ * +-----+-----+-----+---+-----+--+ ++ * The above chart shows although the first part (S3) cannot be inserted due ++ * to no-space in bad blocks table, but the following E1, E2 and E3 ranges ++ * can be merged with rest part of S into less range S1 and S2. Now there is ++ * 1 free slot in bad blocks table. ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * | S3 | S1 | S2 | ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * Since the bad blocks table is not full anymore, re-try again for the ++ * origin setting range S. Now the setting range S3 can be inserted into the ++ * bad blocks table with previous freed slot from multiple ranges merge. ++ * 6.2) Front merge after overwrite ++ * In the following example, in bad blocks table, E1 is an acked bad blocks ++ * range and E2 is an unacked bad blocks range, therefore they are not able ++ * to merge into a larger range. The setting bad blocks range S is acked, ++ * therefore part of E2 can be overwritten by S. 
++ * +--------+ ++ * | S | acknowledged ++ * +--------+ S: 1 ++ * +-------+-------------+ E1: 1 ++ * | E1 | E2 | E2: 0 ++ * +-------+-------------+ ++ * With previosu simplified routines, after overwiting part of E2 with S, ++ * the bad blocks table should be (E3 is remaining part of E2 which is not ++ * overwritten by S), ++ * acknowledged ++ * +-------+--------+----+ S: 1 ++ * | E1 | S | E3 | E1: 1 ++ * +-------+--------+----+ E3: 0 ++ * The above result is correct but not perfect. Range E1 and S in the bad ++ * blocks table are all acked, merging them into a larger one range may ++ * occupy less bad blocks table space and make badblocks_check() faster. ++ * Therefore in such situation, after overwiting range S, the previous range ++ * E1 should be checked for possible front combination. Then the ideal ++ * result can be, ++ * +----------------+----+ acknowledged ++ * | E1 | E3 | E1: 1 ++ * +----------------+----+ E3: 0 ++ * 6.3) Behind merge: If the already set bad blocks range E is behind the setting ++ * range S and they are adjacent. Normally we don't need to care about this ++ * because front merge handles this while going though range S from head to ++ * tail, except for the tail part of range S. When the setting range S are ++ * fully handled, all the above simplified routine doesn't check whether the ++ * tail LBA of range S is adjacent to the next already set range and not able ++ * to them if they are mergeable. ++ * +------+ ++ * | S | ++ * +------+ ++ * +-------+ ++ * | E | ++ * +-------+ ++ * For the above special stiuation, when the setting range S are all handled ++ * and the loop ends, an extra check is necessary for whether next already ++ * set range E is right after S and mergeable. ++ * 6.2.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge ++ * values are same, the setting range S can behind merges into range E. The ++ * result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 6.2.2) Otherwise these two ranges cannot merge, just insert the setting range ++ * S infront of the already set range E in the bad blocks table. The result ++ * is, ++ * +------+-------+ ++ * | S | E | ++ * +------+-------+ ++ * ++ * All the above 5 simplified situations and 3 special cases may cover 99%+ of ++ * the bad block range setting conditions. Maybe there is some rare corner case ++ * is not considered and optimized, it won't hurt if badblocks_set() fails due ++ * to no space, or some ranges are not merged to save bad blocks table space. ++ * ++ * Inside badblocks_set() each loop starts by jumping to re_insert label, every ++ * time for the new loop prev_badblocks() is called to find an already set range ++ * which starts before or at current setting range. Since the setting bad blocks ++ * range is handled from head to tail, most of the cases it is unnecessary to do ++ * the binary search inside prev_badblocks(), it is possible to provide a hint ++ * to prev_badblocks() for a fast path, then the expensive binary search can be ++ * avoided. In my test with the hint to prev_badblocks(), except for the first ++ * loop, all rested calls to prev_badblocks() can go into the fast path and ++ * return correct bad blocks table index immediately. ++ */ ++ + /* + * Find the range starts at-or-before 's' from bad table. 
The search + * starts from index 'hint' and stops at index 'hint_end' from the bad +@@ -390,6 +706,231 @@ static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad + return len; + } + ++static void badblocks_update_acked(struct badblocks *bb) ++{ ++ u64 *p = bb->page; ++ int i; ++ bool unacked = false; ++ ++ if (!bb->unacked_exist) ++ return; ++ ++ for (i = 0; i < bb->count ; i++) { ++ if (!BB_ACK(p[i])) { ++ unacked = true; ++ break; ++ } ++ } ++ ++ if (!unacked) ++ bb->unacked_exist = 0; ++} ++ ++/* Do exact work to set bad block range into the bad block table */ ++static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, ++ int acknowledged) ++{ ++ u64 *p; ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ int len = 0, added = 0; ++ int retried = 0, space_desired = 0; ++ int rv = 0; ++ unsigned long flags; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ /* round the start down, and the end up */ ++ sector_t next = s + sectors; ++ ++ rounddown(s, bb->shift); ++ roundup(next, bb->shift); ++ sectors = next - s; ++ } ++ ++ write_seqlock_irqsave(&bb->lock, flags); ++ ++ bad.orig_start = s; ++ bad.orig_len = sectors; ++ bad.ack = acknowledged; ++ p = bb->page; ++ ++re_insert: ++ bad.start = s; ++ bad.len = sectors; ++ len = 0; ++ ++ if (badblocks_empty(bb)) { ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start before all badblocks */ ++ if (prev < 0) { ++ if (!badblocks_full(bb)) { ++ /* insert on the first */ ++ if (bad.len > (BB_OFFSET(p[0]) - bad.start)) ++ bad.len = BB_OFFSET(p[0]) - bad.start; ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* No sapce, try to merge */ ++ if (overlap_behind(bb, &bad, 0)) { ++ if (can_merge_behind(bb, &bad, 0)) { ++ len = behind_merge(bb, &bad, 0); ++ added++; ++ } else { ++ len = min_t(sector_t, ++ BB_OFFSET(p[0]) - s, sectors); ++ space_desired = 1; ++ } ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* no table space and give up */ ++ goto out; ++ } ++ ++ /* in case p[prev-1] can be merged with p[prev] */ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ hint = prev; ++ } else { ++ int extra = 0; ++ ++ if (!can_front_overwrite(bb, prev, &bad, &extra)) { ++ len = min_t(sector_t, ++ BB_END(p[prev]) - s, sectors); ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ len = front_overwrite(bb, prev, &bad, extra); ++ added++; ++ bb->count += extra; ++ hint = prev; ++ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ hint = prev - 1; ++ } ++ } ++ goto update_sectors; ++ } ++ ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ /* if no space in table, still try to merge in the covered range */ ++ if (badblocks_full(bb)) { ++ /* skip the cannot-merge range */ ++ if (((prev + 1) < bb->count) && ++ overlap_behind(bb, &bad, prev + 1) && ++ ((s + sectors) >= BB_END(p[prev + 1]))) { ++ len = BB_END(p[prev + 1]) - s; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* no retry any 
more */ ++ len = sectors; ++ space_desired = 1; ++ hint = -1; ++ goto update_sectors; ++ } ++ ++ /* cannot merge and there is space in bad table */ ++ if ((prev + 1) < bb->count && ++ overlap_behind(bb, &bad, prev + 1)) ++ bad.len = min_t(sector_t, ++ bad.len, BB_OFFSET(p[prev + 1]) - bad.start); ++ ++ len = insert_at(bb, prev + 1, &bad); ++ bb->count++; ++ added++; ++ hint = prev + 1; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_insert; ++ ++ WARN_ON(sectors < 0); ++ ++ /* Check whether the following already set range can be merged */ ++ if ((prev + 1) < bb->count && ++ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && ++ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && ++ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), ++ BB_ACK(p[prev])); ++ ++ if ((prev + 2) < bb->count) ++ memmove(p + prev + 1, p + prev + 2, ++ (bb->count - (prev + 2)) * 8); ++ bb->count--; ++ } ++ ++ if (space_desired && !badblocks_full(bb)) { ++ s = bad.orig_start; ++ sectors = bad.orig_len; ++ space_desired = 0; ++ if (retried++ < 3) ++ goto re_insert; ++ } ++ ++out: ++ if (added) { ++ set_changed(bb); ++ ++ if (!acknowledged) ++ bb->unacked_exist = 1; ++ else ++ badblocks_update_acked(bb); ++ } ++ ++ write_sequnlock_irqrestore(&bb->lock, flags); ++ ++ if (!added) ++ rv = 1; ++ ++ return rv; ++} ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +@@ -499,26 +1040,6 @@ int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + } + EXPORT_SYMBOL_GPL(badblocks_check); + +-static void badblocks_update_acked(struct badblocks *bb) +-{ +- u64 *p = bb->page; +- int i; +- bool unacked = false; +- +- if (!bb->unacked_exist) +- return; +- +- for (i = 0; i < bb->count ; i++) { +- if (!BB_ACK(p[i])) { +- unacked = true; +- break; +- } +- } +- +- if (!unacked) +- bb->unacked_exist = 0; +-} +- + /** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v4/backup/0004-badblocks-improve-badblocks_clear-for-multiple-range.patch b/for-test/badblocks/v4/backup/0004-badblocks-improve-badblocks_clear-for-multiple-range.patch new file mode 100644 index 0000000..4cbfd5e --- /dev/null +++ b/for-test/badblocks/v4/backup/0004-badblocks-improve-badblocks_clear-for-multiple-range.patch @@ -0,0 +1,401 @@ +From b75e0792f127a99f068d635421ffac52843b488c Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 22:16:10 +0800 +Subject: [PATCH 4/6] badblocks: improve badblocks_clear() for multiple ranges + handling + +With the foundamental ideas and helper routines from badblocks_set() +improvement, clearing bad block for multiple ranges is much simpler. + +With a similar idea from badblocks_set() improvement, this patch +simplifies bad block range clearing into 5 situations. No matter how +complicated the clearing condition is, we just look at the head part +of clearing range with relative already set bad block range from the +bad block table. The rested part will be handled in next run of the +while-loop. + +Based on existing helpers added from badblocks_set(), this patch adds +two more helpers, +- front_clear() + Clear the bad block range from bad block table which is front + overlapped with the clearing range. 
+- front_splitting_clear() + Handle the condition that the clearing range hits middle of an + already set bad block range from bad block table. + +Similar as badblocks_set(), the first part of clearing range is handled +with relative bad block range which is find by prev_badblocks(). In most +cases a valid hint is provided to prev_badblocks() to avoid unnecessary +bad block table iteration. + +This patch also explains the detail algorithm code comments at beginning +of badblocks.c, including which five simplified situations are categried +and how all the bad block range clearing conditions are handled by these +five situations. + +Again, in order to make the code review easier and avoid the code +changes mixed together, this patch does not modify badblock_clear() and +implement another routine called _badblock_clear() for the improvement. +Later patch will delete current code of badblock_clear() and make it as +a wrapper to _badblock_clear(), so the code change can be much clear for +review. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 327 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 327 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index 95dceed0da3c..b9a4cd64b840 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -330,6 +330,123 @@ + * avoided. In my test with the hint to prev_badblocks(), except for the first + * loop, all rested calls to prev_badblocks() can go into the fast path and + * return correct bad blocks table index immediately. ++ * ++ * ++ * Clearing a bad blocks range from the bad block table has similar idea as ++ * setting does, but much more simpler. The only thing needs to be noticed is ++ * when the clearning range hits middle of a bad block range, the existing bad ++ * block range will split into two, and one more item should be added into the ++ * bad block table. The simplified situations to beconsidered are, (The already ++ * set bad blocks ranges in bad block table are naming with prefix E, and the ++ * clearing bad blocks range is naming with prefix C) ++ * ++ * 1) A clearing range is not overlapped to any already set ranges in bad block ++ * table. ++ * +-----+ | +-----+ | +-----+ ++ * | C | | | C | | | C | ++ * +-----+ or +-----+ or +-----+ ++ * +---+ | +----+ +----+ | +---+ ++ * | E | | | E1 | | E2 | | | E | ++ * +---+ | +----+ +----+ | +---+ ++ * For the above situations, no bad block to be cleared and no failure ++ * happens, simply returns 0. ++ * 2) The clearing range hits middle of an already setting bad blocks range in ++ * the bad block table. ++ * +---+ ++ * | C | ++ * +---+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * In this situation if the bad block table is not full, the range E will be ++ * split into two ranges E1 and E2. The result is, ++ * +------+ +------+ ++ * | E1 | | E2 | ++ * +------+ +------+ ++ * 3) The clearing range starts exactly at same LBA as an already set bad block range ++ * from the bad block table. ++ * 3.1) Partially covered at head part ++ * +------------+ ++ * | C | ++ * +------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation, the overlapped already set range will update the ++ * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). 
No ++ * item deleted from bad block table. The result is, ++ * +----+ ++ * | E1 | ++ * +----+ ++ * 3.2) Exact fully covered ++ * +-----------------+ ++ * | C | ++ * +-----------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation the whole bad blocks range E will be cleared and its ++ * corresponded item is deleted from the bad block table. ++ * 4) The clearing range exactly ends at same LBA as an already set bad block ++ * range. ++ * +-------+ ++ * | C | ++ * +-------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For the above situation, the already set range E is updated to shrink its ++ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C). ++ * The result is, ++ * +---------+ ++ * | E | ++ * +---------+ ++ * 5) The clearing range is partially overlapped with an already set bad block ++ * range from the bad block table. ++ * 5.1) The already set bad block range is front overlapped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. The ++ * first part ends at the start LBA of range E, and the second part starts at ++ * same LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part C1 can be handled as condition 1), and the second part C2 can be ++ * handled as condition 3.1) in next loop. ++ * 5.2) The already set bad block range is behind overlaopped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. The ++ * first part C1 ends at same end LBA of range E, and the second part starts ++ * at end LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part clearing range C1 can be handled as condition 4), and ++ * the second part clearing range C2 can be handled as condition 1) in next ++ * loop. ++ * ++ * All bad blocks range clearing can be simplified into the above 5 situations ++ * by only handling the head part of the clearing range in each run of the ++ * while-loop. The idea is similar to bad blocks range setting but much ++ * simpler. + */ + + /* +@@ -931,6 +1048,216 @@ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, + return rv; + } + ++/* ++ * Clear the bad block range from bad block table which is front overlapped ++ * with the clearing range. The return value is how many sectors from an ++ * already set bad block range are cleared. If the whole bad block range is ++ * covered by the clearing range and fully cleared, 'delete' is set as 1 for ++ * the caller to reduce bb->count. 
++ */ ++static int front_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *deleted) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int cleared = 0; ++ ++ *deleted = 0; ++ if (s == BB_OFFSET(p[prev])) { ++ if (BB_LEN(p[prev]) > sectors) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors, ++ BB_LEN(p[prev]) - sectors, ++ BB_ACK(p[prev])); ++ cleared = sectors; ++ } else { ++ /* BB_LEN(p[prev]) <= sectors */ ++ cleared = BB_LEN(p[prev]); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ *deleted = 1; ++ } ++ } else if (s > BB_OFFSET(p[prev])) { ++ if (BB_END(p[prev]) <= (s + sectors)) { ++ cleared = BB_END(p[prev]) - s; ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ } else { ++ /* Splitting is handled in front_splitting_clear() */ ++ BUG(); ++ } ++ } ++ ++ return cleared; ++} ++ ++/* ++ * Handle the condition that the clearing range hits middle of an already set ++ * bad block range from bad block table. In this condition the existing bad ++ * block range is split into two after the middle part is cleared. ++ */ ++static int front_splitting_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ u64 end = BB_END(p[prev]); ++ int ack = BB_ACK(p[prev]); ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ ack); ++ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack); ++ return sectors; ++} ++ ++/* Do the exact work to clear bad block range from the bad block table */ ++static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) ++{ ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ int len = 0, cleared = 0; ++ int rv = 0; ++ u64 *p; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ sector_t target; ++ ++ /* When clearing we round the start up and the end down. ++ * This should not matter as the shift should align with ++ * the block size and no rounding should ever be needed. ++ * However it is better the think a block is bad when it ++ * isn't than to think a block is not bad when it is. 
++ */ ++ target = s + sectors; ++ roundup(s, bb->shift); ++ rounddown(target, bb->shift); ++ sectors = target - s; ++ } ++ ++ write_seqlock_irq(&bb->lock); ++ ++ bad.orig_start = s; ++ bad.orig_len = sectors; ++ bad.ack = true; ++ p = bb->page; ++ ++re_clear: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* Start before all badblocks */ ++ if (prev < 0) { ++ if (overlap_behind(bb, &bad, 0)) { ++ len = BB_OFFSET(p[0]) - s; ++ hint = prev; ++ } else { ++ len = sectors; ++ } ++ /* ++ * Both situations are to clear non-bad range, ++ * should be treated as successful ++ */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Clear will split a bad record but the table is full */ ++ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + sectors))) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if ((BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + bad.len))) { ++ /* Splitting */ ++ if ((bb->count + 1) < MAX_BADBLOCKS) { ++ len = front_splitting_clear(bb, prev, &bad); ++ bb->count += 1; ++ cleared++; ++ } else { ++ /* No space to split, give up */ ++ len = sectors; ++ } ++ } else { ++ int deleted = 0; ++ ++ len = front_clear(bb, prev, &bad, &deleted); ++ bb->count -= deleted; ++ cleared++; ++ hint = prev; ++ } ++ ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Not cover any badblocks range in the table */ ++ len = sectors; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_clear; ++ ++ WARN_ON(sectors < 0); ++ ++ if (cleared) { ++ badblocks_update_acked(bb); ++ set_changed(bb); ++ } ++ ++ write_sequnlock_irq(&bb->lock); ++ ++ if (!cleared) ++ rv = 1; ++ ++ return rv; ++} ++ ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v4/backup/0005-badblocks-improve-badblocks_check-for-multiple-range.patch b/for-test/badblocks/v4/backup/0005-badblocks-improve-badblocks_check-for-multiple-range.patch new file mode 100644 index 0000000..6be1249 --- /dev/null +++ b/for-test/badblocks/v4/backup/0005-badblocks-improve-badblocks_check-for-multiple-range.patch @@ -0,0 +1,177 @@ +From 09092ea11f2a8d319ac57865031190f153d159ae Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 2 Mar 2021 09:27:06 +0800 +Subject: [PATCH 5/6] badblocks: improve badblocks_check() for multiple ranges + handling + +This patch rewrites badblocks_check() with similar coding style as +_badblocks_set() and _badblocks_clear(). The only difference is bad +blocks checking may handle multiple ranges in bad tables now. 
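+
+From the caller's point of view nothing changes. As a quick
+illustration (this sketch is not part of the patch; bb, io_start,
+io_sectors and the two handle_*() helpers are made-up names), the
+return value contract stays as before: 0 when the checking range hits
+no bad range, 1 when it only hits acknowledged bad ranges, and -1 when
+it hits at least one unacknowledged bad range, with the first covered
+bad range reported through first_bad and bad_sectors,
+
+    sector_t first_bad;
+    int bad_sectors;
+
+    switch (badblocks_check(bb, io_start, io_sectors,
+                            &first_bad, &bad_sectors)) {
+    case 0:     /* the whole range is clean */
+        break;
+    case 1:     /* only acknowledged bad range(s) are covered */
+        handle_acked_badblocks(first_bad, bad_sectors);
+        break;
+    case -1:    /* at least one unacknowledged bad range is covered */
+        handle_unacked_badblocks(first_bad, bad_sectors);
+        break;
+    }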
+ +If a checking range covers multiple bad blocks range in bad block table, +like the following condition (C is the checking range, E1, E2, E3 are +three bad block ranges in bad block table), + +------------------------------------+ + | C | + +------------------------------------+ + +----+ +----+ +----+ + | E1 | | E2 | | E3 | + +----+ +----+ +----+ +The improved badblocks_check() algorithm will divid checking range C +into multiple parts, and handle them in 7 runs of a while-loop, + +--+ +----+ +----+ +----+ +----+ +----+ +----+ + |C1| | C2 | | C3 | | C4 | | C5 | | C6 | | C7 | + +--+ +----+ +----+ +----+ +----+ +----+ +----+ + +----+ +----+ +----+ + | E1 | | E2 | | E3 | + +----+ +----+ +----+ +And the start LBA and length of range E1 will be set as first_bad and +bad_sectors for the caller. + +The return value rule is consistent for multiple ranges. For example if +there are following bad block ranges in bad block table, + Index No. Start Len Ack + 0 400 20 1 + 1 500 50 1 + 2 650 20 0 +the return value, first_bad, bad_sectors by calling badblocks_set() with +different checking range can be the following values, + Checking Start, Len Return Value first_bad bad_sectors + 100, 100 0 N/A N/A + 100, 310 1 400 10 + 100, 440 1 400 10 + 100, 540 1 400 10 + 100, 600 -1 400 10 + 100, 800 -1 400 10 + +In order to make code review easier, this patch names the improved bad +block range checking routine as _badblocks_check() and does not change +existing badblock_check() code yet. Later patch will delete old code of +badblocks_check() and make it as a wrapper to call _badblocks_check(). +Then the new added code won't mess up with the old deleted code, it will +be more clear and easier for code review. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 99 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index b9a4cd64b840..5a1ac35b924a 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -1257,6 +1257,105 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) + return rv; + } + ++/* Do the exact work to check bad blocks range from the bad block table */ ++static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, ++ sector_t *first_bad, int *bad_sectors) ++{ ++ int unacked_badblocks, acked_badblocks; ++ int prev = -1, hint = -1, set = 0; ++ struct badblocks_context bad; ++ unsigned int seq; ++ int len, rv; ++ u64 *p; ++ ++ WARN_ON(bb->shift < 0 || sectors == 0); ++ ++ if (bb->shift > 0) { ++ sector_t target; ++ ++ /* round the start down, and the end up */ ++ target = s + sectors; ++ rounddown(s, bb->shift); ++ roundup(target, bb->shift); ++ sectors = target - s; ++ } ++ ++retry: ++ seq = read_seqbegin(&bb->lock); ++ ++ bad.orig_start = s; ++ bad.orig_len = sectors; ++ p = bb->page; ++ unacked_badblocks = 0; ++ acked_badblocks = 0; ++ ++re_check: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if 
(BB_ACK(p[prev])) ++ acked_badblocks++; ++ else ++ unacked_badblocks++; ++ ++ if (BB_END(p[prev]) >= (s + sectors)) ++ len = sectors; ++ else ++ len = BB_END(p[prev]) - s; ++ ++ if (set == 0) { ++ *first_bad = BB_OFFSET(p[prev]); ++ *bad_sectors = BB_LEN(p[prev]); ++ set = 1; ++ } ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* not cover any badblocks range in the table */ ++ len = sectors; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_check; ++ ++ WARN_ON(sectors < 0); ++ ++ if (unacked_badblocks > 0) ++ rv = -1; ++ else if (acked_badblocks > 0) ++ rv = 1; ++ else ++ rv = 0; ++ ++ if (read_seqretry(&bb->lock, seq)) ++ goto retry; ++ ++ return rv; ++} + + /** + * badblocks_check() - check a given range for bad sectors +-- +2.31.1 + diff --git a/for-test/badblocks/v4/backup/0006-badblocks-switch-to-the-improved-badblock-handling-c.patch b/for-test/badblocks/v4/backup/0006-badblocks-switch-to-the-improved-badblock-handling-c.patch new file mode 100644 index 0000000..6d07398 --- /dev/null +++ b/for-test/badblocks/v4/backup/0006-badblocks-switch-to-the-improved-badblock-handling-c.patch @@ -0,0 +1,364 @@ +From f81bac5e10aa50c8245c605c363f7d4de21e318a Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 2 Mar 2021 10:48:43 +0800 +Subject: [PATCH 6/6] badblocks: switch to the improved badblock handling code + +This patch removes old code of badblocks_set(), badblocks_clear() and +badblocks_check(), and make them as wrappers to call _badblocks_set(), +_badblocks_clear() and _badblocks_check(). + +By this change now the badblock handing switch to the improved algorithm +in _badblocks_set(), _badblocks_clear() and _badblocks_check(). + +This patch only contains the changes of old code deletion, new added +code for the improved algorithms are in previous patches. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 310 +--------------------------------------------- + 1 file changed, 3 insertions(+), 307 deletions(-) + +diff --git a/block/badblocks.c b/block/badblocks.c +index 5a1ac35b924a..5ab03cfdc0b7 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -1394,75 +1394,7 @@ static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, + int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) + { +- int hi; +- int lo; +- u64 *p = bb->page; +- int rv; +- sector_t target = s + sectors; +- unsigned seq; +- +- if (bb->shift > 0) { +- /* round the start down, and the end up */ +- s >>= bb->shift; +- target += (1<<bb->shift) - 1; +- target >>= bb->shift; +- sectors = target - s; +- } +- /* 'target' is now the first block after the bad range */ +- +-retry: +- seq = read_seqbegin(&bb->lock); +- lo = 0; +- rv = 0; +- hi = bb->count; +- +- /* Binary search between lo and hi for 'target' +- * i.e. for the last range that starts before 'target' +- */ +- /* INVARIANT: ranges before 'lo' and at-or-after 'hi' +- * are known not to be the last range before target. 
+- * VARIANT: hi-lo is the number of possible +- * ranges, and decreases until it reaches 1 +- */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a < target) +- /* This could still be the one, earlier ranges +- * could not. +- */ +- lo = mid; +- else +- /* This and later ranges are definitely out. */ +- hi = mid; +- } +- /* 'lo' might be the last that started before target, but 'hi' isn't */ +- if (hi > lo) { +- /* need to check all range that end after 's' to see if +- * any are unacknowledged. +- */ +- while (lo >= 0 && +- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { +- if (BB_OFFSET(p[lo]) < target) { +- /* starts before the end, and finishes after +- * the start, so they must overlap +- */ +- if (rv != -1 && BB_ACK(p[lo])) +- rv = 1; +- else +- rv = -1; +- *first_bad = BB_OFFSET(p[lo]); +- *bad_sectors = BB_LEN(p[lo]); +- } +- lo--; +- } +- } +- +- if (read_seqretry(&bb->lock, seq)) +- goto retry; +- +- return rv; ++ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); + } + EXPORT_SYMBOL_GPL(badblocks_check); + +@@ -1484,154 +1416,7 @@ EXPORT_SYMBOL_GPL(badblocks_check); + int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) + { +- u64 *p; +- int lo, hi; +- int rv = 0; +- unsigned long flags; +- +- if (bb->shift < 0) +- /* badblocks are disabled */ +- return 1; +- +- if (bb->shift) { +- /* round the start down, and the end up */ +- sector_t next = s + sectors; +- +- s >>= bb->shift; +- next += (1<<bb->shift) - 1; +- next >>= bb->shift; +- sectors = next - s; +- } +- +- write_seqlock_irqsave(&bb->lock, flags); +- +- p = bb->page; +- lo = 0; +- hi = bb->count; +- /* Find the last range that starts at-or-before 's' */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a <= s) +- lo = mid; +- else +- hi = mid; +- } +- if (hi > lo && BB_OFFSET(p[lo]) > s) +- hi = lo; +- +- if (hi > lo) { +- /* we found a range that might merge with the start +- * of our new range +- */ +- sector_t a = BB_OFFSET(p[lo]); +- sector_t e = a + BB_LEN(p[lo]); +- int ack = BB_ACK(p[lo]); +- +- if (e >= s) { +- /* Yes, we can merge with a previous range */ +- if (s == a && s + sectors >= e) +- /* new range covers old */ +- ack = acknowledged; +- else +- ack = ack && acknowledged; +- +- if (e < s + sectors) +- e = s + sectors; +- if (e - a <= BB_MAX_LEN) { +- p[lo] = BB_MAKE(a, e-a, ack); +- s = e; +- } else { +- /* does not all fit in one range, +- * make p[lo] maximal +- */ +- if (BB_LEN(p[lo]) != BB_MAX_LEN) +- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); +- s = a + BB_MAX_LEN; +- } +- sectors = e - s; +- } +- } +- if (sectors && hi < bb->count) { +- /* 'hi' points to the first range that starts after 's'. 
+- * Maybe we can merge with the start of that range +- */ +- sector_t a = BB_OFFSET(p[hi]); +- sector_t e = a + BB_LEN(p[hi]); +- int ack = BB_ACK(p[hi]); +- +- if (a <= s + sectors) { +- /* merging is possible */ +- if (e <= s + sectors) { +- /* full overlap */ +- e = s + sectors; +- ack = acknowledged; +- } else +- ack = ack && acknowledged; +- +- a = s; +- if (e - a <= BB_MAX_LEN) { +- p[hi] = BB_MAKE(a, e-a, ack); +- s = e; +- } else { +- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); +- s = a + BB_MAX_LEN; +- } +- sectors = e - s; +- lo = hi; +- hi++; +- } +- } +- if (sectors == 0 && hi < bb->count) { +- /* we might be able to combine lo and hi */ +- /* Note: 's' is at the end of 'lo' */ +- sector_t a = BB_OFFSET(p[hi]); +- int lolen = BB_LEN(p[lo]); +- int hilen = BB_LEN(p[hi]); +- int newlen = lolen + hilen - (s - a); +- +- if (s >= a && newlen < BB_MAX_LEN) { +- /* yes, we can combine them */ +- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); +- +- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); +- memmove(p + hi, p + hi + 1, +- (bb->count - hi - 1) * 8); +- bb->count--; +- } +- } +- while (sectors) { +- /* didn't merge (it all). +- * Need to add a range just before 'hi' +- */ +- if (bb->count >= MAX_BADBLOCKS) { +- /* No room for more */ +- rv = 1; +- break; +- } else { +- int this_sectors = sectors; +- +- memmove(p + hi + 1, p + hi, +- (bb->count - hi) * 8); +- bb->count++; +- +- if (this_sectors > BB_MAX_LEN) +- this_sectors = BB_MAX_LEN; +- p[hi] = BB_MAKE(s, this_sectors, acknowledged); +- sectors -= this_sectors; +- s += this_sectors; +- } +- } +- +- bb->changed = 1; +- if (!acknowledged) +- bb->unacked_exist = 1; +- else +- badblocks_update_acked(bb); +- write_sequnlock_irqrestore(&bb->lock, flags); +- +- return rv; ++ return _badblocks_set(bb, s, sectors, acknowledged); + } + EXPORT_SYMBOL_GPL(badblocks_set); + +@@ -1651,96 +1436,7 @@ EXPORT_SYMBOL_GPL(badblocks_set); + */ + int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) + { +- u64 *p; +- int lo, hi; +- sector_t target = s + sectors; +- int rv = 0; +- +- if (bb->shift > 0) { +- /* When clearing we round the start up and the end down. +- * This should not matter as the shift should align with +- * the block size and no rounding should ever be needed. +- * However it is better the think a block is bad when it +- * isn't than to think a block is not bad when it is. +- */ +- s += (1<<bb->shift) - 1; +- s >>= bb->shift; +- target >>= bb->shift; +- sectors = target - s; +- } +- +- write_seqlock_irq(&bb->lock); +- +- p = bb->page; +- lo = 0; +- hi = bb->count; +- /* Find the last range that starts before 'target' */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a < target) +- lo = mid; +- else +- hi = mid; +- } +- if (hi > lo) { +- /* p[lo] is the last range that could overlap the +- * current range. Earlier ranges could also overlap, +- * but only this one can overlap the end of the range. 
+- */
+- if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
+- (BB_OFFSET(p[lo]) < target)) {
+- /* Partial overlap, leave the tail of this range */
+- int ack = BB_ACK(p[lo]);
+- sector_t a = BB_OFFSET(p[lo]);
+- sector_t end = a + BB_LEN(p[lo]);
+-
+- if (a < s) {
+- /* we need to split this range */
+- if (bb->count >= MAX_BADBLOCKS) {
+- rv = -ENOSPC;
+- goto out;
+- }
+- memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+- bb->count++;
+- p[lo] = BB_MAKE(a, s-a, ack);
+- lo++;
+- }
+- p[lo] = BB_MAKE(target, end - target, ack);
+- /* there is no longer an overlap */
+- hi = lo;
+- lo--;
+- }
+- while (lo >= 0 &&
+- (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
+- (BB_OFFSET(p[lo]) < target)) {
+- /* This range does overlap */
+- if (BB_OFFSET(p[lo]) < s) {
+- /* Keep the early parts of this range. */
+- int ack = BB_ACK(p[lo]);
+- sector_t start = BB_OFFSET(p[lo]);
+-
+- p[lo] = BB_MAKE(start, s - start, ack);
+- /* now low doesn't overlap, so.. */
+- break;
+- }
+- lo--;
+- }
+- /* 'lo' is strictly before, 'hi' is strictly after,
+- * anything between needs to be discarded
+- */
+- if (hi - lo > 1) {
+- memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
+- bb->count -= (hi - lo - 1);
+- }
+- }
+-
+- badblocks_update_acked(bb);
+- bb->changed = 1;
+-out:
+- write_sequnlock_irq(&bb->lock);
+- return rv;
++ return _badblocks_clear(bb, s, sectors);
+ }
+ EXPORT_SYMBOL_GPL(badblocks_clear);
+
+--
+2.31.1
+
diff --git a/for-test/badblocks/v4/v4-0000-cover-letter.patch b/for-test/badblocks/v4/v4-0000-cover-letter.patch
new file mode 100644
index 0000000..c02f896
--- /dev/null
+++ b/for-test/badblocks/v4/v4-0000-cover-letter.patch
@@ -0,0 +1,70 @@
+From 839dec5ce2a8e6fae537d8eaa5bc4c7ae89e8a49 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 2 Dec 2021 19:05:12 +0800
+Subject: [RESEND PATCH v4 0/6] badblocks improvement for multiple bad block ranges
+
+Hi Dan,
+
+This is the v4 effort to improve the badblocks code APIs to handle
+multiple ranges in the bad block table.
+
+Compared to the v3 series, the v4 series addresses code review comments
+from Geliang Tang,
+- Declare local variables in reverse Xmas tree order.
+- Drop orig_start and orig_len from struct badblocks_context.
+- Fix typos in code comments.
+- In badblocks_set(), avoid one unnecessary loop by setting variable
+  hint to prev (was prev - 1 in the v3 series).
+
+There is NO in-memory or on-disk format change in the whole series; all
+existing APIs and data structures stay as they are. This series only
+improves the code algorithm to handle more corner cases, and the
+interfaces remain the same and consistent for all existing callers (md
+raid and nvdimm drivers).
+
+The original motivation of the change is a requirement from our
+customer: the current badblocks routines don't handle multiple ranges.
+For example, if the bad block setting range covers multiple ranges from
+the bad block table, only the first two bad block ranges are merged and
+the remaining ranges are left intact. The expected behavior is that all
+the covered ranges are handled.
+
+All the patches are tested with modified user space code and the code
+logic works as expected. The modified user space testing code is
+provided in the last patch, as an example of how the improved code is
+tested.
+
+The whole change is divided into 6 patches to make the code review
+clearer and easier. If people prefer, I can post a single large patch
+once the code review is complete.
+
+Please review the code and respond. Thank you all in advance.
+ +Coly Li + +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Richard Fan <richard.fan@suse.com> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + +Coly Li (6): + badblocks: add more helper structure and routines in badblocks.h + badblocks: add helper routines for badblock ranges handling + badblocks: improvement badblocks_set() for multiple ranges handling + badblocks: improve badblocks_clear() for multiple ranges handling + badblocks: improve badblocks_check() for multiple ranges handling + badblocks: switch to the improved badblock handling code +Coly Li (1): + test: user space code to test badblocks APIs + + block/badblocks.c | 1602 ++++++++++++++++++++++++++++++------- + include/linux/badblocks.h | 30 + + 2 files changed, 1337 insertions(+), 295 deletions(-) + +-- +2.31.1 + diff --git a/for-test/badblocks/v4/v4-0001-badblocks-add-more-helper-structure-and-routines-.patch b/for-test/badblocks/v4/v4-0001-badblocks-add-more-helper-structure-and-routines-.patch new file mode 100644 index 0000000..f008556 --- /dev/null +++ b/for-test/badblocks/v4/v4-0001-badblocks-add-more-helper-structure-and-routines-.patch @@ -0,0 +1,91 @@ +From 4b3441cc612192914fdf57a8ae3f71479ff3793f Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 2 Dec 2021 15:29:38 +0800 +Subject: [PATCH v4 1/6] badblocks: add more helper structure and routines in + badblocks.h + +This patch adds the following helper structure and routines into +badblocks.h, +- struct badblocks_context + This structure is used in improved badblocks code for bad table + iteration. +- BB_END() + The macro to calculate end LBA of a bad range record from bad + table. +- badblocks_full() and badblocks_empty() + The inline routines to check whether bad table is full or empty. +- set_changed() and clear_changed() + The inline routines to set and clear 'changed' tag from struct + badblocks. + +These new helper structure and routines can help to make the code more +clear, they will be used in the improved badblocks code in following +patches. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + include/linux/badblocks.h | 30 ++++++++++++++++++++++++++++++ + 1 file changed, 30 insertions(+) + +diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h +index 2426276b9bd3..670f2dae692f 100644 +--- a/include/linux/badblocks.h ++++ b/include/linux/badblocks.h +@@ -15,6 +15,7 @@ + #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) + #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) + #define BB_ACK(x) (!!((x) & BB_ACK_MASK)) ++#define BB_END(x) (BB_OFFSET(x) + BB_LEN(x)) + #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) + + /* Bad block numbers are stored sorted in a single page. 
+@@ -41,6 +42,12 @@ struct badblocks { + sector_t size; /* in sectors */ + }; + ++struct badblocks_context { ++ sector_t start; ++ sector_t len; ++ int ack; ++}; ++ + int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors); + int badblocks_set(struct badblocks *bb, sector_t s, int sectors, +@@ -63,4 +70,27 @@ static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb) + } + badblocks_exit(bb); + } ++ ++static inline int badblocks_full(struct badblocks *bb) ++{ ++ return (bb->count >= MAX_BADBLOCKS); ++} ++ ++static inline int badblocks_empty(struct badblocks *bb) ++{ ++ return (bb->count == 0); ++} ++ ++static inline void set_changed(struct badblocks *bb) ++{ ++ if (bb->changed != 1) ++ bb->changed = 1; ++} ++ ++static inline void clear_changed(struct badblocks *bb) ++{ ++ if (bb->changed != 0) ++ bb->changed = 0; ++} ++ + #endif +-- +2.31.1 + diff --git a/for-test/badblocks/v4/v4-0002-badblocks-add-helper-routines-for-badblock-ranges.patch b/for-test/badblocks/v4/v4-0002-badblocks-add-helper-routines-for-badblock-ranges.patch new file mode 100644 index 0000000..46116bb --- /dev/null +++ b/for-test/badblocks/v4/v4-0002-badblocks-add-helper-routines-for-badblock-ranges.patch @@ -0,0 +1,457 @@ +From 69aa03e6aa9eb441a3b4bc7c3d017c064d6d821b Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 17:16:57 +0800 +Subject: [PATCH v4 2/6] badblocks: add helper routines for badblock ranges + handling + +This patch adds several helper routines to improve badblock ranges +handling. These helper routines will be used later in the improved +version of badblocks_set()/badblocks_clear()/badblocks_check(). + +- Helpers prev_by_hint() and prev_badblocks() are used to find the bad + range from bad table which the searching range starts at or after. + +- The following helpers are to decide the relative layout between the + manipulating range and existing bad block range from bad table. + - can_merge_behind() + Return 'true' if the manipulating range can backward merge with the + bad block range. + - can_merge_front() + Return 'true' if the manipulating range can forward merge with the + bad block range. + - can_combine_front() + Return 'true' if two adjacent bad block ranges before the + manipulating range can be merged. + - overlap_front() + Return 'true' if the manipulating range exactly overlaps with the + bad block range in front of its range. + - overlap_behind() + Return 'true' if the manipulating range exactly overlaps with the + bad block range behind its range. + - can_front_overwrite() + Return 'true' if the manipulating range can forward overwrite the + bad block range in front of its range. + +- The following helpers are to add the manipulating range into the bad + block table. Different routine is called with the specific relative + layout between the manipulating range and other bad block range in the + bad block table. + - behind_merge() + Merge the manipulating range with the bad block range behind its + range, and return the number of merged length in unit of sector. + - front_merge() + Merge the manipulating range with the bad block range in front of + its range, and return the number of merged length in unit of sector. + - front_combine() + Combine the two adjacent bad block ranges before the manipulating + range into a larger one. + - front_overwrite() + Overwrite partial of whole bad block range which is in front of the + manipulating range. 
The overwrite may split existing bad block range + and generate more bad block ranges into the bad block table. + - insert_at() + Insert the manipulating range at a specific location in the bad + block table. + +All the above helpers are used in later patches to improve the bad block +ranges handling for badblocks_set()/badblocks_clear()/badblocks_check(). + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 374 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 374 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index d39056630d9c..e216c6791b4b 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -16,6 +16,380 @@ + #include <linux/types.h> + #include <linux/slab.h> + ++/* ++ * Find the range starts at-or-before 's' from bad table. The search ++ * starts from index 'hint' and stops at index 'hint_end' from the bad ++ * table. ++ */ ++static int prev_by_hint(struct badblocks *bb, sector_t s, int hint) ++{ ++ int hint_end = hint + 2; ++ u64 *p = bb->page; ++ int ret = -1; ++ ++ while ((hint < hint_end) && ((hint + 1) <= bb->count) && ++ (BB_OFFSET(p[hint]) <= s)) { ++ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) { ++ ret = hint; ++ break; ++ } ++ hint++; ++ } ++ ++ return ret; ++} ++ ++/* ++ * Find the range starts at-or-before bad->start. If 'hint' is provided ++ * (hint >= 0) then search in the bad table from hint firstly. It is ++ * very probably the wanted bad range can be found from the hint index, ++ * then the unnecessary while-loop iteration can be avoided. ++ */ ++static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad, ++ int hint) ++{ ++ sector_t s = bad->start; ++ int ret = -1; ++ int lo, hi; ++ u64 *p; ++ ++ if (!bb->count) ++ goto out; ++ ++ if (hint >= 0) { ++ ret = prev_by_hint(bb, s, hint); ++ if (ret >= 0) ++ goto out; ++ } ++ ++ lo = 0; ++ hi = bb->count; ++ p = bb->page; ++ ++ while (hi - lo > 1) { ++ int mid = (lo + hi)/2; ++ sector_t a = BB_OFFSET(p[mid]); ++ ++ if (a <= s) ++ lo = mid; ++ else ++ hi = mid; ++ } ++ ++ if (BB_OFFSET(p[lo]) <= s) ++ ret = lo; ++out: ++ return ret; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be backward merged ++ * with the bad range (from the bad table) index by 'behind'. ++ */ ++static bool can_merge_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ ++ if ((s <= BB_OFFSET(p[behind])) && ++ ((s + sectors) >= BB_OFFSET(p[behind])) && ++ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) && ++ BB_ACK(p[behind]) == ack) ++ return true; ++ return false; ++} ++ ++/* ++ * Do backward merge for range indicated by 'bad' and the bad range ++ * (from the bad table) indexed by 'behind'. The return value is merged ++ * sectors from bad->len. 
++ */ ++static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_OFFSET(p[behind])); ++ WARN_ON((s + sectors) < BB_OFFSET(p[behind])); ++ ++ if (s < BB_OFFSET(p[behind])) { ++ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); ++ ++ merged = min_t(sector_t, sectors, BB_OFFSET(p[behind]) - s); ++ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, ack); ++ } else { ++ merged = min_t(sector_t, sectors, BB_LEN(p[behind])); ++ } ++ ++ WARN_ON(merged == 0); ++ ++ return merged; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be forward ++ * merged with the bad range (from the bad table) indexed by 'prev'. ++ */ ++static bool can_merge_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ ++ if (BB_ACK(p[prev]) == ack && ++ (s < BB_END(p[prev]) || ++ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN)))) ++ return true; ++ return false; ++} ++ ++/* ++ * Do forward merge for range indicated by 'bad' and the bad range ++ * (from bad table) indexed by 'prev'. The return value is sectors ++ * merged from bad->len. ++ */ ++static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_END(p[prev])); ++ ++ if (s < BB_END(p[prev])) { ++ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s); ++ } else { ++ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev])); ++ if ((prev + 1) < bb->count && ++ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) { ++ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]); ++ } ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + merged, ack); ++ } ++ ++ return merged; ++} ++ ++/* ++ * 'Combine' is a special case which can_merge_front() is not able to ++ * handle: If a bad range (indexed by 'prev' from bad table) exactly ++ * starts as bad->start, and the bad range ahead of 'prev' (indexed by ++ * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and ++ * the sum of their lengths does not exceed BB_MAX_LEN limitation, then ++ * these two bad range (from bad table) can be combined. ++ * ++ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad ++ * table can be combined. ++ */ ++static bool can_combine_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if ((prev > 0) && ++ (BB_OFFSET(p[prev]) == bad->start) && ++ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) && ++ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) && ++ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev]))) ++ return true; ++ return false; ++} ++ ++/* ++ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad ++ * table) into one larger bad range, and the new range is indexed by ++ * 'prev - 1'. 
++ */ ++static void front_combine(struct badblocks *bb, int prev) ++{ ++ u64 *p = bb->page; ++ ++ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]), ++ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]), ++ BB_ACK(p[prev])); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8); ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly forward ++ * overlapped with the bad range (from bad table) indexed by 'front'. ++ * Exactly forward overlap means the bad range (from bad table) indexed ++ * by 'prev' does not cover the whole range indicated by 'bad'. ++ */ ++static bool overlap_front(struct badblocks *bb, int front, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start >= BB_OFFSET(p[front]) && ++ bad->start < BB_END(p[front])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly backward ++ * overlapped with the bad range (from bad table) indexed by 'behind'. ++ */ ++static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start < BB_OFFSET(p[behind]) && ++ (bad->start + bad->len) > BB_OFFSET(p[behind])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can overwrite the bad ++ * range (from bad table) indexed by 'prev'. ++ * ++ * The range indicated by 'bad' can overwrite the bad range indexed by ++ * 'prev' when, ++ * 1) The whole range indicated by 'bad' can cover partial or whole bad ++ * range (from bad table) indexed by 'prev'. ++ * 2) The ack value of 'bad' is larger or equal to the ack value of bad ++ * range 'prev'. ++ * ++ * If the overwriting doesn't cover the whole bad range (from bad table) ++ * indexed by 'prev', new range might be split from existing bad range, ++ * 1) The overwrite covers head or tail part of existing bad range, 1 ++ * extra bad range will be split and added into the bad table. ++ * 2) The overwrite covers middle of existing bad range, 2 extra bad ++ * ranges will be split (ahead and after the overwritten range) and ++ * added into the bad table. ++ * The number of extra split ranges of the overwriting is stored in ++ * 'extra' and returned for the caller. ++ */ ++static bool can_front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *extra) ++{ ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(!overlap_front(bb, prev, bad)); ++ ++ if (BB_ACK(p[prev]) >= bad->ack) ++ return false; ++ ++ if (BB_END(p[prev]) <= (bad->start + bad->len)) { ++ len = BB_END(p[prev]) - bad->start; ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 0; ++ else ++ *extra = 1; ++ ++ bad->len = len; ++ } else { ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 1; ++ else ++ /* ++ * prev range will be split into two, beside the overwritten ++ * one, an extra slot needed from bad table. ++ */ ++ *extra = 2; ++ } ++ ++ if ((bb->count + (*extra)) >= MAX_BADBLOCKS) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Do the overwrite from the range indicated by 'bad' to the bad range ++ * (from bad table) indexed by 'prev'. ++ * The previously called can_front_overwrite() will provide how many ++ * extra bad range(s) might be split and added into the bad table. All ++ * the splitting cases in the bad table will be handled here. 
++ */ ++static int front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int extra) ++{ ++ u64 *p = bb->page; ++ sector_t orig_end = BB_END(p[prev]); ++ int orig_ack = BB_ACK(p[prev]); ++ int n = extra; ++ ++ switch (extra) { ++ case 0: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]), ++ bad->ack); ++ break; ++ case 1: ++ if (BB_OFFSET(p[prev]) == bad->start) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->len, bad->ack); ++ memmove(p + prev + 2, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start + bad->len, ++ orig_end - BB_END(p[prev]), ++ orig_ack); ++ } else { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ memmove(p + prev + 1 + n, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ } ++ break; ++ case 2: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ memmove(p + prev + 1 + n, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]), ++ orig_end - BB_END(p[prev + 1]), ++ BB_ACK(p[prev])); ++ break; ++ default: ++ break; ++ } ++ ++ return bad->len; ++} ++ ++/* ++ * Explicitly insert a range indicated by 'bad' to the bad table, where ++ * the location is indexed by 'at'. ++ */ ++static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(badblocks_full(bb)); ++ ++ len = min_t(sector_t, sectors, BB_MAX_LEN); ++ if (at < bb->count) ++ memmove(p + at + 1, p + at, (bb->count - at) * 8); ++ p[at] = BB_MAKE(s, len, ack); ++ ++ return len; ++} ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v4/v4-0003-badblocks-improvement-badblocks_set-for-multiple-.patch b/for-test/badblocks/v4/v4-0003-badblocks-improvement-badblocks_set-for-multiple-.patch new file mode 100644 index 0000000..cd732d0 --- /dev/null +++ b/for-test/badblocks/v4/v4-0003-badblocks-improvement-badblocks_set-for-multiple-.patch @@ -0,0 +1,661 @@ +From c6d337537fae982c4d24ce626436e32a2f71e5f8 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 2 Dec 2021 15:57:50 +0800 +Subject: [PATCH v4 3/6] badblocks: improve badblocks_set() for multiple ranges handling + +Recently I received a bug report that current badblocks code does not +properly handle multiple ranges. For example, + badblocks_set(bb, 32, 1, true); + badblocks_set(bb, 34, 1, true); + badblocks_set(bb, 36, 1, true); + badblocks_set(bb, 32, 12, true); +Then indeed badblocks_show() reports, + 32 3 + 36 1 +But the expected bad blocks table should be, + 32 12 +Obviously only the first 2 ranges are merged and badblocks_set() returns +and ignores the rest setting range. + +This behavior is improper, if the caller of badblocks_set() wants to set +a range of blocks into bad blocks table, all of the blocks in the range +should be handled even the previous part encountering failure. + +The desired way to set bad blocks range by badblocks_set() is, +- Set as many as blocks in the setting range into bad blocks table. +- Merge the bad blocks ranges and occupy as less as slots in the bad + blocks table. +- Fast. 
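+
+For illustration only, the expectation from the example above can be
+written as a small user space style check (this sketch is not the test
+code shipped with this series; it assumes block/badblocks.c and
+include/linux/badblocks.h are compiled into a user space harness, and
+that bb points to an already initialized struct badblocks with shift
+set to 0 so no rounding happens),
+
+    u64 *p = bb->page;
+
+    badblocks_set(bb, 32, 1, true);
+    badblocks_set(bb, 34, 1, true);
+    badblocks_set(bb, 36, 1, true);
+    badblocks_set(bb, 32, 12, true);
+
+    /* expect one merged, acknowledged range: start 32, length 12 */
+    assert(bb->count == 1);
+    assert(BB_OFFSET(p[0]) == 32);
+    assert(BB_LEN(p[0]) == 12);
+    assert(BB_ACK(p[0]));
+
+Here assert() stands for whatever check the harness provides; in kernel
+context the same expectation shows up as a single "32 12" line in
+badblocks_show() output.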
+ +Indeed the above proposal is complicated, especially with the following +restrictions, +- The setting bad blocks range can be acknowledged or not acknowledged. +- The bad blocks table size is limited. +- Memory allocation should be avoided. + +The basic idea of the patch is to categorize all possible bad blocks +range setting combinations into to much less simplified and more less +special conditions. Inside badblocks_set() there is an implicit loop +composed by jumping between labels 're_insert' and 'update_sectors'. No +matter how large the setting bad blocks range is, in every loop just a +minimized range from the head is handled by a pre-defined behavior from +one of the categorized conditions. The logic is simple and code flow is +manageable. + +The different relative layout between the setting range and existing bad +block range are checked and handled (merge, combine, overwrite, insert) +by the helpers in previous patch. This patch is to make all the helpers +work together with the above idea. + +This patch only has the algorithm improvement for badblocks_set(). There +are following patches contain improvement for badblocks_clear() and +badblocks_check(). But the algorithm in badblocks_set() is fundamental +and typical, other improvement in clear and check routines are based on +all the helpers and ideas in this patch. + +In order to make the change to be more clear for code review, this patch +does not directly modify existing badblocks_set(), and just add a new +one named _badblocks_set(). Later patch will remove current existing +badblocks_set() code and make it as a wrapper of _badblocks_set(). So +the new added change won't be mixed with deleted code, the code review +can be easier. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 560 ++++++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 540 insertions(+), 20 deletions(-) + +diff --git a/block/badblocks.c b/block/badblocks.c +index e216c6791b4b..13eaad18be15 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -16,6 +16,322 @@ + #include <linux/types.h> + #include <linux/slab.h> + ++/* ++ * The purpose of badblocks set/clear is to manage bad blocks ranges which are ++ * identified by LBA addresses. ++ * ++ * When the caller of badblocks_set() wants to set a range of bad blocks, the ++ * setting range can be acked or unacked. And the setting range may merge, ++ * overwrite, skip the overlapped already set range, depends on who they are ++ * overlapped or adjacent, and the acknowledgment type of the ranges. It can be ++ * more complicated when the setting range covers multiple already set bad block ++ * ranges, with restrictions of maximum length of each bad range and the bad ++ * table space limitation. ++ * ++ * It is difficult and unnecessary to take care of all the possible situations, ++ * for setting a large range of bad blocks, we can handle it by dividing the ++ * large range into smaller ones when encounter overlap, max range length or ++ * bad table full conditions. Every time only a smaller piece of the bad range ++ * is handled with a limited number of conditions how it is interacted with ++ * possible overlapped or adjacent already set bad block ranges. Then the hard ++ * complicated problem can be much simpler to handle in proper way. 
++ * ++ * When setting a range of bad blocks to the bad table, the simplified situations ++ * to be considered are, (The already set bad blocks ranges are naming with ++ * prefix E, and the setting bad blocks range is naming with prefix S) ++ * ++ * 1) A setting range is not overlapped or adjacent to any other already set bad ++ * block range. ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ +-------------+ ++ * | E1 | | E2 | ++ * +-------------+ +-------------+ ++ * For this situation if the bad blocks table is not full, just allocate a ++ * free slot from the bad blocks table to mark the setting range S. The ++ * result is, ++ * +-------------+ +--------+ +-------------+ ++ * | E1 | | S | | E2 | ++ * +-------------+ +--------+ +-------------+ ++ * 2) A setting range starts exactly at a start LBA of an already set bad blocks ++ * range. ++ * 2.1) The setting range size < already set range size ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E. ++ * An extra slot from the bad blocks table will be allocated for S, and head ++ * of E will move to end of the inserted range S. The result is, ++ * +--------+----+ ++ * | S | E | ++ * +--------+----+ ++ * 2.2) The setting range size == already set range size ++ * 2.2.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of ++ bad blocks range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.3) The setting range size > already set range size ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For such situation, the setting range S can be treated as two parts, the ++ * first part (S1) is as same size as the already set range E, the second ++ * part (S2) is the rest of setting range. ++ * +-------------+-----+ +-------------+ +-----+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +-------------+-----+ ===> +-------------+ +-----+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now we only focus on how to handle the setting range S1 and already set ++ * range E, which are already explained in 2.2), for the rest S2 it will be ++ * handled later in next loop. ++ * 3) A setting range starts before the start LBA of an already set bad blocks ++ * range. ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation, the setting range S can be divided into two parts, the ++ * first (S1) ends at the start LBA of already set range E, the second part ++ * (S2) starts exactly at a start LBA of the already set range E. 
++ * +----+---------+ +----+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +----+---------+ ===> +----+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now only the first part S1 should be handled in this loop, which is in ++ * similar condition as 1). The rest part S2 has exact same start LBA address ++ * of the already set range E, they will be handled in next loop in one of ++ * situations in 2). ++ * 4) A setting range starts after the start LBA of an already set bad blocks ++ * range. ++ * 4.1) If the setting range S exactly matches the tail part of already set bad ++ * blocks range E, like the following chart shows, ++ * +---------+ ++ * | S | ++ * +---------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 4.1.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may ++ * overwrite the overlapped range of E, the result is, ++ * +---+---------+ ++ * | E | S | ++ * +---+---------+ ++ * 4.2) If the setting range S stays in middle of an already set range E, like ++ * the following chart shows, ++ * +----+ ++ * | S | ++ * +----+ ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 4.2.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is also, ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will ++ * inserted into middle of E and split previous range E into twp parts (E1 ++ * and E2), the result is, ++ * +----+----+----+ ++ * | E1 | S | E2 | ++ * +----+----+----+ ++ * 4.3) If the setting bad blocks range S is overlapped with an already set bad ++ * blocks range E. The range S starts after the start LBA of range E, and ++ * ends after the end LBA of range E, as the following chart shows, ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation the range S can be divided into two parts, the first ++ * part (S1) ends at end range E, and the second part (S2) has rest range of ++ * origin S. ++ * +---------+---------+ +---------+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +---------+---------+ ===> +---------+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now in this loop the setting range S1 and already set range E can be ++ * handled as the situations 4), the rest range S2 will be handled in next ++ * loop and ignored in this loop. ++ * 5) A setting bad blocks range S is adjacent to one or more already set bad ++ * blocks range(s), and they are all acked or unacked range. 
++ * 5.1) Front merge: If the already set bad blocks range E is before setting ++ * range S and they are adjacent, ++ * +------+ ++ * | S | ++ * +------+ ++ * +-------+ ++ * | E | ++ * +-------+ ++ * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge ++ * values are same, the setting range S can front merges into range E. The ++ * result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting ++ * range S right after already set range E into the bad blocks table. The ++ * result is, ++ * +--------+------+ ++ * | E | S | ++ * +--------+------+ ++ * 6) Special cases which above conditions cannot handle ++ * 6.1) Multiple already set ranges may merge into less ones in a full bad table ++ * +-------------------------------------------------------+ ++ * | S | ++ * +-------------------------------------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+ +-----+ +-----+ ++ * | E1 | | E2 | | E3 | ++ * +-----+ +-----+ +-----+ ++ * In the above example, when the bad blocks table is full, inserting the ++ * first part of setting range S will fail because no more available slot ++ * can be allocated from bad blocks table. In this situation a proper ++ * setting method should be go though all the setting bad blocks range and ++ * look for chance to merge already set ranges into less ones. When there ++ * is available slot from bad blocks table, re-try again to handle more ++ * setting bad blocks ranges as many as possible. ++ * +------------------------+ ++ * | S3 | ++ * +------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+-----+-----+---+-----+--+ ++ * | S1 | S2 | ++ * +-----+-----+-----+---+-----+--+ ++ * The above chart shows although the first part (S3) cannot be inserted due ++ * to no-space in bad blocks table, but the following E1, E2 and E3 ranges ++ * can be merged with rest part of S into less range S1 and S2. Now there is ++ * 1 free slot in bad blocks table. ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * | S3 | S1 | S2 | ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * Since the bad blocks table is not full anymore, re-try again for the ++ * origin setting range S. Now the setting range S3 can be inserted into the ++ * bad blocks table with previous freed slot from multiple ranges merge. ++ * 6.2) Front merge after overwrite ++ * In the following example, in bad blocks table, E1 is an acked bad blocks ++ * range and E2 is an unacked bad blocks range, therefore they are not able ++ * to merge into a larger range. The setting bad blocks range S is acked, ++ * therefore part of E2 can be overwritten by S. ++ * +--------+ ++ * | S | acknowledged ++ * +--------+ S: 1 ++ * +-------+-------------+ E1: 1 ++ * | E1 | E2 | E2: 0 ++ * +-------+-------------+ ++ * With previous simplified routines, after overwriting part of E2 with S, ++ * the bad blocks table should be (E3 is remaining part of E2 which is not ++ * overwritten by S), ++ * acknowledged ++ * +-------+--------+----+ S: 1 ++ * | E1 | S | E3 | E1: 1 ++ * +-------+--------+----+ E3: 0 ++ * The above result is correct but not perfect. Range E1 and S in the bad ++ * blocks table are all acked, merging them into a larger one range may ++ * occupy less bad blocks table space and make badblocks_check() faster. ++ * Therefore in such situation, after overwriting range S, the previous range ++ * E1 should be checked for possible front combination. 
Then the ideal
++ * result can be,
++ * +----------------+----+    acknowledged
++ * |       E1       | E3 |    E1: 1
++ * +----------------+----+    E3: 0
++ * 6.3) Behind merge: If the already set bad blocks range E is behind the setting
++ * range S and they are adjacent. Normally we don't need to care about this
++ * because front merge handles this while going through range S from head to
++ * tail, except for the tail part of range S. When the setting range S is
++ * fully handled, none of the above simplified routines checks whether the
++ * tail LBA of range S is adjacent to the next already set range, so they
++ * are not merged even if they are mergeable.
++ * +------+
++ * |  S   |
++ * +------+
++ *         +-------+
++ *         |   E   |
++ *         +-------+
++ * For the above special situation, when the setting range S is fully handled
++ * and the loop ends, an extra check is necessary to see whether the next
++ * already set range E is right after S and mergeable.
++ * 6.2.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge
++ * values are same, the setting range S can be behind merged into range E. The
++ * result is,
++ * +--------------+
++ * |      S       |
++ * +--------------+
++ * 6.2.2) Otherwise these two ranges cannot merge, just insert the setting range
++ * S in front of the already set range E in the bad blocks table. The result
++ * is,
++ * +------+-------+
++ * |  S   |   E   |
++ * +------+-------+
++ *
++ * All the above 5 simplified situations and 3 special cases may cover 99%+ of
++ * the bad block range setting conditions. There may be some rare corner cases
++ * which are not considered or optimized, but it won't hurt if badblocks_set()
++ * fails due to no space, or if some ranges are not merged to save bad blocks
++ * table space.
++ *
++ * Inside badblocks_set() each loop starts by jumping to the re_insert label;
++ * at the beginning of every loop prev_badblocks() is called to find an already
++ * set range which starts before or at the current setting range. Since the
++ * setting bad blocks range is handled from head to tail, in most cases it is
++ * unnecessary to do the binary search inside prev_badblocks(); providing a
++ * hint to prev_badblocks() enables a fast path, so the expensive binary search
++ * can be avoided. In my test with the hint to prev_badblocks(), except for the
++ * first loop, all subsequent calls to prev_badblocks() go into the fast path
++ * and return the correct bad blocks table index immediately.
++ */
++
+ /*
+ * Find the range starts at-or-before 's' from bad table.
The search + * starts from index 'hint' and stops at index 'hint_end' from the bad +@@ -390,6 +706,230 @@ static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad + return len; + } + ++static void badblocks_update_acked(struct badblocks *bb) ++{ ++ bool unacked = false; ++ u64 *p = bb->page; ++ int i; ++ ++ if (!bb->unacked_exist) ++ return; ++ ++ for (i = 0; i < bb->count ; i++) { ++ if (!BB_ACK(p[i])) { ++ unacked = true; ++ break; ++ } ++ } ++ ++ if (!unacked) ++ bb->unacked_exist = 0; ++} ++ ++/* Do exact work to set bad block range into the bad block table */ ++static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, ++ int acknowledged) ++{ ++ int retried = 0, space_desired = 0; ++ int orig_len, len = 0, added = 0; ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ sector_t orig_start; ++ unsigned long flags; ++ int rv = 0; ++ u64 *p; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ /* round the start down, and the end up */ ++ sector_t next = s + sectors; ++ ++ rounddown(s, bb->shift); ++ roundup(next, bb->shift); ++ sectors = next - s; ++ } ++ ++ write_seqlock_irqsave(&bb->lock, flags); ++ ++ orig_start = s; ++ orig_len = sectors; ++ bad.ack = acknowledged; ++ p = bb->page; ++ ++re_insert: ++ bad.start = s; ++ bad.len = sectors; ++ len = 0; ++ ++ if (badblocks_empty(bb)) { ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start before all badblocks */ ++ if (prev < 0) { ++ if (!badblocks_full(bb)) { ++ /* insert on the first */ ++ if (bad.len > (BB_OFFSET(p[0]) - bad.start)) ++ bad.len = BB_OFFSET(p[0]) - bad.start; ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* No sapce, try to merge */ ++ if (overlap_behind(bb, &bad, 0)) { ++ if (can_merge_behind(bb, &bad, 0)) { ++ len = behind_merge(bb, &bad, 0); ++ added++; ++ } else { ++ len = min_t(sector_t, ++ BB_OFFSET(p[0]) - s, sectors); ++ space_desired = 1; ++ } ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* no table space and give up */ ++ goto out; ++ } ++ ++ /* in case p[prev-1] can be merged with p[prev] */ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ } else { ++ int extra = 0; ++ ++ if (!can_front_overwrite(bb, prev, &bad, &extra)) { ++ len = min_t(sector_t, ++ BB_END(p[prev]) - s, sectors); ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ len = front_overwrite(bb, prev, &bad, extra); ++ added++; ++ bb->count += extra; ++ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ } ++ } ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ /* if no space in table, still try to merge in the covered range */ ++ if (badblocks_full(bb)) { ++ /* skip the cannot-merge range */ ++ if (((prev + 1) < bb->count) && ++ overlap_behind(bb, &bad, prev + 1) && ++ ((s + sectors) >= BB_END(p[prev + 1]))) { ++ len = BB_END(p[prev + 1]) - s; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* no retry any more */ ++ 
len = sectors; ++ space_desired = 1; ++ hint = -1; ++ goto update_sectors; ++ } ++ ++ /* cannot merge and there is space in bad table */ ++ if ((prev + 1) < bb->count && ++ overlap_behind(bb, &bad, prev + 1)) ++ bad.len = min_t(sector_t, ++ bad.len, BB_OFFSET(p[prev + 1]) - bad.start); ++ ++ len = insert_at(bb, prev + 1, &bad); ++ bb->count++; ++ added++; ++ hint = prev + 1; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_insert; ++ ++ WARN_ON(sectors < 0); ++ ++ /* Check whether the following already set range can be merged */ ++ if ((prev + 1) < bb->count && ++ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && ++ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && ++ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), ++ BB_ACK(p[prev])); ++ ++ if ((prev + 2) < bb->count) ++ memmove(p + prev + 1, p + prev + 2, ++ (bb->count - (prev + 2)) * 8); ++ bb->count--; ++ } ++ ++ if (space_desired && !badblocks_full(bb)) { ++ s = orig_start; ++ sectors = orig_len; ++ space_desired = 0; ++ if (retried++ < 3) ++ goto re_insert; ++ } ++ ++out: ++ if (added) { ++ set_changed(bb); ++ ++ if (!acknowledged) ++ bb->unacked_exist = 1; ++ else ++ badblocks_update_acked(bb); ++ } ++ ++ write_sequnlock_irqrestore(&bb->lock, flags); ++ ++ if (!added) ++ rv = 1; ++ ++ return rv; ++} ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +@@ -499,26 +1039,6 @@ int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + } + EXPORT_SYMBOL_GPL(badblocks_check); + +-static void badblocks_update_acked(struct badblocks *bb) +-{ +- u64 *p = bb->page; +- int i; +- bool unacked = false; +- +- if (!bb->unacked_exist) +- return; +- +- for (i = 0; i < bb->count ; i++) { +- if (!BB_ACK(p[i])) { +- unacked = true; +- break; +- } +- } +- +- if (!unacked) +- bb->unacked_exist = 0; +-} +- + /** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v4/v4-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch b/for-test/badblocks/v4/v4-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch new file mode 100644 index 0000000..ad5cfc3 --- /dev/null +++ b/for-test/badblocks/v4/v4-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch @@ -0,0 +1,399 @@ +From a7120f4e3a771de6f6c682798b0e9ebf3c6fcb49 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 22:16:10 +0800 +Subject: [PATCH v4 4/6] badblocks: improve badblocks_clear() for multiple + ranges handling + +With the fundamental ideas and helper routines from badblocks_set() +improvement, clearing bad block for multiple ranges is much simpler. + +With a similar idea from badblocks_set() improvement, this patch +simplifies bad block range clearing into 5 situations. No matter how +complicated the clearing condition is, we just look at the head part +of clearing range with relative already set bad block range from the +bad block table. The rested part will be handled in next run of the +while-loop. + +Based on existing helpers added from badblocks_set(), this patch adds +two more helpers, +- front_clear() + Clear the bad block range from bad block table which is front + overlapped with the clearing range. 
+- front_splitting_clear() + Handle the condition that the clearing range hits middle of an + already set bad block range from bad block table. + +Similar as badblocks_set(), the first part of clearing range is handled +with relative bad block range which is find by prev_badblocks(). In most +cases a valid hint is provided to prev_badblocks() to avoid unnecessary +bad block table iteration. + +This patch also explains the detail algorithm code comments at beginning +of badblocks.c, including which five simplified situations are +categrized and how all the bad block range clearing conditions are +handled by these five situations. + +Again, in order to make the code review easier and avoid the code +changes mixed together, this patch does not modify badblock_clear() and +implement another routine called _badblock_clear() for the improvement. +Later patch will delete current code of badblock_clear() and make it as +a wrapper to _badblock_clear(), so the code change can be much clear for +review. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 325 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 325 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index 13eaad18be15..c188b2e98140 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -330,6 +330,123 @@ + * avoided. In my test with the hint to prev_badblocks(), except for the first + * loop, all rested calls to prev_badblocks() can go into the fast path and + * return correct bad blocks table index immediately. ++ * ++ * ++ * Clearing a bad blocks range from the bad block table has similar idea as ++ * setting does, but much more simpler. The only thing needs to be noticed is ++ * when the clearing range hits middle of a bad block range, the existing bad ++ * block range will split into two, and one more item should be added into the ++ * bad block table. The simplified situations to be considered are, (The already ++ * set bad blocks ranges in bad block table are naming with prefix E, and the ++ * clearing bad blocks range is naming with prefix C) ++ * ++ * 1) A clearing range is not overlapped to any already set ranges in bad block ++ * table. ++ * +-----+ | +-----+ | +-----+ ++ * | C | | | C | | | C | ++ * +-----+ or +-----+ or +-----+ ++ * +---+ | +----+ +----+ | +---+ ++ * | E | | | E1 | | E2 | | | E | ++ * +---+ | +----+ +----+ | +---+ ++ * For the above situations, no bad block to be cleared and no failure ++ * happens, simply returns 0. ++ * 2) The clearing range hits middle of an already setting bad blocks range in ++ * the bad block table. ++ * +---+ ++ * | C | ++ * +---+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * In this situation if the bad block table is not full, the range E will be ++ * split into two ranges E1 and E2. The result is, ++ * +------+ +------+ ++ * | E1 | | E2 | ++ * +------+ +------+ ++ * 3) The clearing range starts exactly at same LBA as an already set bad block range ++ * from the bad block table. ++ * 3.1) Partially covered at head part ++ * +------------+ ++ * | C | ++ * +------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation, the overlapped already set range will update the ++ * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). 
No ++ * item deleted from bad block table. The result is, ++ * +----+ ++ * | E1 | ++ * +----+ ++ * 3.2) Exact fully covered ++ * +-----------------+ ++ * | C | ++ * +-----------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation the whole bad blocks range E will be cleared and its ++ * corresponded item is deleted from the bad block table. ++ * 4) The clearing range exactly ends at same LBA as an already set bad block ++ * range. ++ * +-------+ ++ * | C | ++ * +-------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For the above situation, the already set range E is updated to shrink its ++ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C). ++ * The result is, ++ * +---------+ ++ * | E | ++ * +---------+ ++ * 5) The clearing range is partially overlapped with an already set bad block ++ * range from the bad block table. ++ * 5.1) The already set bad block range is front overlapped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. The ++ * first part ends at the start LBA of range E, and the second part starts at ++ * same LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part C1 can be handled as condition 1), and the second part C2 can be ++ * handled as condition 3.1) in next loop. ++ * 5.2) The already set bad block range is behind overlaopped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. The ++ * first part C1 ends at same end LBA of range E, and the second part starts ++ * at end LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part clearing range C1 can be handled as condition 4), and ++ * the second part clearing range C2 can be handled as condition 1) in next ++ * loop. ++ * ++ * All bad blocks range clearing can be simplified into the above 5 situations ++ * by only handling the head part of the clearing range in each run of the ++ * while-loop. The idea is similar to bad blocks range setting but much ++ * simpler. + */ + + /* +@@ -930,6 +1047,214 @@ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, + return rv; + } + ++/* ++ * Clear the bad block range from bad block table which is front overlapped ++ * with the clearing range. The return value is how many sectors from an ++ * already set bad block range are cleared. If the whole bad block range is ++ * covered by the clearing range and fully cleared, 'delete' is set as 1 for ++ * the caller to reduce bb->count. 
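++ *
++ * A worked example (the numbers are illustrative only, they are not taken
++ * from this patch): if p[prev] covers sectors [100, 160) and the clearing
++ * range starts at sector 100 with length 40, p[prev] is shrunk to
++ * [140, 160) and 40 is returned. If the clearing range were [100, 200)
++ * instead, the whole record is removed, 60 is returned and '*deleted' is
++ * set to 1.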
++ */ ++static int front_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *deleted) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int cleared = 0; ++ ++ *deleted = 0; ++ if (s == BB_OFFSET(p[prev])) { ++ if (BB_LEN(p[prev]) > sectors) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors, ++ BB_LEN(p[prev]) - sectors, ++ BB_ACK(p[prev])); ++ cleared = sectors; ++ } else { ++ /* BB_LEN(p[prev]) <= sectors */ ++ cleared = BB_LEN(p[prev]); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ *deleted = 1; ++ } ++ } else if (s > BB_OFFSET(p[prev])) { ++ if (BB_END(p[prev]) <= (s + sectors)) { ++ cleared = BB_END(p[prev]) - s; ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ } else { ++ /* Splitting is handled in front_splitting_clear() */ ++ BUG(); ++ } ++ } ++ ++ return cleared; ++} ++ ++/* ++ * Handle the condition that the clearing range hits middle of an already set ++ * bad block range from bad block table. In this condition the existing bad ++ * block range is split into two after the middle part is cleared. ++ */ ++static int front_splitting_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ u64 end = BB_END(p[prev]); ++ int ack = BB_ACK(p[prev]); ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ ack); ++ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack); ++ return sectors; ++} ++ ++/* Do the exact work to clear bad block range from the bad block table */ ++static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) ++{ ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ int len = 0, cleared = 0; ++ int rv = 0; ++ u64 *p; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ sector_t target; ++ ++ /* When clearing we round the start up and the end down. ++ * This should not matter as the shift should align with ++ * the block size and no rounding should ever be needed. ++ * However it is better the think a block is bad when it ++ * isn't than to think a block is not bad when it is. 
++ */ ++ target = s + sectors; ++ roundup(s, bb->shift); ++ rounddown(target, bb->shift); ++ sectors = target - s; ++ } ++ ++ write_seqlock_irq(&bb->lock); ++ ++ bad.ack = true; ++ p = bb->page; ++ ++re_clear: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* Start before all badblocks */ ++ if (prev < 0) { ++ if (overlap_behind(bb, &bad, 0)) { ++ len = BB_OFFSET(p[0]) - s; ++ hint = prev; ++ } else { ++ len = sectors; ++ } ++ /* ++ * Both situations are to clear non-bad range, ++ * should be treated as successful ++ */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Clear will split a bad record but the table is full */ ++ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + sectors))) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if ((BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + bad.len))) { ++ /* Splitting */ ++ if ((bb->count + 1) < MAX_BADBLOCKS) { ++ len = front_splitting_clear(bb, prev, &bad); ++ bb->count += 1; ++ cleared++; ++ } else { ++ /* No space to split, give up */ ++ len = sectors; ++ } ++ } else { ++ int deleted = 0; ++ ++ len = front_clear(bb, prev, &bad, &deleted); ++ bb->count -= deleted; ++ cleared++; ++ hint = prev; ++ } ++ ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Not cover any badblocks range in the table */ ++ len = sectors; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_clear; ++ ++ WARN_ON(sectors < 0); ++ ++ if (cleared) { ++ badblocks_update_acked(bb); ++ set_changed(bb); ++ } ++ ++ write_sequnlock_irq(&bb->lock); ++ ++ if (!cleared) ++ rv = 1; ++ ++ return rv; ++} ++ ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v4/v4-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch b/for-test/badblocks/v4/v4-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch new file mode 100644 index 0000000..e519560 --- /dev/null +++ b/for-test/badblocks/v4/v4-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch @@ -0,0 +1,175 @@ +From 88b4c165ef9827f0febe7a527faea2a0d99feb66 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 2 Dec 2021 16:13:35 +0800 +Subject: [PATCH v4 5/6] badblocks: improve badblocks_check() for multiple + ranges handling + +This patch rewrites badblocks_check() with similar coding style as +_badblocks_set() and _badblocks_clear(). The only difference is bad +blocks checking may handle multiple ranges in bad tables now. 
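+
+For illustration only (this wrapper is not part of the patch and its name is
+made up; only badblocks_check() itself comes from this series), a minimal
+sketch of how a caller might consume the return convention detailed below:
+
+	static bool io_range_usable(struct badblocks *bb, sector_t s, int sectors)
+	{
+		sector_t first_bad;
+		int bad_sectors;
+
+		switch (badblocks_check(bb, s, sectors, &first_bad, &bad_sectors)) {
+		case 0:
+			return true;	/* no bad blocks touch [s, s + sectors) */
+		case 1:
+			return false;	/* only acknowledged bad blocks overlap */
+		default:
+			return false;	/* -1: unacknowledged bad blocks overlap */
+		}
+	}
+
+When the result is non-zero, first_bad and bad_sectors report the first bad
+range that was hit, as described below.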
+ +If a checking range covers multiple bad blocks range in bad block table, +like the following condition (C is the checking range, E1, E2, E3 are +three bad block ranges in bad block table), + +------------------------------------+ + | C | + +------------------------------------+ + +----+ +----+ +----+ + | E1 | | E2 | | E3 | + +----+ +----+ +----+ +The improved badblocks_check() algorithm will divide checking range C +into multiple parts, and handle them in 7 runs of a while-loop, + +--+ +----+ +----+ +----+ +----+ +----+ +----+ + |C1| | C2 | | C3 | | C4 | | C5 | | C6 | | C7 | + +--+ +----+ +----+ +----+ +----+ +----+ +----+ + +----+ +----+ +----+ + | E1 | | E2 | | E3 | + +----+ +----+ +----+ +And the start LBA and length of range E1 will be set as first_bad and +bad_sectors for the caller. + +The return value rule is consistent for multiple ranges. For example if +there are following bad block ranges in bad block table, + Index No. Start Len Ack + 0 400 20 1 + 1 500 50 1 + 2 650 20 0 +the return value, first_bad, bad_sectors by calling badblocks_set() with +different checking range can be the following values, + Checking Start, Len Return Value first_bad bad_sectors + 100, 100 0 N/A N/A + 100, 310 1 400 10 + 100, 440 1 400 10 + 100, 540 1 400 10 + 100, 600 -1 400 10 + 100, 800 -1 400 10 + +In order to make code review easier, this patch names the improved bad +block range checking routine as _badblocks_check() and does not change +existing badblock_check() code yet. Later patch will delete old code of +badblocks_check() and make it as a wrapper to call _badblocks_check(). +Then the new added code won't mess up with the old deleted code, it will +be more clear and easier for code review. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 97 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index c188b2e98140..f16c54925275 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -1254,6 +1254,103 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) + return rv; + } + ++/* Do the exact work to check bad blocks range from the bad block table */ ++static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, ++ sector_t *first_bad, int *bad_sectors) ++{ ++ int unacked_badblocks, acked_badblocks; ++ int prev = -1, hint = -1, set = 0; ++ struct badblocks_context bad; ++ unsigned int seq; ++ int len, rv; ++ u64 *p; ++ ++ WARN_ON(bb->shift < 0 || sectors == 0); ++ ++ if (bb->shift > 0) { ++ sector_t target; ++ ++ /* round the start down, and the end up */ ++ target = s + sectors; ++ rounddown(s, bb->shift); ++ roundup(target, bb->shift); ++ sectors = target - s; ++ } ++ ++retry: ++ seq = read_seqbegin(&bb->lock); ++ ++ p = bb->page; ++ unacked_badblocks = 0; ++ acked_badblocks = 0; ++ ++re_check: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if (BB_ACK(p[prev])) ++ acked_badblocks++; ++ else ++ 
unacked_badblocks++; ++ ++ if (BB_END(p[prev]) >= (s + sectors)) ++ len = sectors; ++ else ++ len = BB_END(p[prev]) - s; ++ ++ if (set == 0) { ++ *first_bad = BB_OFFSET(p[prev]); ++ *bad_sectors = BB_LEN(p[prev]); ++ set = 1; ++ } ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* not cover any badblocks range in the table */ ++ len = sectors; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_check; ++ ++ WARN_ON(sectors < 0); ++ ++ if (unacked_badblocks > 0) ++ rv = -1; ++ else if (acked_badblocks > 0) ++ rv = 1; ++ else ++ rv = 0; ++ ++ if (read_seqretry(&bb->lock, seq)) ++ goto retry; ++ ++ return rv; ++} + + /** + * badblocks_check() - check a given range for bad sectors +-- +2.31.1 + diff --git a/for-test/badblocks/v4/v4-0006-badblocks-switch-to-the-improved-badblock-handlin.patch b/for-test/badblocks/v4/v4-0006-badblocks-switch-to-the-improved-badblock-handlin.patch new file mode 100644 index 0000000..17b7597 --- /dev/null +++ b/for-test/badblocks/v4/v4-0006-badblocks-switch-to-the-improved-badblock-handlin.patch @@ -0,0 +1,365 @@ +From 839dec5ce2a8e6fae537d8eaa5bc4c7ae89e8a49 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 2 Mar 2021 10:48:43 +0800 +Subject: [PATCH v4 6/6] badblocks: switch to the improved badblock handling + code + +This patch removes old code of badblocks_set(), badblocks_clear() and +badblocks_check(), and make them as wrappers to call _badblocks_set(), +_badblocks_clear() and _badblocks_check(). + +By this change now the badblock handing switch to the improved algorithm +in _badblocks_set(), _badblocks_clear() and _badblocks_check(). + +This patch only contains the changes of old code deletion, new added +code for the improved algorithms are in previous patches. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 310 +--------------------------------------------- + 1 file changed, 3 insertions(+), 307 deletions(-) + +diff --git a/block/badblocks.c b/block/badblocks.c +index f16c54925275..4838750811ca 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -1389,75 +1389,7 @@ static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, + int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) + { +- int hi; +- int lo; +- u64 *p = bb->page; +- int rv; +- sector_t target = s + sectors; +- unsigned seq; +- +- if (bb->shift > 0) { +- /* round the start down, and the end up */ +- s >>= bb->shift; +- target += (1<<bb->shift) - 1; +- target >>= bb->shift; +- sectors = target - s; +- } +- /* 'target' is now the first block after the bad range */ +- +-retry: +- seq = read_seqbegin(&bb->lock); +- lo = 0; +- rv = 0; +- hi = bb->count; +- +- /* Binary search between lo and hi for 'target' +- * i.e. for the last range that starts before 'target' +- */ +- /* INVARIANT: ranges before 'lo' and at-or-after 'hi' +- * are known not to be the last range before target. 
+- * VARIANT: hi-lo is the number of possible +- * ranges, and decreases until it reaches 1 +- */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a < target) +- /* This could still be the one, earlier ranges +- * could not. +- */ +- lo = mid; +- else +- /* This and later ranges are definitely out. */ +- hi = mid; +- } +- /* 'lo' might be the last that started before target, but 'hi' isn't */ +- if (hi > lo) { +- /* need to check all range that end after 's' to see if +- * any are unacknowledged. +- */ +- while (lo >= 0 && +- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { +- if (BB_OFFSET(p[lo]) < target) { +- /* starts before the end, and finishes after +- * the start, so they must overlap +- */ +- if (rv != -1 && BB_ACK(p[lo])) +- rv = 1; +- else +- rv = -1; +- *first_bad = BB_OFFSET(p[lo]); +- *bad_sectors = BB_LEN(p[lo]); +- } +- lo--; +- } +- } +- +- if (read_seqretry(&bb->lock, seq)) +- goto retry; +- +- return rv; ++ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); + } + EXPORT_SYMBOL_GPL(badblocks_check); + +@@ -1479,154 +1411,7 @@ EXPORT_SYMBOL_GPL(badblocks_check); + int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) + { +- u64 *p; +- int lo, hi; +- int rv = 0; +- unsigned long flags; +- +- if (bb->shift < 0) +- /* badblocks are disabled */ +- return 1; +- +- if (bb->shift) { +- /* round the start down, and the end up */ +- sector_t next = s + sectors; +- +- s >>= bb->shift; +- next += (1<<bb->shift) - 1; +- next >>= bb->shift; +- sectors = next - s; +- } +- +- write_seqlock_irqsave(&bb->lock, flags); +- +- p = bb->page; +- lo = 0; +- hi = bb->count; +- /* Find the last range that starts at-or-before 's' */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a <= s) +- lo = mid; +- else +- hi = mid; +- } +- if (hi > lo && BB_OFFSET(p[lo]) > s) +- hi = lo; +- +- if (hi > lo) { +- /* we found a range that might merge with the start +- * of our new range +- */ +- sector_t a = BB_OFFSET(p[lo]); +- sector_t e = a + BB_LEN(p[lo]); +- int ack = BB_ACK(p[lo]); +- +- if (e >= s) { +- /* Yes, we can merge with a previous range */ +- if (s == a && s + sectors >= e) +- /* new range covers old */ +- ack = acknowledged; +- else +- ack = ack && acknowledged; +- +- if (e < s + sectors) +- e = s + sectors; +- if (e - a <= BB_MAX_LEN) { +- p[lo] = BB_MAKE(a, e-a, ack); +- s = e; +- } else { +- /* does not all fit in one range, +- * make p[lo] maximal +- */ +- if (BB_LEN(p[lo]) != BB_MAX_LEN) +- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); +- s = a + BB_MAX_LEN; +- } +- sectors = e - s; +- } +- } +- if (sectors && hi < bb->count) { +- /* 'hi' points to the first range that starts after 's'. 
+- * Maybe we can merge with the start of that range +- */ +- sector_t a = BB_OFFSET(p[hi]); +- sector_t e = a + BB_LEN(p[hi]); +- int ack = BB_ACK(p[hi]); +- +- if (a <= s + sectors) { +- /* merging is possible */ +- if (e <= s + sectors) { +- /* full overlap */ +- e = s + sectors; +- ack = acknowledged; +- } else +- ack = ack && acknowledged; +- +- a = s; +- if (e - a <= BB_MAX_LEN) { +- p[hi] = BB_MAKE(a, e-a, ack); +- s = e; +- } else { +- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); +- s = a + BB_MAX_LEN; +- } +- sectors = e - s; +- lo = hi; +- hi++; +- } +- } +- if (sectors == 0 && hi < bb->count) { +- /* we might be able to combine lo and hi */ +- /* Note: 's' is at the end of 'lo' */ +- sector_t a = BB_OFFSET(p[hi]); +- int lolen = BB_LEN(p[lo]); +- int hilen = BB_LEN(p[hi]); +- int newlen = lolen + hilen - (s - a); +- +- if (s >= a && newlen < BB_MAX_LEN) { +- /* yes, we can combine them */ +- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); +- +- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); +- memmove(p + hi, p + hi + 1, +- (bb->count - hi - 1) * 8); +- bb->count--; +- } +- } +- while (sectors) { +- /* didn't merge (it all). +- * Need to add a range just before 'hi' +- */ +- if (bb->count >= MAX_BADBLOCKS) { +- /* No room for more */ +- rv = 1; +- break; +- } else { +- int this_sectors = sectors; +- +- memmove(p + hi + 1, p + hi, +- (bb->count - hi) * 8); +- bb->count++; +- +- if (this_sectors > BB_MAX_LEN) +- this_sectors = BB_MAX_LEN; +- p[hi] = BB_MAKE(s, this_sectors, acknowledged); +- sectors -= this_sectors; +- s += this_sectors; +- } +- } +- +- bb->changed = 1; +- if (!acknowledged) +- bb->unacked_exist = 1; +- else +- badblocks_update_acked(bb); +- write_sequnlock_irqrestore(&bb->lock, flags); +- +- return rv; ++ return _badblocks_set(bb, s, sectors, acknowledged); + } + EXPORT_SYMBOL_GPL(badblocks_set); + +@@ -1646,96 +1431,7 @@ EXPORT_SYMBOL_GPL(badblocks_set); + */ + int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) + { +- u64 *p; +- int lo, hi; +- sector_t target = s + sectors; +- int rv = 0; +- +- if (bb->shift > 0) { +- /* When clearing we round the start up and the end down. +- * This should not matter as the shift should align with +- * the block size and no rounding should ever be needed. +- * However it is better the think a block is bad when it +- * isn't than to think a block is not bad when it is. +- */ +- s += (1<<bb->shift) - 1; +- s >>= bb->shift; +- target >>= bb->shift; +- sectors = target - s; +- } +- +- write_seqlock_irq(&bb->lock); +- +- p = bb->page; +- lo = 0; +- hi = bb->count; +- /* Find the last range that starts before 'target' */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a < target) +- lo = mid; +- else +- hi = mid; +- } +- if (hi > lo) { +- /* p[lo] is the last range that could overlap the +- * current range. Earlier ranges could also overlap, +- * but only this one can overlap the end of the range. 
+- */ +- if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) && +- (BB_OFFSET(p[lo]) < target)) { +- /* Partial overlap, leave the tail of this range */ +- int ack = BB_ACK(p[lo]); +- sector_t a = BB_OFFSET(p[lo]); +- sector_t end = a + BB_LEN(p[lo]); +- +- if (a < s) { +- /* we need to split this range */ +- if (bb->count >= MAX_BADBLOCKS) { +- rv = -ENOSPC; +- goto out; +- } +- memmove(p+lo+1, p+lo, (bb->count - lo) * 8); +- bb->count++; +- p[lo] = BB_MAKE(a, s-a, ack); +- lo++; +- } +- p[lo] = BB_MAKE(target, end - target, ack); +- /* there is no longer an overlap */ +- hi = lo; +- lo--; +- } +- while (lo >= 0 && +- (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) && +- (BB_OFFSET(p[lo]) < target)) { +- /* This range does overlap */ +- if (BB_OFFSET(p[lo]) < s) { +- /* Keep the early parts of this range. */ +- int ack = BB_ACK(p[lo]); +- sector_t start = BB_OFFSET(p[lo]); +- +- p[lo] = BB_MAKE(start, s - start, ack); +- /* now low doesn't overlap, so.. */ +- break; +- } +- lo--; +- } +- /* 'lo' is strictly before, 'hi' is strictly after, +- * anything between needs to be discarded +- */ +- if (hi - lo > 1) { +- memmove(p+lo+1, p+hi, (bb->count - hi) * 8); +- bb->count -= (hi - lo - 1); +- } +- } +- +- badblocks_update_acked(bb); +- bb->changed = 1; +-out: +- write_sequnlock_irq(&bb->lock); +- return rv; ++ return _badblocks_clear(bb, s, sectors); + } + EXPORT_SYMBOL_GPL(badblocks_clear); + +-- +2.31.1 + diff --git a/for-test/badblocks/v2/v2-0007-test-user-space-code-to-test-badblocks-APIs.patch b/for-test/badblocks/v4/v4-0007-test-user-space-code-to-test-badblocks-APIs.patch index 091d4d3..c354234 100644 --- a/for-test/badblocks/v2/v2-0007-test-user-space-code-to-test-badblocks-APIs.patch +++ b/for-test/badblocks/v4/v4-0007-test-user-space-code-to-test-badblocks-APIs.patch @@ -255,19 +255,19 @@ index 0000000..ca52647 + * + * When the caller of badblocks_set() wants to set a range of bad blocks, the + * setting range can be acked or unacked. And the setting range may merge, -+ * overwrite, skip the overlaypped already set range, depends on who they are ++ * overwrite, skip the overlapped already set range, depends on who they are + * overlapped or adjacent, and the acknowledgment type of the ranges. It can be + * more complicated when the setting range covers multiple already set bad block -+ * ranges, with restritctions of maximum length of each bad range and the bad ++ * ranges, with restrictions of maximum length of each bad range and the bad + * table space limitation. + * -+ * It is difficut and unnecessary to take care of all the possible situations, ++ * It is difficult and unnecessary to take care of all the possible situations, + * for setting a large range of bad blocks, we can handle it by dividing the + * large range into smaller ones when encounter overlap, max range length or + * bad table full conditions. Every time only a smaller piece of the bad range + * is handled with a limited number of conditions how it is interacted with + * possible overlapped or adjacent already set bad block ranges. Then the hard -+ * complicated problem can be much simpler to habndle in proper way. ++ * complicated problem can be much simpler to handle in proper way. 
+ * + * When setting a range of bad blocks to the bad table, the simplified situations + * to be considered are, (The already set bad blocks ranges are naming with @@ -301,12 +301,12 @@ index 0000000..ca52647 + * +-------------+ + * | S | + * +-------------+ -+ * 2.1.2) If S is uncked setting and E is acked, the setting will be dinied, and ++ * 2.1.2) If S is unacked setting and E is acked, the setting will be dinied, and + * the result is, + * +-------------+ + * | E | + * +-------------+ -+ * 2.1.3) If S is acked setting and E is unacked, range S can overwirte on E. ++ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E. + * An extra slot from the bad blocks table will be allocated for S, and head + * of E will move to end of the inserted range E. The result is, + * +--------+----+ @@ -318,12 +318,12 @@ index 0000000..ca52647 + * +-------------+ + * | S | + * +-------------+ -+ * 2.2.2) If S is uncked setting and E is acked, the setting will be dinied, and ++ * 2.2.2) If S is unacked setting and E is acked, the setting will be dinied, and + * the result is, + * +-------------+ + * | E | + * +-------------+ -+ * 2.2.3) If S is acked setting and E is unacked, range S can overwirte all of ++ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of + bad blocks range E. The result is, + * +-------------+ + * | S | @@ -378,7 +378,7 @@ index 0000000..ca52647 + * +-------------+ + * | E | + * +-------------+ -+ * 4.1.1) If range S and E have same ackknowledg value (both acked or unacked), ++ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked), + * they will be merged into one, the result is, + * +-------------+ + * | S | @@ -401,7 +401,7 @@ index 0000000..ca52647 + * +--------------+ + * | E | + * +--------------+ -+ * 4.2.1) If range S and E have same ackknowledg value (both acked or unacked), ++ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked), + * they will be merged into one, the result is, + * +--------------+ + * | S | @@ -504,7 +504,7 @@ index 0000000..ca52647 + * +-------+-------------+ E1: 1 + * | E1 | E2 | E2: 0 + * +-------+-------------+ -+ * With previosu simplified routines, after overwiting part of E2 with S, ++ * With previous simplified routines, after overwriting part of E2 with S, + * the bad blocks table should be (E3 is remaining part of E2 which is not + * overwritten by S), + * acknowledged @@ -514,7 +514,7 @@ index 0000000..ca52647 + * The above result is correct but not perfect. Range E1 and S in the bad + * blocks table are all acked, merging them into a larger one range may + * occupy less bad blocks table space and make badblocks_check() faster. -+ * Therefore in such situation, after overwiting range S, the previous range ++ * Therefore in such situation, after overwriting range S, the previous range + * E1 should be checked for possible front combination. Then the ideal + * result can be, + * +----------------+----+ acknowledged @@ -533,7 +533,7 @@ index 0000000..ca52647 + * +-------+ + * | E | + * +-------+ -+ * For the above special stiuation, when the setting range S are all handled ++ * For the above special situation, when the setting range S are all handled + * and the loop ends, an extra check is necessary for whether next already + * set range E is right after S and mergeable. 
+ * 6.2.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge @@ -543,7 +543,7 @@ index 0000000..ca52647 + * | S | + * +--------------+ + * 6.2.2) Otherwise these two ranges cannot merge, just insert the setting range -+ * S infront of the already set range E in the bad blocks table. The result ++ * S in front of the already set range E in the bad blocks table. The result + * is, + * +------+-------+ + * | S | E | @@ -567,9 +567,9 @@ index 0000000..ca52647 + * + * Clearing a bad blocks range from the bad block table has similar idea as + * setting does, but much more simpler. The only thing needs to be noticed is -+ * when the clearning range hits middle of a bad block range, the existing bad ++ * when the clearing range hits middle of a bad block range, the existing bad + * block range will split into two, and one more item should be added into the -+ * bad block table. The simplified situations to beconsidered are, (The already ++ * bad block table. The simplified situations to be considered are, (The already + * set bad blocks ranges in bad block table are naming with prefix E, and the + * clearing bad blocks range is naming with prefix C) + * diff --git a/for-test/badblocks/v5/v5-0000-cover-letter.patch b/for-test/badblocks/v5/v5-0000-cover-letter.patch new file mode 100644 index 0000000..efd498c --- /dev/null +++ b/for-test/badblocks/v5/v5-0000-cover-letter.patch @@ -0,0 +1,70 @@ +From d1f471dc0f862dfc71d3bbebc60631f83208217f Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 10 Dec 2021 15:27:33 +0800 +Subject: [PATCH v5 0/7] badblocks improvement for multiple bad block ranges + +Hi folks, + +This is the v5 effort to improve badblocks code APIs to handle multiple +ranges in bad block table. + +Comparing to previous v4 series, the changes in v5 series include, +- Typos in code comments which are pointed out by Geliang Tang and + Wols Lists. +- Drop extra local variables in helper routines which suggested by + Geliang Tang. +- Change the user space testing code with all above changes. + +There is NO in-memory or on-disk format change in the whole series, all +existing API and data structures are consistent. This series just only +improve the code algorithm to handle more corner cases, the interfaces +are same and consistency to all existing callers (md raid and nvdimm +drivers). + +The original motivation of the change is from the requirement from our +customer, that current badblocks routines don't handle multiple ranges. +For example if the bad block setting range covers multiple ranges from +bad block table, only the first two bad block ranges merged and rested +ranges are intact. The expected behavior should be all the covered +ranges to be handled. + +All the patches are tested by modified user space code and the code +logic works as expected. The modified user space testing code is +provided in the last patch. The testing code is an example how the +improved code is tested. + +The whole change is divided into 6 patches to make the code review more +clear and easier. If people prefer, I'd like to post a single large +patch finally after the code review accomplished. + +Please review the code and response. Thank you all in advance. 
+ +Coly Li + +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Richard Fan <richard.fan@suse.com> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +Cc: Wols Lists <antlists@youngman.org.uk> +--- + +Coly Li (6): + badblocks: add more helper structure and routines in badblocks.h + badblocks: add helper routines for badblock ranges handling + badblocks: improve badblocks_set() for multiple ranges handling + badblocks: improve badblocks_clear() for multiple ranges handling + badblocks: improve badblocks_check() for multiple ranges handling + badblocks: switch to the improved badblock handling code +Coly Li (1): + test: user space code to test badblocks APIs + + block/badblocks.c | 1604 ++++++++++++++++++++++++++++++------- + include/linux/badblocks.h | 30 + + 2 files changed, 1339 insertions(+), 295 deletions(-) + +-- +2.31.1 + diff --git a/for-test/badblocks/v5/v5-0001-badblocks-add-more-helper-structure-and-routines-.patch b/for-test/badblocks/v5/v5-0001-badblocks-add-more-helper-structure-and-routines-.patch new file mode 100644 index 0000000..d66b0c8 --- /dev/null +++ b/for-test/badblocks/v5/v5-0001-badblocks-add-more-helper-structure-and-routines-.patch @@ -0,0 +1,91 @@ +From d5352d6d537923232aa274cc753366a7851a1f13 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 2 Dec 2021 15:29:38 +0800 +Subject: [PATCH v5 1/6] badblocks: add more helper structure and routines in + badblocks.h + +This patch adds the following helper structure and routines into +badblocks.h, +- struct badblocks_context + This structure is used in improved badblocks code for bad table + iteration. +- BB_END() + The macro to calculate end LBA of a bad range record from bad + table. +- badblocks_full() and badblocks_empty() + The inline routines to check whether bad table is full or empty. +- set_changed() and clear_changed() + The inline routines to set and clear 'changed' tag from struct + badblocks. + +These new helper structure and routines can help to make the code more +clear, they will be used in the improved badblocks code in following +patches. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + include/linux/badblocks.h | 30 ++++++++++++++++++++++++++++++ + 1 file changed, 30 insertions(+) + +diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h +index 2426276b9bd3..670f2dae692f 100644 +--- a/include/linux/badblocks.h ++++ b/include/linux/badblocks.h +@@ -15,6 +15,7 @@ + #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) + #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) + #define BB_ACK(x) (!!((x) & BB_ACK_MASK)) ++#define BB_END(x) (BB_OFFSET(x) + BB_LEN(x)) + #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) + + /* Bad block numbers are stored sorted in a single page. 
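+
+(A small user-space sketch, not part of this patch, showing the 64-bit
+encoding these macros imply. The mask literals below are written out locally
+for the illustration and are derived from BB_MAKE(): bit 63 is the ack flag,
+bits 62..9 hold the start sector, bits 8..0 hold the length minus one.)
+
+	#include <stdio.h>
+	#include <stdint.h>
+
+	typedef uint64_t u64;
+
+	#define BB_OFFSET(x)	(((x) & 0x7ffffffffffffe00ULL) >> 9)
+	#define BB_LEN(x)	(((x) & 0x00000000000001ffULL) + 1)
+	#define BB_ACK(x)	(!!((x) & 0x8000000000000000ULL))
+	#define BB_END(x)	(BB_OFFSET(x) + BB_LEN(x))
+	#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
+
+	int main(void)
+	{
+		/* 16 bad sectors starting at sector 4096, acknowledged */
+		u64 bb = BB_MAKE(4096ULL, 16, 1);
+
+		printf("offset=%llu len=%llu end=%llu ack=%d\n",
+		       (unsigned long long)BB_OFFSET(bb),
+		       (unsigned long long)BB_LEN(bb),
+		       (unsigned long long)BB_END(bb),
+		       BB_ACK(bb));
+		/* prints: offset=4096 len=16 end=4112 ack=1 */
+		return 0;
+	}
+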
+@@ -41,6 +42,12 @@ struct badblocks { + sector_t size; /* in sectors */ + }; + ++struct badblocks_context { ++ sector_t start; ++ sector_t len; ++ int ack; ++}; ++ + int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors); + int badblocks_set(struct badblocks *bb, sector_t s, int sectors, +@@ -63,4 +70,27 @@ static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb) + } + badblocks_exit(bb); + } ++ ++static inline int badblocks_full(struct badblocks *bb) ++{ ++ return (bb->count >= MAX_BADBLOCKS); ++} ++ ++static inline int badblocks_empty(struct badblocks *bb) ++{ ++ return (bb->count == 0); ++} ++ ++static inline void set_changed(struct badblocks *bb) ++{ ++ if (bb->changed != 1) ++ bb->changed = 1; ++} ++ ++static inline void clear_changed(struct badblocks *bb) ++{ ++ if (bb->changed != 0) ++ bb->changed = 0; ++} ++ + #endif +-- +2.31.1 + diff --git a/for-test/badblocks/v5/v5-0002-badblocks-add-helper-routines-for-badblock-ranges.patch b/for-test/badblocks/v5/v5-0002-badblocks-add-helper-routines-for-badblock-ranges.patch new file mode 100644 index 0000000..fc084aa --- /dev/null +++ b/for-test/badblocks/v5/v5-0002-badblocks-add-helper-routines-for-badblock-ranges.patch @@ -0,0 +1,459 @@ +From 2accaa280961524bc5eea98399906d199eea2568 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 17:16:57 +0800 +Subject: [PATCH v5 2/6] badblocks: add helper routines for badblock ranges + handling + +This patch adds several helper routines to improve badblock ranges +handling. These helper routines will be used later in the improved +version of badblocks_set()/badblocks_clear()/badblocks_check(). + +- Helpers prev_by_hint() and prev_badblocks() are used to find the bad + range from bad table which the searching range starts at or after. + +- The following helpers are to decide the relative layout between the + manipulating range and existing bad block range from bad table. + - can_merge_behind() + Return 'true' if the manipulating range can backward merge with the + bad block range. + - can_merge_front() + Return 'true' if the manipulating range can forward merge with the + bad block range. + - can_combine_front() + Return 'true' if two adjacent bad block ranges before the + manipulating range can be merged. + - overlap_front() + Return 'true' if the manipulating range exactly overlaps with the + bad block range in front of its range. + - overlap_behind() + Return 'true' if the manipulating range exactly overlaps with the + bad block range behind its range. + - can_front_overwrite() + Return 'true' if the manipulating range can forward overwrite the + bad block range in front of its range. + +- The following helpers are to add the manipulating range into the bad + block table. Different routine is called with the specific relative + layout between the manipulating range and other bad block range in the + bad block table. + - behind_merge() + Merge the manipulating range with the bad block range behind its + range, and return the number of merged length in unit of sector. + - front_merge() + Merge the manipulating range with the bad block range in front of + its range, and return the number of merged length in unit of sector. + - front_combine() + Combine the two adjacent bad block ranges before the manipulating + range into a larger one. + - front_overwrite() + Overwrite partial of whole bad block range which is in front of the + manipulating range. 
The overwrite may split existing bad block range + and generate more bad block ranges into the bad block table. + - insert_at() + Insert the manipulating range at a specific location in the bad + block table. + +All the above helpers are used in later patches to improve the bad block +ranges handling for badblocks_set()/badblocks_clear()/badblocks_check(). + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 376 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 376 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index d39056630d9c..30958cc4469f 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -16,6 +16,382 @@ + #include <linux/types.h> + #include <linux/slab.h> + ++/* ++ * Find the range starts at-or-before 's' from bad table. The search ++ * starts from index 'hint' and stops at index 'hint_end' from the bad ++ * table. ++ */ ++static int prev_by_hint(struct badblocks *bb, sector_t s, int hint) ++{ ++ int hint_end = hint + 2; ++ u64 *p = bb->page; ++ int ret = -1; ++ ++ while ((hint < hint_end) && ((hint + 1) <= bb->count) && ++ (BB_OFFSET(p[hint]) <= s)) { ++ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) { ++ ret = hint; ++ break; ++ } ++ hint++; ++ } ++ ++ return ret; ++} ++ ++/* ++ * Find the range starts at-or-before bad->start. If 'hint' is provided ++ * (hint >= 0) then search in the bad table from hint firstly. It is ++ * very probably the wanted bad range can be found from the hint index, ++ * then the unnecessary while-loop iteration can be avoided. ++ */ ++static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad, ++ int hint) ++{ ++ sector_t s = bad->start; ++ int ret = -1; ++ int lo, hi; ++ u64 *p; ++ ++ if (!bb->count) ++ goto out; ++ ++ if (hint >= 0) { ++ ret = prev_by_hint(bb, s, hint); ++ if (ret >= 0) ++ goto out; ++ } ++ ++ lo = 0; ++ hi = bb->count; ++ p = bb->page; ++ ++ while (hi - lo > 1) { ++ int mid = (lo + hi)/2; ++ sector_t a = BB_OFFSET(p[mid]); ++ ++ if (a <= s) ++ lo = mid; ++ else ++ hi = mid; ++ } ++ ++ if (BB_OFFSET(p[lo]) <= s) ++ ret = lo; ++out: ++ return ret; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be backward merged ++ * with the bad range (from the bad table) index by 'behind'. ++ */ ++static bool can_merge_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ ++ if ((s <= BB_OFFSET(p[behind])) && ++ ((s + sectors) >= BB_OFFSET(p[behind])) && ++ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) && ++ BB_ACK(p[behind]) == bad->ack) ++ return true; ++ return false; ++} ++ ++/* ++ * Do backward merge for range indicated by 'bad' and the bad range ++ * (from the bad table) indexed by 'behind'. The return value is merged ++ * sectors from bad->len. 
++ */ ++static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_OFFSET(p[behind])); ++ WARN_ON((s + sectors) < BB_OFFSET(p[behind])); ++ ++ if (s < BB_OFFSET(p[behind])) { ++ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); ++ ++ merged = min_t(sector_t, sectors, BB_OFFSET(p[behind]) - s); ++ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack); ++ } else { ++ merged = min_t(sector_t, sectors, BB_LEN(p[behind])); ++ } ++ ++ WARN_ON(merged == 0); ++ ++ return merged; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be forward ++ * merged with the bad range (from the bad table) indexed by 'prev'. ++ */ ++static bool can_merge_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ ++ if (BB_ACK(p[prev]) == bad->ack && ++ (s < BB_END(p[prev]) || ++ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN)))) ++ return true; ++ return false; ++} ++ ++/* ++ * Do forward merge for range indicated by 'bad' and the bad range ++ * (from bad table) indexed by 'prev'. The return value is sectors ++ * merged from bad->len. ++ */ ++static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_END(p[prev])); ++ ++ if (s < BB_END(p[prev])) { ++ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s); ++ } else { ++ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev])); ++ if ((prev + 1) < bb->count && ++ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) { ++ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]); ++ } ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + merged, bad->ack); ++ } ++ ++ return merged; ++} ++ ++/* ++ * 'Combine' is a special case which can_merge_front() is not able to ++ * handle: If a bad range (indexed by 'prev' from bad table) exactly ++ * starts as bad->start, and the bad range ahead of 'prev' (indexed by ++ * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and ++ * the sum of their lengths does not exceed BB_MAX_LEN limitation, then ++ * these two bad range (from bad table) can be combined. ++ * ++ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad ++ * table can be combined. ++ */ ++static bool can_combine_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if ((prev > 0) && ++ (BB_OFFSET(p[prev]) == bad->start) && ++ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) && ++ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) && ++ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev]))) ++ return true; ++ return false; ++} ++ ++/* ++ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad ++ * table) into one larger bad range, and the new range is indexed by ++ * 'prev - 1'. ++ */ ++static void front_combine(struct badblocks *bb, int prev) ++{ ++ u64 *p = bb->page; ++ ++ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]), ++ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]), ++ BB_ACK(p[prev])); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8); ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly forward ++ * overlapped with the bad range (from bad table) indexed by 'front'. 
++ * Exactly forward overlap means the bad range (from bad table) indexed ++ * by 'prev' does not cover the whole range indicated by 'bad'. ++ */ ++static bool overlap_front(struct badblocks *bb, int front, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start >= BB_OFFSET(p[front]) && ++ bad->start < BB_END(p[front])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly backward ++ * overlapped with the bad range (from bad table) indexed by 'behind'. ++ */ ++static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start < BB_OFFSET(p[behind]) && ++ (bad->start + bad->len) > BB_OFFSET(p[behind])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can overwrite the bad ++ * range (from bad table) indexed by 'prev'. ++ * ++ * The range indicated by 'bad' can overwrite the bad range indexed by ++ * 'prev' when, ++ * 1) The whole range indicated by 'bad' can cover partial or whole bad ++ * range (from bad table) indexed by 'prev'. ++ * 2) The ack value of 'bad' is larger or equal to the ack value of bad ++ * range 'prev'. ++ * ++ * If the overwriting doesn't cover the whole bad range (from bad table) ++ * indexed by 'prev', new range might be split from existing bad range, ++ * 1) The overwrite covers head or tail part of existing bad range, 1 ++ * extra bad range will be split and added into the bad table. ++ * 2) The overwrite covers middle of existing bad range, 2 extra bad ++ * ranges will be split (ahead and after the overwritten range) and ++ * added into the bad table. ++ * The number of extra split ranges of the overwriting is stored in ++ * 'extra' and returned for the caller. ++ */ ++static bool can_front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *extra) ++{ ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(!overlap_front(bb, prev, bad)); ++ ++ if (BB_ACK(p[prev]) >= bad->ack) ++ return false; ++ ++ if (BB_END(p[prev]) <= (bad->start + bad->len)) { ++ len = BB_END(p[prev]) - bad->start; ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 0; ++ else ++ *extra = 1; ++ ++ bad->len = len; ++ } else { ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 1; ++ else ++ /* ++ * prev range will be split into two, beside the overwritten ++ * one, an extra slot needed from bad table. ++ */ ++ *extra = 2; ++ } ++ ++ if ((bb->count + (*extra)) >= MAX_BADBLOCKS) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Do the overwrite from the range indicated by 'bad' to the bad range ++ * (from bad table) indexed by 'prev'. ++ * The previously called can_front_overwrite() will provide how many ++ * extra bad range(s) might be split and added into the bad table. All ++ * the splitting cases in the bad table will be handled here. 
++ */ ++static int front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int extra) ++{ ++ u64 *p = bb->page; ++ sector_t orig_end = BB_END(p[prev]); ++ int orig_ack = BB_ACK(p[prev]); ++ ++ switch (extra) { ++ case 0: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]), ++ bad->ack); ++ break; ++ case 1: ++ if (BB_OFFSET(p[prev]) == bad->start) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->len, bad->ack); ++ memmove(p + prev + 2, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start + bad->len, ++ orig_end - BB_END(p[prev]), ++ orig_ack); ++ } else { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ /* ++ * prev +2 -> prev + 1 + 1, which is for, ++ * 1) prev + 1: the slot index of the previous one ++ * 2) + 1: one more slot for extra being 1. ++ */ ++ memmove(p + prev + 2, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ } ++ break; ++ case 2: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ /* ++ * prev + 3 -> prev + 1 + 2, which is for, ++ * 1) prev + 1: the slot index of the previous one ++ * 2) + 2: two more slots for extra being 2. ++ */ ++ memmove(p + prev + 3, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]), ++ orig_end - BB_END(p[prev + 1]), ++ BB_ACK(p[prev])); ++ break; ++ default: ++ break; ++ } ++ ++ return bad->len; ++} ++ ++/* ++ * Explicitly insert a range indicated by 'bad' to the bad table, where ++ * the location is indexed by 'at'. ++ */ ++static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(badblocks_full(bb)); ++ ++ len = min_t(sector_t, bad->len, BB_MAX_LEN); ++ if (at < bb->count) ++ memmove(p + at + 1, p + at, (bb->count - at) * 8); ++ p[at] = BB_MAKE(bad->start, len, bad->ack); ++ ++ return len; ++} ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v5/v5-0003-badblocks-improve-badblocks_set-for-multiple-rang.patch b/for-test/badblocks/v5/v5-0003-badblocks-improve-badblocks_set-for-multiple-rang.patch new file mode 100644 index 0000000..d5e7ce8 --- /dev/null +++ b/for-test/badblocks/v5/v5-0003-badblocks-improve-badblocks_set-for-multiple-rang.patch @@ -0,0 +1,663 @@ +From cdb864aa796ef2e65a99561b50561c7beec8ab58 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 2 Dec 2021 15:57:50 +0800 +Subject: [PATCH v5 3/6] badblocks: improve badblocks_set() for multiple ranges + handling + +Recently I received a bug report that current badblocks code does not +properly handle multiple ranges. For example, + badblocks_set(bb, 32, 1, true); + badblocks_set(bb, 34, 1, true); + badblocks_set(bb, 36, 1, true); + badblocks_set(bb, 32, 12, true); +Then indeed badblocks_show() reports, + 32 3 + 36 1 +But the expected bad blocks table should be, + 32 12 +Obviously only the first 2 ranges are merged and badblocks_set() returns +and ignores the rest setting range. + +This behavior is improper, if the caller of badblocks_set() wants to set +a range of blocks into bad blocks table, all of the blocks in the range +should be handled even the previous part encountering failure. 
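For reference, a minimal reproducer sketch of the report above. This is not part of the patch: it assumes a kernel context where the badblocks API (badblocks_set(), badblocks_show()) is available and where bb was already initialized with badblocks_init(); the wrapper function name and the pr_info() dump are illustrative only.

/*
 * Sketch only: replay the four badblocks_set() calls from the bug report
 * and dump the resulting bad blocks table.  With the old code the dump
 * shows "32 3" and "36 1"; the expected output is a single "32 12" range.
 */
static void badblocks_merge_repro(struct badblocks *bb)
{
	char *page = (char *)__get_free_page(GFP_KERNEL);

	if (!page)
		return;

	badblocks_set(bb, 32, 1, true);
	badblocks_set(bb, 34, 1, true);
	badblocks_set(bb, 36, 1, true);
	badblocks_set(bb, 32, 12, true);

	badblocks_show(bb, page, 0);	/* unack == 0: dump every range */
	pr_info("%s", page);

	free_page((unsigned long)page);
}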
+
+The desired way to set a bad blocks range by badblocks_set() is,
+- Set as many blocks as possible from the setting range into the bad
+  blocks table.
+- Merge the bad blocks ranges and occupy as few slots as possible in
+  the bad blocks table.
+- Fast.
+
+Indeed the above proposal is complicated, especially with the following
+restrictions,
+- The setting bad blocks range can be acknowledged or not acknowledged.
+- The bad blocks table size is limited.
+- Memory allocation should be avoided.
+
+The basic idea of the patch is to categorize all possible bad blocks
+range setting combinations into a much smaller set of simpler
+conditions. Inside badblocks_set() there is an implicit loop composed
+by jumping between labels 're_insert' and 'update_sectors'. No matter
+how large the setting bad blocks range is, in every loop iteration just
+a minimized range from the head is handled by a pre-defined behavior
+from one of the categorized conditions. The logic is simple and the
+code flow is manageable.
+
+The different relative layouts between the setting range and existing
+bad block ranges are checked and handled (merge, combine, overwrite,
+insert) by the helpers from the previous patch. This patch makes all
+the helpers work together with the above idea.
+
+This patch only contains the algorithm improvement for badblocks_set().
+The following patches contain the improvements for badblocks_clear()
+and badblocks_check(). But the algorithm in badblocks_set() is
+fundamental and typical, the improvements in the clear and check
+routines are based on all the helpers and ideas in this patch.
+
+In order to make the change clearer for code review, this patch does
+not directly modify the existing badblocks_set(), and just adds a new
+routine named _badblocks_set(). A later patch will remove the current
+badblocks_set() code and turn it into a wrapper of _badblocks_set(), so
+the newly added code is not mixed with the deleted code and the review
+can be easier.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+Cc: Wols Lists <antlists@youngman.org.uk>
+---
+ block/badblocks.c | 560 ++++++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 540 insertions(+), 20 deletions(-)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index 30958cc4469f..f45f82646bb7 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -16,6 +16,322 @@
+ #include <linux/types.h>
+ #include <linux/slab.h>
+
++/*
++ * The purpose of badblocks set/clear is to manage bad blocks ranges which are
++ * identified by LBA addresses.
++ *
++ * When the caller of badblocks_set() wants to set a range of bad blocks, the
++ * setting range can be acked or unacked. The setting range may merge with,
++ * overwrite, or skip the overlapped already set range, depending on how the
++ * ranges overlap or are adjacent, and on the acknowledgment type of the
++ * ranges. It can be more complicated when the setting range covers multiple
++ * already set bad block ranges, with the restrictions of maximum length of
++ * each bad range and the bad table space limitation.
++ *
++ * It is difficult and unnecessary to take care of all the possible situations.
++ * For setting a large range of bad blocks, we can handle it by dividing the
++ * large range into smaller ones when encountering overlap, max range length or
++ * bad table full conditions.
Every time only a smaller piece of the bad range ++ * is handled with a limited number of conditions how it is interacted with ++ * possible overlapped or adjacent already set bad block ranges. Then the hard ++ * complicated problem can be much simpler to handle in proper way. ++ * ++ * When setting a range of bad blocks to the bad table, the simplified situations ++ * to be considered are, (The already set bad blocks ranges are naming with ++ * prefix E, and the setting bad blocks range is naming with prefix S) ++ * ++ * 1) A setting range is not overlapped or adjacent to any other already set bad ++ * block range. ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ +-------------+ ++ * | E1 | | E2 | ++ * +-------------+ +-------------+ ++ * For this situation if the bad blocks table is not full, just allocate a ++ * free slot from the bad blocks table to mark the setting range S. The ++ * result is, ++ * +-------------+ +--------+ +-------------+ ++ * | E1 | | S | | E2 | ++ * +-------------+ +--------+ +-------------+ ++ * 2) A setting range starts exactly at a start LBA of an already set bad blocks ++ * range. ++ * 2.1) The setting range size < already set range size ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E. ++ * An extra slot from the bad blocks table will be allocated for S, and head ++ * of E will move to end of the inserted range S. The result is, ++ * +--------+----+ ++ * | S | E | ++ * +--------+----+ ++ * 2.2) The setting range size == already set range size ++ * 2.2.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of ++ bad blocks range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.3) The setting range size > already set range size ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For such situation, the setting range S can be treated as two parts, the ++ * first part (S1) is as same size as the already set range E, the second ++ * part (S2) is the rest of setting range. ++ * +-------------+-----+ +-------------+ +-----+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +-------------+-----+ ===> +-------------+ +-----+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now we only focus on how to handle the setting range S1 and already set ++ * range E, which are already explained in 2.2), for the rest S2 it will be ++ * handled later in next loop. ++ * 3) A setting range starts before the start LBA of an already set bad blocks ++ * range. 
++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation, the setting range S can be divided into two parts, the ++ * first (S1) ends at the start LBA of already set range E, the second part ++ * (S2) starts exactly at a start LBA of the already set range E. ++ * +----+---------+ +----+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +----+---------+ ===> +----+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now only the first part S1 should be handled in this loop, which is in ++ * similar condition as 1). The rest part S2 has exact same start LBA address ++ * of the already set range E, they will be handled in next loop in one of ++ * situations in 2). ++ * 4) A setting range starts after the start LBA of an already set bad blocks ++ * range. ++ * 4.1) If the setting range S exactly matches the tail part of already set bad ++ * blocks range E, like the following chart shows, ++ * +---------+ ++ * | S | ++ * +---------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 4.1.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may ++ * overwrite the overlapped range of E, the result is, ++ * +---+---------+ ++ * | E | S | ++ * +---+---------+ ++ * 4.2) If the setting range S stays in middle of an already set range E, like ++ * the following chart shows, ++ * +----+ ++ * | S | ++ * +----+ ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 4.2.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is also, ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will ++ * inserted into middle of E and split previous range E into twp parts (E1 ++ * and E2), the result is, ++ * +----+----+----+ ++ * | E1 | S | E2 | ++ * +----+----+----+ ++ * 4.3) If the setting bad blocks range S is overlapped with an already set bad ++ * blocks range E. The range S starts after the start LBA of range E, and ++ * ends after the end LBA of range E, as the following chart shows, ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation the range S can be divided into two parts, the first ++ * part (S1) ends at end range E, and the second part (S2) has rest range of ++ * origin S. ++ * +---------+---------+ +---------+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +---------+---------+ ===> +---------+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now in this loop the setting range S1 and already set range E can be ++ * handled as the situations 4), the rest range S2 will be handled in next ++ * loop and ignored in this loop. 
++ * 5) A setting bad blocks range S is adjacent to one or more already set bad ++ * blocks range(s), and they are all acked or unacked range. ++ * 5.1) Front merge: If the already set bad blocks range E is before setting ++ * range S and they are adjacent, ++ * +------+ ++ * | S | ++ * +------+ ++ * +-------+ ++ * | E | ++ * +-------+ ++ * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge ++ * values are same, the setting range S can front merges into range E. The ++ * result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting ++ * range S right after already set range E into the bad blocks table. The ++ * result is, ++ * +--------+------+ ++ * | E | S | ++ * +--------+------+ ++ * 6) Special cases which above conditions cannot handle ++ * 6.1) Multiple already set ranges may merge into less ones in a full bad table ++ * +-------------------------------------------------------+ ++ * | S | ++ * +-------------------------------------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+ +-----+ +-----+ ++ * | E1 | | E2 | | E3 | ++ * +-----+ +-----+ +-----+ ++ * In the above example, when the bad blocks table is full, inserting the ++ * first part of setting range S will fail because no more available slot ++ * can be allocated from bad blocks table. In this situation a proper ++ * setting method should be go though all the setting bad blocks range and ++ * look for chance to merge already set ranges into less ones. When there ++ * is available slot from bad blocks table, re-try again to handle more ++ * setting bad blocks ranges as many as possible. ++ * +------------------------+ ++ * | S3 | ++ * +------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+-----+-----+---+-----+--+ ++ * | S1 | S2 | ++ * +-----+-----+-----+---+-----+--+ ++ * The above chart shows although the first part (S3) cannot be inserted due ++ * to no-space in bad blocks table, but the following E1, E2 and E3 ranges ++ * can be merged with rest part of S into less range S1 and S2. Now there is ++ * 1 free slot in bad blocks table. ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * | S3 | S1 | S2 | ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * Since the bad blocks table is not full anymore, re-try again for the ++ * origin setting range S. Now the setting range S3 can be inserted into the ++ * bad blocks table with previous freed slot from multiple ranges merge. ++ * 6.2) Front merge after overwrite ++ * In the following example, in bad blocks table, E1 is an acked bad blocks ++ * range and E2 is an unacked bad blocks range, therefore they are not able ++ * to merge into a larger range. The setting bad blocks range S is acked, ++ * therefore part of E2 can be overwritten by S. ++ * +--------+ ++ * | S | acknowledged ++ * +--------+ S: 1 ++ * +-------+-------------+ E1: 1 ++ * | E1 | E2 | E2: 0 ++ * +-------+-------------+ ++ * With previous simplified routines, after overwriting part of E2 with S, ++ * the bad blocks table should be (E3 is remaining part of E2 which is not ++ * overwritten by S), ++ * acknowledged ++ * +-------+--------+----+ S: 1 ++ * | E1 | S | E3 | E1: 1 ++ * +-------+--------+----+ E3: 0 ++ * The above result is correct but not perfect. Range E1 and S in the bad ++ * blocks table are all acked, merging them into a larger one range may ++ * occupy less bad blocks table space and make badblocks_check() faster. 
++ * Therefore in such situation, after overwriting range S, the previous range ++ * E1 should be checked for possible front combination. Then the ideal ++ * result can be, ++ * +----------------+----+ acknowledged ++ * | E1 | E3 | E1: 1 ++ * +----------------+----+ E3: 0 ++ * 6.3) Behind merge: If the already set bad blocks range E is behind the setting ++ * range S and they are adjacent. Normally we don't need to care about this ++ * because front merge handles this while going though range S from head to ++ * tail, except for the tail part of range S. When the setting range S are ++ * fully handled, all the above simplified routine doesn't check whether the ++ * tail LBA of range S is adjacent to the next already set range and not able ++ * to them if they are mergeable. ++ * +------+ ++ * | S | ++ * +------+ ++ * +-------+ ++ * | E | ++ * +-------+ ++ * For the above special situation, when the setting range S are all handled ++ * and the loop ends, an extra check is necessary for whether next already ++ * set range E is right after S and mergeable. ++ * 6.2.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge ++ * values are same, the setting range S can behind merges into range E. The ++ * result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 6.2.2) Otherwise these two ranges cannot merge, just insert the setting range ++ * S in front of the already set range E in the bad blocks table. The result ++ * is, ++ * +------+-------+ ++ * | S | E | ++ * +------+-------+ ++ * ++ * All the above 5 simplified situations and 3 special cases may cover 99%+ of ++ * the bad block range setting conditions. Maybe there is some rare corner case ++ * is not considered and optimized, it won't hurt if badblocks_set() fails due ++ * to no space, or some ranges are not merged to save bad blocks table space. ++ * ++ * Inside badblocks_set() each loop starts by jumping to re_insert label, every ++ * time for the new loop prev_badblocks() is called to find an already set range ++ * which starts before or at current setting range. Since the setting bad blocks ++ * range is handled from head to tail, most of the cases it is unnecessary to do ++ * the binary search inside prev_badblocks(), it is possible to provide a hint ++ * to prev_badblocks() for a fast path, then the expensive binary search can be ++ * avoided. In my test with the hint to prev_badblocks(), except for the first ++ * loop, all rested calls to prev_badblocks() can go into the fast path and ++ * return correct bad blocks table index immediately. ++ */ ++ + /* + * Find the range starts at-or-before 's' from bad table. 
The search + * starts from index 'hint' and stops at index 'hint_end' from the bad +@@ -392,6 +708,230 @@ static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad + return len; + } + ++static void badblocks_update_acked(struct badblocks *bb) ++{ ++ bool unacked = false; ++ u64 *p = bb->page; ++ int i; ++ ++ if (!bb->unacked_exist) ++ return; ++ ++ for (i = 0; i < bb->count ; i++) { ++ if (!BB_ACK(p[i])) { ++ unacked = true; ++ break; ++ } ++ } ++ ++ if (!unacked) ++ bb->unacked_exist = 0; ++} ++ ++/* Do exact work to set bad block range into the bad block table */ ++static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, ++ int acknowledged) ++{ ++ int retried = 0, space_desired = 0; ++ int orig_len, len = 0, added = 0; ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ sector_t orig_start; ++ unsigned long flags; ++ int rv = 0; ++ u64 *p; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ /* round the start down, and the end up */ ++ sector_t next = s + sectors; ++ ++ rounddown(s, bb->shift); ++ roundup(next, bb->shift); ++ sectors = next - s; ++ } ++ ++ write_seqlock_irqsave(&bb->lock, flags); ++ ++ orig_start = s; ++ orig_len = sectors; ++ bad.ack = acknowledged; ++ p = bb->page; ++ ++re_insert: ++ bad.start = s; ++ bad.len = sectors; ++ len = 0; ++ ++ if (badblocks_empty(bb)) { ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start before all badblocks */ ++ if (prev < 0) { ++ if (!badblocks_full(bb)) { ++ /* insert on the first */ ++ if (bad.len > (BB_OFFSET(p[0]) - bad.start)) ++ bad.len = BB_OFFSET(p[0]) - bad.start; ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* No sapce, try to merge */ ++ if (overlap_behind(bb, &bad, 0)) { ++ if (can_merge_behind(bb, &bad, 0)) { ++ len = behind_merge(bb, &bad, 0); ++ added++; ++ } else { ++ len = min_t(sector_t, ++ BB_OFFSET(p[0]) - s, sectors); ++ space_desired = 1; ++ } ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* no table space and give up */ ++ goto out; ++ } ++ ++ /* in case p[prev-1] can be merged with p[prev] */ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ } else { ++ int extra = 0; ++ ++ if (!can_front_overwrite(bb, prev, &bad, &extra)) { ++ len = min_t(sector_t, ++ BB_END(p[prev]) - s, sectors); ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ len = front_overwrite(bb, prev, &bad, extra); ++ added++; ++ bb->count += extra; ++ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ } ++ } ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ /* if no space in table, still try to merge in the covered range */ ++ if (badblocks_full(bb)) { ++ /* skip the cannot-merge range */ ++ if (((prev + 1) < bb->count) && ++ overlap_behind(bb, &bad, prev + 1) && ++ ((s + sectors) >= BB_END(p[prev + 1]))) { ++ len = BB_END(p[prev + 1]) - s; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* no retry any more */ ++ 
len = sectors; ++ space_desired = 1; ++ hint = -1; ++ goto update_sectors; ++ } ++ ++ /* cannot merge and there is space in bad table */ ++ if ((prev + 1) < bb->count && ++ overlap_behind(bb, &bad, prev + 1)) ++ bad.len = min_t(sector_t, ++ bad.len, BB_OFFSET(p[prev + 1]) - bad.start); ++ ++ len = insert_at(bb, prev + 1, &bad); ++ bb->count++; ++ added++; ++ hint = prev + 1; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_insert; ++ ++ WARN_ON(sectors < 0); ++ ++ /* Check whether the following already set range can be merged */ ++ if ((prev + 1) < bb->count && ++ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && ++ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && ++ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), ++ BB_ACK(p[prev])); ++ ++ if ((prev + 2) < bb->count) ++ memmove(p + prev + 1, p + prev + 2, ++ (bb->count - (prev + 2)) * 8); ++ bb->count--; ++ } ++ ++ if (space_desired && !badblocks_full(bb)) { ++ s = orig_start; ++ sectors = orig_len; ++ space_desired = 0; ++ if (retried++ < 3) ++ goto re_insert; ++ } ++ ++out: ++ if (added) { ++ set_changed(bb); ++ ++ if (!acknowledged) ++ bb->unacked_exist = 1; ++ else ++ badblocks_update_acked(bb); ++ } ++ ++ write_sequnlock_irqrestore(&bb->lock, flags); ++ ++ if (!added) ++ rv = 1; ++ ++ return rv; ++} ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +@@ -501,26 +1041,6 @@ int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + } + EXPORT_SYMBOL_GPL(badblocks_check); + +-static void badblocks_update_acked(struct badblocks *bb) +-{ +- u64 *p = bb->page; +- int i; +- bool unacked = false; +- +- if (!bb->unacked_exist) +- return; +- +- for (i = 0; i < bb->count ; i++) { +- if (!BB_ACK(p[i])) { +- unacked = true; +- break; +- } +- } +- +- if (!unacked) +- bb->unacked_exist = 0; +-} +- + /** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v5/v5-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch b/for-test/badblocks/v5/v5-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch new file mode 100644 index 0000000..e3c38b8 --- /dev/null +++ b/for-test/badblocks/v5/v5-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch @@ -0,0 +1,399 @@ +From ea2a8ebd59b23e8c12febd3bcf5bebea24d63461 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 22:16:10 +0800 +Subject: [PATCH v5 4/6] badblocks: improve badblocks_clear() for multiple + ranges handling + +With the fundamental ideas and helper routines from badblocks_set() +improvement, clearing bad block for multiple ranges is much simpler. + +With a similar idea from badblocks_set() improvement, this patch +simplifies bad block range clearing into 5 situations. No matter how +complicated the clearing condition is, we just look at the head part +of clearing range with relative already set bad block range from the +bad block table. The rested part will be handled in next run of the +while-loop. + +Based on existing helpers added from badblocks_set(), this patch adds +two more helpers, +- front_clear() + Clear the bad block range from bad block table which is front + overlapped with the clearing range. 
+- front_splitting_clear() + Handle the condition that the clearing range hits middle of an + already set bad block range from bad block table. + +Similar as badblocks_set(), the first part of clearing range is handled +with relative bad block range which is find by prev_badblocks(). In most +cases a valid hint is provided to prev_badblocks() to avoid unnecessary +bad block table iteration. + +This patch also explains the detail algorithm code comments at beginning +of badblocks.c, including which five simplified situations are +categrized and how all the bad block range clearing conditions are +handled by these five situations. + +Again, in order to make the code review easier and avoid the code +changes mixed together, this patch does not modify badblock_clear() and +implement another routine called _badblock_clear() for the improvement. +Later patch will delete current code of badblock_clear() and make it as +a wrapper to _badblock_clear(), so the code change can be much clear for +review. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 325 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 325 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index f45f82646bb7..3e1bb593a2bb 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -330,6 +330,123 @@ + * avoided. In my test with the hint to prev_badblocks(), except for the first + * loop, all rested calls to prev_badblocks() can go into the fast path and + * return correct bad blocks table index immediately. ++ * ++ * ++ * Clearing a bad blocks range from the bad block table has similar idea as ++ * setting does, but much more simpler. The only thing needs to be noticed is ++ * when the clearing range hits middle of a bad block range, the existing bad ++ * block range will split into two, and one more item should be added into the ++ * bad block table. The simplified situations to be considered are, (The already ++ * set bad blocks ranges in bad block table are naming with prefix E, and the ++ * clearing bad blocks range is naming with prefix C) ++ * ++ * 1) A clearing range is not overlapped to any already set ranges in bad block ++ * table. ++ * +-----+ | +-----+ | +-----+ ++ * | C | | | C | | | C | ++ * +-----+ or +-----+ or +-----+ ++ * +---+ | +----+ +----+ | +---+ ++ * | E | | | E1 | | E2 | | | E | ++ * +---+ | +----+ +----+ | +---+ ++ * For the above situations, no bad block to be cleared and no failure ++ * happens, simply returns 0. ++ * 2) The clearing range hits middle of an already setting bad blocks range in ++ * the bad block table. ++ * +---+ ++ * | C | ++ * +---+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * In this situation if the bad block table is not full, the range E will be ++ * split into two ranges E1 and E2. The result is, ++ * +------+ +------+ ++ * | E1 | | E2 | ++ * +------+ +------+ ++ * 3) The clearing range starts exactly at same LBA as an already set bad block range ++ * from the bad block table. ++ * 3.1) Partially covered at head part ++ * +------------+ ++ * | C | ++ * +------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation, the overlapped already set range will update the ++ * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). 
No ++ * item deleted from bad block table. The result is, ++ * +----+ ++ * | E1 | ++ * +----+ ++ * 3.2) Exact fully covered ++ * +-----------------+ ++ * | C | ++ * +-----------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation the whole bad blocks range E will be cleared and its ++ * corresponded item is deleted from the bad block table. ++ * 4) The clearing range exactly ends at same LBA as an already set bad block ++ * range. ++ * +-------+ ++ * | C | ++ * +-------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For the above situation, the already set range E is updated to shrink its ++ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C). ++ * The result is, ++ * +---------+ ++ * | E | ++ * +---------+ ++ * 5) The clearing range is partially overlapped with an already set bad block ++ * range from the bad block table. ++ * 5.1) The already set bad block range is front overlapped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. The ++ * first part ends at the start LBA of range E, and the second part starts at ++ * same LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part C1 can be handled as condition 1), and the second part C2 can be ++ * handled as condition 3.1) in next loop. ++ * 5.2) The already set bad block range is behind overlaopped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. The ++ * first part C1 ends at same end LBA of range E, and the second part starts ++ * at end LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part clearing range C1 can be handled as condition 4), and ++ * the second part clearing range C2 can be handled as condition 1) in next ++ * loop. ++ * ++ * All bad blocks range clearing can be simplified into the above 5 situations ++ * by only handling the head part of the clearing range in each run of the ++ * while-loop. The idea is similar to bad blocks range setting but much ++ * simpler. + */ + + /* +@@ -932,6 +1049,214 @@ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, + return rv; + } + ++/* ++ * Clear the bad block range from bad block table which is front overlapped ++ * with the clearing range. The return value is how many sectors from an ++ * already set bad block range are cleared. If the whole bad block range is ++ * covered by the clearing range and fully cleared, 'delete' is set as 1 for ++ * the caller to reduce bb->count. 
++ */ ++static int front_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *deleted) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int cleared = 0; ++ ++ *deleted = 0; ++ if (s == BB_OFFSET(p[prev])) { ++ if (BB_LEN(p[prev]) > sectors) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors, ++ BB_LEN(p[prev]) - sectors, ++ BB_ACK(p[prev])); ++ cleared = sectors; ++ } else { ++ /* BB_LEN(p[prev]) <= sectors */ ++ cleared = BB_LEN(p[prev]); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ *deleted = 1; ++ } ++ } else if (s > BB_OFFSET(p[prev])) { ++ if (BB_END(p[prev]) <= (s + sectors)) { ++ cleared = BB_END(p[prev]) - s; ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ } else { ++ /* Splitting is handled in front_splitting_clear() */ ++ BUG(); ++ } ++ } ++ ++ return cleared; ++} ++ ++/* ++ * Handle the condition that the clearing range hits middle of an already set ++ * bad block range from bad block table. In this condition the existing bad ++ * block range is split into two after the middle part is cleared. ++ */ ++static int front_splitting_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ u64 end = BB_END(p[prev]); ++ int ack = BB_ACK(p[prev]); ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ ack); ++ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack); ++ return sectors; ++} ++ ++/* Do the exact work to clear bad block range from the bad block table */ ++static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) ++{ ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ int len = 0, cleared = 0; ++ int rv = 0; ++ u64 *p; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ sector_t target; ++ ++ /* When clearing we round the start up and the end down. ++ * This should not matter as the shift should align with ++ * the block size and no rounding should ever be needed. ++ * However it is better the think a block is bad when it ++ * isn't than to think a block is not bad when it is. 
++ */ ++ target = s + sectors; ++ roundup(s, bb->shift); ++ rounddown(target, bb->shift); ++ sectors = target - s; ++ } ++ ++ write_seqlock_irq(&bb->lock); ++ ++ bad.ack = true; ++ p = bb->page; ++ ++re_clear: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* Start before all badblocks */ ++ if (prev < 0) { ++ if (overlap_behind(bb, &bad, 0)) { ++ len = BB_OFFSET(p[0]) - s; ++ hint = prev; ++ } else { ++ len = sectors; ++ } ++ /* ++ * Both situations are to clear non-bad range, ++ * should be treated as successful ++ */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Clear will split a bad record but the table is full */ ++ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + sectors))) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if ((BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + bad.len))) { ++ /* Splitting */ ++ if ((bb->count + 1) < MAX_BADBLOCKS) { ++ len = front_splitting_clear(bb, prev, &bad); ++ bb->count += 1; ++ cleared++; ++ } else { ++ /* No space to split, give up */ ++ len = sectors; ++ } ++ } else { ++ int deleted = 0; ++ ++ len = front_clear(bb, prev, &bad, &deleted); ++ bb->count -= deleted; ++ cleared++; ++ hint = prev; ++ } ++ ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Not cover any badblocks range in the table */ ++ len = sectors; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_clear; ++ ++ WARN_ON(sectors < 0); ++ ++ if (cleared) { ++ badblocks_update_acked(bb); ++ set_changed(bb); ++ } ++ ++ write_sequnlock_irq(&bb->lock); ++ ++ if (!cleared) ++ rv = 1; ++ ++ return rv; ++} ++ ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v5/v5-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch b/for-test/badblocks/v5/v5-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch new file mode 100644 index 0000000..f7ba71a --- /dev/null +++ b/for-test/badblocks/v5/v5-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch @@ -0,0 +1,175 @@ +From 25e6c8d14293c3b45fcf239df7c88e05f1ee70bf Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 2 Dec 2021 16:13:35 +0800 +Subject: [PATCH v5 5/6] badblocks: improve badblocks_check() for multiple + ranges handling + +This patch rewrites badblocks_check() with similar coding style as +_badblocks_set() and _badblocks_clear(). The only difference is bad +blocks checking may handle multiple ranges in bad tables now. 
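Before walking through the multi-range example below, a minimal caller sketch may help; it shows how the return value, first_bad and bad_sectors of badblocks_check() are typically consumed. The function name, the passed-in parameters and the log messages are illustrative assumptions, not taken from the patch.

/*
 * Sketch only: typical consumption of the badblocks_check() result.
 * 0 means no known bad blocks intersect the checked range, 1 means only
 * acknowledged bad blocks intersect it, -1 means at least one
 * unacknowledged bad block intersects it.
 */
static void check_range_example(struct badblocks *bb, sector_t s, int nr_sectors)
{
	sector_t first_bad;
	int bad_sectors;

	switch (badblocks_check(bb, s, nr_sectors, &first_bad, &bad_sectors)) {
	case 0:
		/* no known bad blocks in [s, s + nr_sectors) */
		break;
	case 1:
		/* only acknowledged bad blocks overlap the range */
		pr_debug("acked bad range at %llu len %d\n",
			 (unsigned long long)first_bad, bad_sectors);
		break;
	case -1:
		/* at least one unacknowledged bad block overlaps the range */
		pr_warn("unacked bad range at %llu len %d\n",
			(unsigned long long)first_bad, bad_sectors);
		break;
	}
}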
+ +If a checking range covers multiple bad blocks range in bad block table, +like the following condition (C is the checking range, E1, E2, E3 are +three bad block ranges in bad block table), + +------------------------------------+ + | C | + +------------------------------------+ + +----+ +----+ +----+ + | E1 | | E2 | | E3 | + +----+ +----+ +----+ +The improved badblocks_check() algorithm will divide checking range C +into multiple parts, and handle them in 7 runs of a while-loop, + +--+ +----+ +----+ +----+ +----+ +----+ +----+ + |C1| | C2 | | C3 | | C4 | | C5 | | C6 | | C7 | + +--+ +----+ +----+ +----+ +----+ +----+ +----+ + +----+ +----+ +----+ + | E1 | | E2 | | E3 | + +----+ +----+ +----+ +And the start LBA and length of range E1 will be set as first_bad and +bad_sectors for the caller. + +The return value rule is consistent for multiple ranges. For example if +there are following bad block ranges in bad block table, + Index No. Start Len Ack + 0 400 20 1 + 1 500 50 1 + 2 650 20 0 +the return value, first_bad, bad_sectors by calling badblocks_set() with +different checking range can be the following values, + Checking Start, Len Return Value first_bad bad_sectors + 100, 100 0 N/A N/A + 100, 310 1 400 10 + 100, 440 1 400 10 + 100, 540 1 400 10 + 100, 600 -1 400 10 + 100, 800 -1 400 10 + +In order to make code review easier, this patch names the improved bad +block range checking routine as _badblocks_check() and does not change +existing badblock_check() code yet. Later patch will delete old code of +badblocks_check() and make it as a wrapper to call _badblocks_check(). +Then the new added code won't mess up with the old deleted code, it will +be more clear and easier for code review. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 97 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index 3e1bb593a2bb..bfade2434c74 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -1256,6 +1256,103 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) + return rv; + } + ++/* Do the exact work to check bad blocks range from the bad block table */ ++static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, ++ sector_t *first_bad, int *bad_sectors) ++{ ++ int unacked_badblocks, acked_badblocks; ++ int prev = -1, hint = -1, set = 0; ++ struct badblocks_context bad; ++ unsigned int seq; ++ int len, rv; ++ u64 *p; ++ ++ WARN_ON(bb->shift < 0 || sectors == 0); ++ ++ if (bb->shift > 0) { ++ sector_t target; ++ ++ /* round the start down, and the end up */ ++ target = s + sectors; ++ rounddown(s, bb->shift); ++ roundup(target, bb->shift); ++ sectors = target - s; ++ } ++ ++retry: ++ seq = read_seqbegin(&bb->lock); ++ ++ p = bb->page; ++ unacked_badblocks = 0; ++ acked_badblocks = 0; ++ ++re_check: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if (BB_ACK(p[prev])) ++ acked_badblocks++; ++ else ++ 
unacked_badblocks++; ++ ++ if (BB_END(p[prev]) >= (s + sectors)) ++ len = sectors; ++ else ++ len = BB_END(p[prev]) - s; ++ ++ if (set == 0) { ++ *first_bad = BB_OFFSET(p[prev]); ++ *bad_sectors = BB_LEN(p[prev]); ++ set = 1; ++ } ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* not cover any badblocks range in the table */ ++ len = sectors; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_check; ++ ++ WARN_ON(sectors < 0); ++ ++ if (unacked_badblocks > 0) ++ rv = -1; ++ else if (acked_badblocks > 0) ++ rv = 1; ++ else ++ rv = 0; ++ ++ if (read_seqretry(&bb->lock, seq)) ++ goto retry; ++ ++ return rv; ++} + + /** + * badblocks_check() - check a given range for bad sectors +-- +2.31.1 + diff --git a/for-test/badblocks/v5/v5-0006-badblocks-switch-to-the-improved-badblock-handlin.patch b/for-test/badblocks/v5/v5-0006-badblocks-switch-to-the-improved-badblock-handlin.patch new file mode 100644 index 0000000..837c7fe --- /dev/null +++ b/for-test/badblocks/v5/v5-0006-badblocks-switch-to-the-improved-badblock-handlin.patch @@ -0,0 +1,365 @@ +From d1f471dc0f862dfc71d3bbebc60631f83208217f Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 2 Mar 2021 10:48:43 +0800 +Subject: [PATCH v5 6/6] badblocks: switch to the improved badblock handling + code + +This patch removes old code of badblocks_set(), badblocks_clear() and +badblocks_check(), and make them as wrappers to call _badblocks_set(), +_badblocks_clear() and _badblocks_check(). + +By this change now the badblock handing switch to the improved algorithm +in _badblocks_set(), _badblocks_clear() and _badblocks_check(). + +This patch only contains the changes of old code deletion, new added +code for the improved algorithms are in previous patches. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 310 +--------------------------------------------- + 1 file changed, 3 insertions(+), 307 deletions(-) + +diff --git a/block/badblocks.c b/block/badblocks.c +index bfade2434c74..78f2af9295e6 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -1391,75 +1391,7 @@ static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, + int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) + { +- int hi; +- int lo; +- u64 *p = bb->page; +- int rv; +- sector_t target = s + sectors; +- unsigned seq; +- +- if (bb->shift > 0) { +- /* round the start down, and the end up */ +- s >>= bb->shift; +- target += (1<<bb->shift) - 1; +- target >>= bb->shift; +- sectors = target - s; +- } +- /* 'target' is now the first block after the bad range */ +- +-retry: +- seq = read_seqbegin(&bb->lock); +- lo = 0; +- rv = 0; +- hi = bb->count; +- +- /* Binary search between lo and hi for 'target' +- * i.e. for the last range that starts before 'target' +- */ +- /* INVARIANT: ranges before 'lo' and at-or-after 'hi' +- * are known not to be the last range before target. 
+- * VARIANT: hi-lo is the number of possible +- * ranges, and decreases until it reaches 1 +- */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a < target) +- /* This could still be the one, earlier ranges +- * could not. +- */ +- lo = mid; +- else +- /* This and later ranges are definitely out. */ +- hi = mid; +- } +- /* 'lo' might be the last that started before target, but 'hi' isn't */ +- if (hi > lo) { +- /* need to check all range that end after 's' to see if +- * any are unacknowledged. +- */ +- while (lo >= 0 && +- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { +- if (BB_OFFSET(p[lo]) < target) { +- /* starts before the end, and finishes after +- * the start, so they must overlap +- */ +- if (rv != -1 && BB_ACK(p[lo])) +- rv = 1; +- else +- rv = -1; +- *first_bad = BB_OFFSET(p[lo]); +- *bad_sectors = BB_LEN(p[lo]); +- } +- lo--; +- } +- } +- +- if (read_seqretry(&bb->lock, seq)) +- goto retry; +- +- return rv; ++ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); + } + EXPORT_SYMBOL_GPL(badblocks_check); + +@@ -1481,154 +1413,7 @@ EXPORT_SYMBOL_GPL(badblocks_check); + int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) + { +- u64 *p; +- int lo, hi; +- int rv = 0; +- unsigned long flags; +- +- if (bb->shift < 0) +- /* badblocks are disabled */ +- return 1; +- +- if (bb->shift) { +- /* round the start down, and the end up */ +- sector_t next = s + sectors; +- +- s >>= bb->shift; +- next += (1<<bb->shift) - 1; +- next >>= bb->shift; +- sectors = next - s; +- } +- +- write_seqlock_irqsave(&bb->lock, flags); +- +- p = bb->page; +- lo = 0; +- hi = bb->count; +- /* Find the last range that starts at-or-before 's' */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a <= s) +- lo = mid; +- else +- hi = mid; +- } +- if (hi > lo && BB_OFFSET(p[lo]) > s) +- hi = lo; +- +- if (hi > lo) { +- /* we found a range that might merge with the start +- * of our new range +- */ +- sector_t a = BB_OFFSET(p[lo]); +- sector_t e = a + BB_LEN(p[lo]); +- int ack = BB_ACK(p[lo]); +- +- if (e >= s) { +- /* Yes, we can merge with a previous range */ +- if (s == a && s + sectors >= e) +- /* new range covers old */ +- ack = acknowledged; +- else +- ack = ack && acknowledged; +- +- if (e < s + sectors) +- e = s + sectors; +- if (e - a <= BB_MAX_LEN) { +- p[lo] = BB_MAKE(a, e-a, ack); +- s = e; +- } else { +- /* does not all fit in one range, +- * make p[lo] maximal +- */ +- if (BB_LEN(p[lo]) != BB_MAX_LEN) +- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); +- s = a + BB_MAX_LEN; +- } +- sectors = e - s; +- } +- } +- if (sectors && hi < bb->count) { +- /* 'hi' points to the first range that starts after 's'. 
+- * Maybe we can merge with the start of that range +- */ +- sector_t a = BB_OFFSET(p[hi]); +- sector_t e = a + BB_LEN(p[hi]); +- int ack = BB_ACK(p[hi]); +- +- if (a <= s + sectors) { +- /* merging is possible */ +- if (e <= s + sectors) { +- /* full overlap */ +- e = s + sectors; +- ack = acknowledged; +- } else +- ack = ack && acknowledged; +- +- a = s; +- if (e - a <= BB_MAX_LEN) { +- p[hi] = BB_MAKE(a, e-a, ack); +- s = e; +- } else { +- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); +- s = a + BB_MAX_LEN; +- } +- sectors = e - s; +- lo = hi; +- hi++; +- } +- } +- if (sectors == 0 && hi < bb->count) { +- /* we might be able to combine lo and hi */ +- /* Note: 's' is at the end of 'lo' */ +- sector_t a = BB_OFFSET(p[hi]); +- int lolen = BB_LEN(p[lo]); +- int hilen = BB_LEN(p[hi]); +- int newlen = lolen + hilen - (s - a); +- +- if (s >= a && newlen < BB_MAX_LEN) { +- /* yes, we can combine them */ +- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); +- +- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); +- memmove(p + hi, p + hi + 1, +- (bb->count - hi - 1) * 8); +- bb->count--; +- } +- } +- while (sectors) { +- /* didn't merge (it all). +- * Need to add a range just before 'hi' +- */ +- if (bb->count >= MAX_BADBLOCKS) { +- /* No room for more */ +- rv = 1; +- break; +- } else { +- int this_sectors = sectors; +- +- memmove(p + hi + 1, p + hi, +- (bb->count - hi) * 8); +- bb->count++; +- +- if (this_sectors > BB_MAX_LEN) +- this_sectors = BB_MAX_LEN; +- p[hi] = BB_MAKE(s, this_sectors, acknowledged); +- sectors -= this_sectors; +- s += this_sectors; +- } +- } +- +- bb->changed = 1; +- if (!acknowledged) +- bb->unacked_exist = 1; +- else +- badblocks_update_acked(bb); +- write_sequnlock_irqrestore(&bb->lock, flags); +- +- return rv; ++ return _badblocks_set(bb, s, sectors, acknowledged); + } + EXPORT_SYMBOL_GPL(badblocks_set); + +@@ -1648,96 +1433,7 @@ EXPORT_SYMBOL_GPL(badblocks_set); + */ + int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) + { +- u64 *p; +- int lo, hi; +- sector_t target = s + sectors; +- int rv = 0; +- +- if (bb->shift > 0) { +- /* When clearing we round the start up and the end down. +- * This should not matter as the shift should align with +- * the block size and no rounding should ever be needed. +- * However it is better the think a block is bad when it +- * isn't than to think a block is not bad when it is. +- */ +- s += (1<<bb->shift) - 1; +- s >>= bb->shift; +- target >>= bb->shift; +- sectors = target - s; +- } +- +- write_seqlock_irq(&bb->lock); +- +- p = bb->page; +- lo = 0; +- hi = bb->count; +- /* Find the last range that starts before 'target' */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a < target) +- lo = mid; +- else +- hi = mid; +- } +- if (hi > lo) { +- /* p[lo] is the last range that could overlap the +- * current range. Earlier ranges could also overlap, +- * but only this one can overlap the end of the range. 
+- */ +- if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) && +- (BB_OFFSET(p[lo]) < target)) { +- /* Partial overlap, leave the tail of this range */ +- int ack = BB_ACK(p[lo]); +- sector_t a = BB_OFFSET(p[lo]); +- sector_t end = a + BB_LEN(p[lo]); +- +- if (a < s) { +- /* we need to split this range */ +- if (bb->count >= MAX_BADBLOCKS) { +- rv = -ENOSPC; +- goto out; +- } +- memmove(p+lo+1, p+lo, (bb->count - lo) * 8); +- bb->count++; +- p[lo] = BB_MAKE(a, s-a, ack); +- lo++; +- } +- p[lo] = BB_MAKE(target, end - target, ack); +- /* there is no longer an overlap */ +- hi = lo; +- lo--; +- } +- while (lo >= 0 && +- (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) && +- (BB_OFFSET(p[lo]) < target)) { +- /* This range does overlap */ +- if (BB_OFFSET(p[lo]) < s) { +- /* Keep the early parts of this range. */ +- int ack = BB_ACK(p[lo]); +- sector_t start = BB_OFFSET(p[lo]); +- +- p[lo] = BB_MAKE(start, s - start, ack); +- /* now low doesn't overlap, so.. */ +- break; +- } +- lo--; +- } +- /* 'lo' is strictly before, 'hi' is strictly after, +- * anything between needs to be discarded +- */ +- if (hi - lo > 1) { +- memmove(p+lo+1, p+hi, (bb->count - hi) * 8); +- bb->count -= (hi - lo - 1); +- } +- } +- +- badblocks_update_acked(bb); +- bb->changed = 1; +-out: +- write_sequnlock_irq(&bb->lock); +- return rv; ++ return _badblocks_clear(bb, s, sectors); + } + EXPORT_SYMBOL_GPL(badblocks_clear); + +-- +2.31.1 + diff --git a/for-test/badblocks/v5/v5-0007-test-user-space-code-to-test-badblocks-APIs.patch b/for-test/badblocks/v5/v5-0007-test-user-space-code-to-test-badblocks-APIs.patch new file mode 100644 index 0000000..790b136 --- /dev/null +++ b/for-test/badblocks/v5/v5-0007-test-user-space-code-to-test-badblocks-APIs.patch @@ -0,0 +1,2303 @@ +From 249fc077edbeacb388b7aea11f1f2ce4c0a242c5 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 10 Dec 2021 14:30:26 +0800 +Subject: [PATCH v5] test: user space code to test badblocks APIs + +This is the user space test code to verifiy badblocks API, not part of +kernel patch, don't review this patch. + +Except for badblocks_show(), the rested code logic for badblocks_set(), +badblocks_clear(), badblocks_check() are identical to the kernel code. + +The basic idea of the testing code follows the following steps, +1) Generate a random bad blocks range (start offset and length), for + random set or clear operation. See write_badblocks_file() for this. +2) Call badblocks_set() or badblocks_clear() APIs, and record the state + in a log file named with seq- prefix. See write_badblocks_log() for + this. +3) Write sectors into dummy disk file for the corresponding bad blocks + range. E.g. the unacknowledged bad blocks setting writes value 1, + the acknowledged bad blocks setting writes value 2, and the clear + setting writes value 0. See _write_diskfile() for this. +4) Compare all bad blocks ranges with the dummy disk file, if the sector + from the dummy disk file has unexpected value against the correspond- + ing bad block range, stop the loop of testing and ask people to do + manual verification from the seq-* log files. verify_badblocks_file() + does the verification. + +With this testing code, most of simple conditions are verified, only the +complicated situations require manual check. + +There are 3 parameters can be modified in this test code, +- MAX_BB_TEST_TRIES + How many times of the bad blocks set/clear and verification loop, the +loop may exit earlier if verify_badblocks_file() encounters unexpected +sector value and requires manual check. 
+- MAX_SET_SIZE + The max size of random badblocks set range. A larger range may fill +up all 512 badblock slots earlier. +- MAX_CLN_SIZE + The max size of random badblocks clear range. A larger range may +prevent all 512 badblock slots from being full filled. + +Of course the testing code is not perfect, this is the try-best effort +to verify simple conditions of bad blocks setting/clearing with random +generated ranges. For complicated situations, manual check by people are +still necessary. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Richard Fan <richard.fan@suse.com> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + Makefile | 4 + + badblocks.c | 2222 +++++++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 2226 insertions(+) + create mode 100644 Makefile + create mode 100644 badblocks.c + +diff --git a/Makefile b/Makefile +new file mode 100644 +index 0000000..2287363 +--- /dev/null ++++ b/Makefile +@@ -0,0 +1,4 @@ ++badblocks: badblocks.o ++ gcc -o badblocks -g3 -Wall badblocks.c ++clean: ++ rm -f badblocks badblocks.o +diff --git a/badblocks.c b/badblocks.c +new file mode 100644 +index 0000000..e5b2cd0 +--- /dev/null ++++ b/badblocks.c +@@ -0,0 +1,2222 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Bad block management ++ * ++ * - Heavily based on MD badblocks code from Neil Brown ++ * ++ * Copyright (c) 2015, Intel Corporation. ++ * ++ * Improvement for handling multiple ranges by Coly Li <colyli@suse.de> ++ */ ++ ++#define _GNU_SOURCE /* See feature_test_macros(7) */ ++#include <stdlib.h> ++#include <linux/types.h> ++#include <stdio.h> ++#include <errno.h> ++#include <string.h> ++#include <limits.h> ++#include <assert.h> ++#include <unistd.h> ++#include <sys/types.h> ++#include <sys/stat.h> ++#include <fcntl.h> ++ ++extern int errno; ++ ++#define PAGE_SIZE 4096 ++typedef unsigned long long sector_t; ++typedef unsigned long long u64; ++typedef _Bool bool; ++ ++#define BB_LEN_MASK (0x00000000000001FFULL) ++#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) ++#define BB_ACK_MASK (0x8000000000000000ULL) ++#define BB_MAX_LEN 512 ++#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) ++#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) ++#define BB_END(x) (BB_OFFSET(x) + BB_LEN(x)) ++#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) ++#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) ++ ++/* Bad block numbers are stored in a single page. ++ * 64bits is used for each block or extent. ++ * 54 bits are sector number, 9 bits are extent size, ++ * 1 bit is an 'acknowledged' flag. ++ */ ++#define MAX_BADBLOCKS (PAGE_SIZE/8) ++#define GFP_KERNEL 0 ++#define true 1 ++#define false 0 ++ ++#define WARN_ON(condition) ({ \ ++ if (!!(condition)) \ ++ printf("warning on %s:%d\n", __func__, __LINE__); \ ++}) ++ ++#define BUG() ({printf("BUG on %s:%d\n", __func__, __LINE__); exit(1);}) ++ ++struct device { ++ int val; ++}; ++ ++struct badblocks { ++ struct device *dev; ++ int count; /* count of bad blocks */ ++ int unacked_exist; /* there probably are unacknowledged ++ * bad blocks. 
This is only cleared ++ * when a read discovers none ++ */ ++ int shift; /* shift from sectors to block size ++ * a -ve shift means badblocks are ++ * disabled.*/ ++ u64 *page; /* badblock list */ ++ int changed; ++ unsigned long lock; ++ sector_t sector; ++ sector_t size; /* in sectors */ ++}; ++ ++struct badblocks_context { ++ sector_t start; ++ sector_t len; ++ sector_t orig_start; ++ sector_t orig_len; ++ int ack; ++ int first_prev; ++}; ++ ++int badblocks_check(struct badblocks *bb, sector_t s, int sectors, ++ sector_t *first_bad, int *bad_sectors); ++int badblocks_set(struct badblocks *bb, sector_t s, int sectors, ++ int acknowledged); ++int badblocks_clear(struct badblocks *bb, sector_t s, int sectors); ++void ack_all_badblocks(struct badblocks *bb); ++ssize_t badblocks_show(struct badblocks *bb, int unack); ++ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, ++ int unack); ++int badblocks_init(struct badblocks *bb, int enable); ++void badblocks_exit(struct badblocks *bb); ++ ++static inline void* kzalloc(int size, int flag) ++{ ++ void * p = malloc(size); ++ memset(p, 0, size); ++ return p; ++} ++ ++static inline void kfree(void* page) ++{ ++ free(page); ++} ++ ++#define roundup(x, y) ( \ ++{ \ ++ typeof(y) __y = y; \ ++ (((x) + (__y - 1)) / __y) * __y; \ ++} \ ++) ++ ++#define rounddown(x, y) ( \ ++{ \ ++ typeof(x) __x = (x); \ ++ __x - (__x % (y)); \ ++} \ ++) ++ ++#define fallthrough do{}while(0) ++ ++/** ++ * min - return minimum of two values of the same or compatible types ++ * @x: first value ++ * @y: second value ++ */ ++#define min(x, y) ((x) < (y) ? (x) : (y)) ++#define min_t(t, x, y) ((x) < (y) ? (x) : (y)) ++ ++#define write_seqlock_irqsave(_lock, _flags) ((_flags) = *(_lock)) ++#define write_sequnlock_irqrestore(_lock, _flags) ((*(_lock)) = (_flags)) ++#define write_seqlock_irq(lock) do{}while(0) ++#define write_sequnlock_irq(lock) do{}while(0) ++#define read_seqbegin(lock) 1 ++#define read_seqretry(lock, seq) (!!((seq) && 0)) ++#define seqlock_init(lock) do{}while(0) ++#define EXPORT_SYMBOL_GPL(sym) ++ ++static void *devm_kzalloc(struct device *dev, int size, int flags) ++{ ++ void * buf = malloc(size); ++ if (buf) ++ memset(buf, 0, size); ++ return buf; ++} ++ ++static void devm_kfree(struct device *dev, void *mem) ++{ ++ free(mem); ++} ++ ++static inline int badblocks_full(struct badblocks *bb) ++{ ++ return (bb->count >= MAX_BADBLOCKS); ++} ++ ++static inline int badblocks_empty(struct badblocks *bb) ++{ ++ return (bb->count == 0); ++} ++ ++static inline void set_changed(struct badblocks *bb) ++{ ++ if (bb->changed != 1) ++ bb->changed = 1; ++} ++ ++/* ++ * The purpose of badblocks set/clear is to manage bad blocks ranges which are ++ * identified by LBA addresses. ++ * ++ * When the caller of badblocks_set() wants to set a range of bad blocks, the ++ * setting range can be acked or unacked. And the setting range may merge, ++ * overwrite, skip the overlapped already set range, depends on who they are ++ * overlapped or adjacent, and the acknowledgment type of the ranges. It can be ++ * more complicated when the setting range covers multiple already set bad block ++ * ranges, with restrictions of maximum length of each bad range and the bad ++ * table space limitation. 
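As an aside (not part of the patch): the 64-bit entry layout described above is easy to check in isolation. The sketch below simply reuses the BB_* macros quoted earlier in this file to pack and decode one entry; the offset/length values are made up for illustration, and note that the 9-bit length field stores length-1, so 0..511 encodes 1..512 sectors.

/* Illustrative only: pack and decode a single bad-table entry. */
#include <stdio.h>

typedef unsigned long long u64;

#define BB_LEN_MASK	(0x00000000000001FFULL)
#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
#define BB_ACK_MASK	(0x8000000000000000ULL)
#define BB_MAX_LEN	512
#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
#define BB_END(x)	(BB_OFFSET(x) + BB_LEN(x))
#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))

int main(void)
{
	/* Example values: an acknowledged 512-sector bad range at sector 4096. */
	u64 e = BB_MAKE(4096ULL, BB_MAX_LEN, 1);

	/* Prints: offset=4096 len=512 end=4608 ack=1 */
	printf("offset=%llu len=%llu end=%llu ack=%d\n",
	       BB_OFFSET(e), (u64)BB_LEN(e), BB_END(e), BB_ACK(e));
	return 0;
}
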
++ * ++ * It is difficult and unnecessary to take care of all the possible situations, ++ * for setting a large range of bad blocks, we can handle it by dividing the ++ * large range into smaller ones when encounter overlap, max range length or ++ * bad table full conditions. Every time only a smaller piece of the bad range ++ * is handled with a limited number of conditions how it is interacted with ++ * possible overlapped or adjacent already set bad block ranges. Then the hard ++ * complicated problem can be much simpler to handle in proper way. ++ * ++ * When setting a range of bad blocks to the bad table, the simplified situations ++ * to be considered are, (The already set bad blocks ranges are naming with ++ * prefix E, and the setting bad blocks range is naming with prefix S) ++ * ++ * 1) A setting range is not overlapped or adjacent to any other already set bad ++ * block range. ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ +-------------+ ++ * | E1 | | E2 | ++ * +-------------+ +-------------+ ++ * For this situation if the bad blocks table is not full, just allocate a ++ * free slot from the bad blocks table to mark the setting range S. The ++ * result is, ++ * +-------------+ +--------+ +-------------+ ++ * | E1 | | S | | E2 | ++ * +-------------+ +--------+ +-------------+ ++ * 2) A setting range starts exactly at a start LBA of an already set bad blocks ++ * range. ++ * 2.1) The setting range size < already set range size ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E. ++ * An extra slot from the bad blocks table will be allocated for S, and head ++ * of E will move to end of the inserted range S. The result is, ++ * +--------+----+ ++ * | S | E | ++ * +--------+----+ ++ * 2.2) The setting range size == already set range size ++ * 2.2.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of ++ bad blocks range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.3) The setting range size > already set range size ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For such situation, the setting range S can be treated as two parts, the ++ * first part (S1) is as same size as the already set range E, the second ++ * part (S2) is the rest of setting range. 
++ * +-------------+-----+ +-------------+ +-----+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +-------------+-----+ ===> +-------------+ +-----+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now we only focus on how to handle the setting range S1 and already set ++ * range E, which are already explained in 2.2), for the rest S2 it will be ++ * handled later in next loop. ++ * 3) A setting range starts before the start LBA of an already set bad blocks ++ * range. ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation, the setting range S can be divided into two parts, the ++ * first (S1) ends at the start LBA of already set range E, the second part ++ * (S2) starts exactly at a start LBA of the already set range E. ++ * +----+---------+ +----+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +----+---------+ ===> +----+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now only the first part S1 should be handled in this loop, which is in ++ * similar condition as 1). The rest part S2 has exact same start LBA address ++ * of the already set range E, they will be handled in next loop in one of ++ * situations in 2). ++ * 4) A setting range starts after the start LBA of an already set bad blocks ++ * range. ++ * 4.1) If the setting range S exactly matches the tail part of already set bad ++ * blocks range E, like the following chart shows, ++ * +---------+ ++ * | S | ++ * +---------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 4.1.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may ++ * overwrite the overlapped range of E, the result is, ++ * +---+---------+ ++ * | E | S | ++ * +---+---------+ ++ * 4.2) If the setting range S stays in middle of an already set range E, like ++ * the following chart shows, ++ * +----+ ++ * | S | ++ * +----+ ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 4.2.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is also, ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will ++ * inserted into middle of E and split previous range E into twp parts (E1 ++ * and E2), the result is, ++ * +----+----+----+ ++ * | E1 | S | E2 | ++ * +----+----+----+ ++ * 4.3) If the setting bad blocks range S is overlapped with an already set bad ++ * blocks range E. The range S starts after the start LBA of range E, and ++ * ends after the end LBA of range E, as the following chart shows, ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation the range S can be divided into two parts, the first ++ * part (S1) ends at end range E, and the second part (S2) has rest range of ++ * origin S. 
++ * +---------+---------+ +---------+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +---------+---------+ ===> +---------+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now in this loop the setting range S1 and already set range E can be ++ * handled as the situations 4), the rest range S2 will be handled in next ++ * loop and ignored in this loop. ++ * 5) A setting bad blocks range S is adjacent to one or more already set bad ++ * blocks range(s), and they are all acked or unacked range. ++ * 5.1) Front merge: If the already set bad blocks range E is before setting ++ * range S and they are adjacent, ++ * +------+ ++ * | S | ++ * +------+ ++ * +-------+ ++ * | E | ++ * +-------+ ++ * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge ++ * values are same, the setting range S can front merges into range E. The ++ * result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting ++ * range S right after already set range E into the bad blocks table. The ++ * result is, ++ * +--------+------+ ++ * | E | S | ++ * +--------+------+ ++ * 6) Special cases which above conditions cannot handle ++ * 6.1) Multiple already set ranges may merge into less ones in a full bad table ++ * +-------------------------------------------------------+ ++ * | S | ++ * +-------------------------------------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+ +-----+ +-----+ ++ * | E1 | | E2 | | E3 | ++ * +-----+ +-----+ +-----+ ++ * In the above example, when the bad blocks table is full, inserting the ++ * first part of setting range S will fail because no more available slot ++ * can be allocated from bad blocks table. In this situation a proper ++ * setting method should be go though all the setting bad blocks range and ++ * look for chance to merge already set ranges into less ones. When there ++ * is available slot from bad blocks table, re-try again to handle more ++ * setting bad blocks ranges as many as possible. ++ * +------------------------+ ++ * | S3 | ++ * +------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+-----+-----+---+-----+--+ ++ * | S1 | S2 | ++ * +-----+-----+-----+---+-----+--+ ++ * The above chart shows although the first part (S3) cannot be inserted due ++ * to no-space in bad blocks table, but the following E1, E2 and E3 ranges ++ * can be merged with rest part of S into less range S1 and S2. Now there is ++ * 1 free slot in bad blocks table. ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * | S3 | S1 | S2 | ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * Since the bad blocks table is not full anymore, re-try again for the ++ * origin setting range S. Now the setting range S3 can be inserted into the ++ * bad blocks table with previous freed slot from multiple ranges merge. ++ * 6.2) Front merge after overwrite ++ * In the following example, in bad blocks table, E1 is an acked bad blocks ++ * range and E2 is an unacked bad blocks range, therefore they are not able ++ * to merge into a larger range. The setting bad blocks range S is acked, ++ * therefore part of E2 can be overwritten by S. 
++ * +--------+ ++ * | S | acknowledged ++ * +--------+ S: 1 ++ * +-------+-------------+ E1: 1 ++ * | E1 | E2 | E2: 0 ++ * +-------+-------------+ ++ * With previous simplified routines, after overwriting part of E2 with S, ++ * the bad blocks table should be (E3 is remaining part of E2 which is not ++ * overwritten by S), ++ * acknowledged ++ * +-------+--------+----+ S: 1 ++ * | E1 | S | E3 | E1: 1 ++ * +-------+--------+----+ E3: 0 ++ * The above result is correct but not perfect. Range E1 and S in the bad ++ * blocks table are all acked, merging them into a larger one range may ++ * occupy less bad blocks table space and make badblocks_check() faster. ++ * Therefore in such situation, after overwriting range S, the previous range ++ * E1 should be checked for possible front combination. Then the ideal ++ * result can be, ++ * +----------------+----+ acknowledged ++ * | E1 | E3 | E1: 1 ++ * +----------------+----+ E3: 0 ++ * 6.3) Behind merge: If the already set bad blocks range E is behind the setting ++ * range S and they are adjacent. Normally we don't need to care about this ++ * because front merge handles this while going though range S from head to ++ * tail, except for the tail part of range S. When the setting range S are ++ * fully handled, all the above simplified routine doesn't check whether the ++ * tail LBA of range S is adjacent to the next already set range and not able ++ * to them if they are mergeable. ++ * +------+ ++ * | S | ++ * +------+ ++ * +-------+ ++ * | E | ++ * +-------+ ++ * For the above special situation, when the setting range S are all handled ++ * and the loop ends, an extra check is necessary for whether next already ++ * set range E is right after S and mergeable. ++ * 6.2.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge ++ * values are same, the setting range S can behind merges into range E. The ++ * result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 6.2.2) Otherwise these two ranges cannot merge, just insert the setting range ++ * S in front of the already set range E in the bad blocks table. The result ++ * is, ++ * +------+-------+ ++ * | S | E | ++ * +------+-------+ ++ * ++ * All the above 5 simplified situations and 3 special cases may cover 99%+ of ++ * the bad block range setting conditions. Maybe there is some rare corner case ++ * is not considered and optimized, it won't hurt if badblocks_set() fails due ++ * to no space, or some ranges are not merged to save bad blocks table space. ++ * ++ * Inside badblocks_set() each loop starts by jumping to re_insert label, every ++ * time for the new loop prev_badblocks() is called to find an already set range ++ * which starts before or at current setting range. Since the setting bad blocks ++ * range is handled from head to tail, most of the cases it is unnecessary to do ++ * the binary search inside prev_badblocks(), it is possible to provide a hint ++ * to prev_badblocks() for a fast path, then the expensive binary search can be ++ * avoided. In my test with the hint to prev_badblocks(), except for the first ++ * loop, all rested calls to prev_badblocks() can go into the fast path and ++ * return correct bad blocks table index immediately. ++ * ++ * ++ * Clearing a bad blocks range from the bad block table has similar idea as ++ * setting does, but much more simpler. 
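As an aside (not part of the patch): the paragraphs above boil down to one structural idea shared by the setting and clearing paths, namely handle only the head piece of the remaining range in each pass, advance the cursor, and loop until nothing is left. The standalone sketch below shows just that loop shape; process_head_piece() and its piece sizing are placeholders chosen for illustration, not the patch's merge/insert/overwrite logic.

#include <stdio.h>

typedef unsigned long long sector_t;

#define BB_MAX_LEN 512

/* Placeholder for the per-piece work (merge, insert, overwrite or clear). */
static sector_t process_head_piece(sector_t start, sector_t len)
{
	sector_t handled = len > BB_MAX_LEN ? BB_MAX_LEN : len;

	printf("handled piece [%llu, %llu)\n", start, start + handled);
	return handled;
}

int main(void)
{
	sector_t s = 1000, sectors = 1300;

	/* Same shape as the re_insert/re_clear loops: consume the head
	 * piece, advance, repeat until the whole range is covered. */
	while (sectors > 0) {
		sector_t len = process_head_piece(s, sectors);

		s += len;
		sectors -= len;
	}
	return 0;
}
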
The only thing needs to be noticed is ++ * when the clearing range hits middle of a bad block range, the existing bad ++ * block range will split into two, and one more item should be added into the ++ * bad block table. The simplified situations to be considered are, (The already ++ * set bad blocks ranges in bad block table are naming with prefix E, and the ++ * clearing bad blocks range is naming with prefix C) ++ * ++ * 1) A clearing range is not overlapped to any already set ranges in bad block ++ * table. ++ * +-----+ | +-----+ | +-----+ ++ * | C | | | C | | | C | ++ * +-----+ or +-----+ or +-----+ ++ * +---+ | +----+ +----+ | +---+ ++ * | E | | | E1 | | E2 | | | E | ++ * +---+ | +----+ +----+ | +---+ ++ * For the above situations, no bad block to be cleared and no failure ++ * happens, simply returns 0. ++ * 2) The clearing range hits middle of an already setting bad blocks range in ++ * the bad block table. ++ * +---+ ++ * | C | ++ * +---+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * In this situation if the bad block table is not full, the range E will be ++ * split into two ranges E1 and E2. The result is, ++ * +------+ +------+ ++ * | E1 | | E2 | ++ * +------+ +------+ ++ * 3) The clearing range starts exactly at same LBA as an already set bad block range ++ * from the bad block table. ++ * 3.1) Partially covered at head part ++ * +------------+ ++ * | C | ++ * +------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation, the overlapped already set range will update the ++ * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). No ++ * item deleted from bad block table. The result is, ++ * +----+ ++ * | E1 | ++ * +----+ ++ * 3.2) Exact fully covered ++ * +-----------------+ ++ * | C | ++ * +-----------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation the whole bad blocks range E will be cleared and its ++ * corresponded item is deleted from the bad block table. ++ * 4) The clearing range exactly ends at same LBA as an already set bad block ++ * range. ++ * +-------+ ++ * | C | ++ * +-------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For the above situation, the already set range E is updated to shrink its ++ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C). ++ * The result is, ++ * +---------+ ++ * | E | ++ * +---------+ ++ * 5) The clearing range is partially overlapped with an already set bad block ++ * range from the bad block table. ++ * 5.1) The already set bad block range is front overlapped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. The ++ * first part ends at the start LBA of range E, and the second part starts at ++ * same LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part C1 can be handled as condition 1), and the second part C2 can be ++ * handled as condition 3.1) in next loop. ++ * 5.2) The already set bad block range is behind overlaopped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. 
The ++ * first part C1 ends at same end LBA of range E, and the second part starts ++ * at end LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part clearing range C1 can be handled as condition 4), and ++ * the second part clearing range C2 can be handled as condition 1) in next ++ * loop. ++ * ++ * All bad blocks range clearing can be simplified into the above 5 situations ++ * by only handling the head part of the clearing range in each run of the ++ * while-loop. The idea is similar to bad blocks range setting but much ++ * simpler. ++ */ ++ ++/* ++ * Find the range starts at-or-before 's' from bad table. The search ++ * starts from index 'hint' and stops at index 'hint_end' from the bad ++ * table. ++ */ ++static int prev_by_hint(struct badblocks *bb, sector_t s, int hint) ++{ ++ int hint_end = hint + 2; ++ u64 *p = bb->page; ++ int ret = -1; ++ ++ while ((hint < hint_end) && ((hint + 1) <= bb->count) && ++ (BB_OFFSET(p[hint]) <= s)) { ++ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) { ++ ret = hint; ++ break; ++ } ++ hint++; ++ } ++ ++ return ret; ++} ++ ++/* ++ * Find the range starts at-or-before bad->start. If 'hint' is provided ++ * (hint >= 0) then search in the bad table from hint firstly. It is ++ * very probably the wanted bad range can be found from the hint index, ++ * then the unnecessary while-loop iteration can be avoided. ++ */ ++static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad, ++ int hint) ++{ ++ sector_t s = bad->start; ++ int ret = -1; ++ int lo, hi; ++ u64 *p; ++ ++ if (!bb->count) ++ goto out; ++ ++ if (hint >= 0) { ++ ret = prev_by_hint(bb, s, hint); ++ if (ret >= 0) ++ goto out; ++ } ++ ++ lo = 0; ++ hi = bb->count; ++ p = bb->page; ++ ++ while (hi - lo > 1) { ++ int mid = (lo + hi)/2; ++ sector_t a = BB_OFFSET(p[mid]); ++ ++ if (a <= s) ++ lo = mid; ++ else ++ hi = mid; ++ } ++ ++ if (BB_OFFSET(p[lo]) <= s) ++ ret = lo; ++out: ++ return ret; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be backward merged ++ * with the bad range (from the bad table) index by 'behind'. ++ */ ++static bool can_merge_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ ++ if ((s <= BB_OFFSET(p[behind])) && ++ ((s + sectors) >= BB_OFFSET(p[behind])) && ++ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) && ++ BB_ACK(p[behind]) == bad->ack) ++ return true; ++ return false; ++} ++ ++/* ++ * Do backward merge for range indicated by 'bad' and the bad range ++ * (from the bad table) indexed by 'behind'. The return value is merged ++ * sectors from bad->len. 
++ */ ++static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_OFFSET(p[behind])); ++ WARN_ON((s + sectors) < BB_OFFSET(p[behind])); ++ ++ if (s < BB_OFFSET(p[behind])) { ++ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); ++ ++ merged = min_t(sector_t, sectors, BB_OFFSET(p[behind]) - s); ++ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack); ++ } else { ++ merged = min_t(sector_t, sectors, BB_LEN(p[behind])); ++ } ++ ++ WARN_ON(merged == 0); ++ ++ return merged; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be forward ++ * merged with the bad range (from the bad table) indexed by 'prev'. ++ */ ++static bool can_merge_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ ++ if (BB_ACK(p[prev]) == bad->ack && ++ (s < BB_END(p[prev]) || ++ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN)))) ++ return true; ++ return false; ++} ++ ++/* ++ * Do forward merge for range indicated by 'bad' and the bad range ++ * (from bad table) indexed by 'prev'. The return value is sectors ++ * merged from bad->len. ++ */ ++static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_END(p[prev])); ++ ++ if (s < BB_END(p[prev])) { ++ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s); ++ } else { ++ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev])); ++ if ((prev + 1) < bb->count && ++ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) { ++ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]); ++ } ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + merged, bad->ack); ++ } ++ ++ return merged; ++} ++ ++/* ++ * 'Combine' is a special case which can_merge_front() is not able to ++ * handle: If a bad range (indexed by 'prev' from bad table) exactly ++ * starts as bad->start, and the bad range ahead of 'prev' (indexed by ++ * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and ++ * the sum of their lengths does not exceed BB_MAX_LEN limitation, then ++ * these two bad range (from bad table) can be combined. ++ * ++ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad ++ * table can be combined. ++ */ ++static bool can_combine_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if ((prev > 0) && ++ (BB_OFFSET(p[prev]) == bad->start) && ++ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) && ++ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) && ++ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev]))) ++ return true; ++ return false; ++} ++ ++/* ++ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad ++ * table) into one larger bad range, and the new range is indexed by ++ * 'prev - 1'. ++ */ ++static void front_combine(struct badblocks *bb, int prev) ++{ ++ u64 *p = bb->page; ++ ++ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]), ++ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]), ++ BB_ACK(p[prev])); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8); ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly forward ++ * overlapped with the bad range (from bad table) indexed by 'front'. 
++ * Exactly forward overlap means the bad range (from bad table) indexed ++ * by 'prev' does not cover the whole range indicated by 'bad'. ++ */ ++static bool overlap_front(struct badblocks *bb, int front, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start >= BB_OFFSET(p[front]) && ++ bad->start < BB_END(p[front])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly backward ++ * overlapped with the bad range (from bad table) indexed by 'behind'. ++ */ ++static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start < BB_OFFSET(p[behind]) && ++ (bad->start + bad->len) > BB_OFFSET(p[behind])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can overwrite the bad ++ * range (from bad table) indexed by 'prev'. ++ * ++ * The range indicated by 'bad' can overwrite the bad range indexed by ++ * 'prev' when, ++ * 1) The whole range indicated by 'bad' can cover partial or whole bad ++ * range (from bad table) indexed by 'prev'. ++ * 2) The ack value of 'bad' is larger or equal to the ack value of bad ++ * range 'prev'. ++ * ++ * If the overwriting doesn't cover the whole bad range (from bad table) ++ * indexed by 'prev', new range might be split from existing bad range, ++ * 1) The overwrite covers head or tail part of existing bad range, 1 ++ * extra bad range will be split and added into the bad table. ++ * 2) The overwrite covers middle of existing bad range, 2 extra bad ++ * ranges will be split (ahead and after the overwritten range) and ++ * added into the bad table. ++ * The number of extra split ranges of the overwriting is stored in ++ * 'extra' and returned for the caller. ++ */ ++static bool can_front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *extra) ++{ ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(!overlap_front(bb, prev, bad)); ++ ++ if (BB_ACK(p[prev]) >= bad->ack) ++ return false; ++ ++ if (BB_END(p[prev]) <= (bad->start + bad->len)) { ++ len = BB_END(p[prev]) - bad->start; ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 0; ++ else ++ *extra = 1; ++ ++ bad->len = len; ++ } else { ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 1; ++ else ++ /* ++ * prev range will be split into two, beside the overwritten ++ * one, an extra slot needed from bad table. ++ */ ++ *extra = 2; ++ } ++ ++ if ((bb->count + (*extra)) >= MAX_BADBLOCKS) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Do the overwrite from the range indicated by 'bad' to the bad range ++ * (from bad table) indexed by 'prev'. ++ * The previously called can_front_overwrite() will provide how many ++ * extra bad range(s) might be split and added into the bad table. All ++ * the splitting cases in the bad table will be handled here. 
++ */ ++static int front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int extra) ++{ ++ u64 *p = bb->page; ++ sector_t orig_end = BB_END(p[prev]); ++ int orig_ack = BB_ACK(p[prev]); ++ ++ switch (extra) { ++ case 0: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]), ++ bad->ack); ++ break; ++ case 1: ++ if (BB_OFFSET(p[prev]) == bad->start) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->len, bad->ack); ++ memmove(p + prev + 2, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start + bad->len, ++ orig_end - BB_END(p[prev]), ++ orig_ack); ++ } else { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ /* ++ * prev +2 -> prev + 1 + 1, which is for, ++ * 1) prev + 1: the slot index of the previous one ++ * 2) + 1: one more slot for extra being 1. ++ */ ++ memmove(p + prev + 2, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ } ++ break; ++ case 2: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ /* ++ * prev + 3 -> prev + 1 + 2, which is for, ++ * 1) prev + 1: the slot index of the previous one ++ * 2) + 2: two more slots for extra being 2. ++ */ ++ memmove(p + prev + 3, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]), ++ orig_end - BB_END(p[prev + 1]), ++ BB_ACK(p[prev])); ++ break; ++ default: ++ break; ++ } ++ ++ return bad->len; ++} ++ ++/* ++ * Explicitly insert a range indicated by 'bad' to the bad table, where ++ * the location is indexed by 'at'. ++ */ ++static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(badblocks_full(bb)); ++ ++ len = min_t(sector_t, bad->len, BB_MAX_LEN); ++ if (at < bb->count) ++ memmove(p + at + 1, p + at, (bb->count - at) * 8); ++ p[at] = BB_MAKE(bad->start, len, bad->ack); ++ ++ return len; ++} ++ ++static void badblocks_update_acked(struct badblocks *bb) ++{ ++ bool unacked = false; ++ u64 *p = bb->page; ++ int i; ++ ++ if (!bb->unacked_exist) ++ return; ++ ++ for (i = 0; i < bb->count ; i++) { ++ if (!BB_ACK(p[i])) { ++ unacked = true; ++ break; ++ } ++ } ++ ++ if (!unacked) ++ bb->unacked_exist = 0; ++} ++ ++/* Do exact work to set bad block range into the bad block table */ ++static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, ++ int acknowledged) ++{ ++ int retried = 0, space_desired = 0; ++ int orig_len, len = 0, added = 0; ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ sector_t orig_start; ++ unsigned long flags; ++ int rv = 0; ++ u64 *p; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ /* round the start down, and the end up */ ++ sector_t next = s + sectors; ++ ++ rounddown(s, bb->shift); ++ roundup(next, bb->shift); ++ sectors = next - s; ++ } ++ ++ write_seqlock_irqsave(&bb->lock, flags); ++ ++ orig_start = s; ++ orig_len = sectors; ++ bad.ack = acknowledged; ++ p = bb->page; ++ ++re_insert: ++ bad.start = s; ++ bad.len = sectors; ++ len = 0; ++ ++ if (badblocks_empty(bb)) { ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start before all badblocks */ ++ if (prev < 0) { ++ if (!badblocks_full(bb)) { 
++ /* insert on the first */ ++ if (bad.len > (BB_OFFSET(p[0]) - bad.start)) ++ bad.len = BB_OFFSET(p[0]) - bad.start; ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* No sapce, try to merge */ ++ if (overlap_behind(bb, &bad, 0)) { ++ if (can_merge_behind(bb, &bad, 0)) { ++ len = behind_merge(bb, &bad, 0); ++ added++; ++ } else { ++ len = min_t(sector_t, ++ BB_OFFSET(p[0]) - s, sectors); ++ space_desired = 1; ++ } ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* no table space and give up */ ++ goto out; ++ } ++ ++ /* in case p[prev-1] can be merged with p[prev] */ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ } else { ++ int extra = 0; ++ ++ if (!can_front_overwrite(bb, prev, &bad, &extra)) { ++ len = min_t(sector_t, ++ BB_END(p[prev]) - s, sectors); ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ len = front_overwrite(bb, prev, &bad, extra); ++ added++; ++ bb->count += extra; ++ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ } ++ } ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ /* if no space in table, still try to merge in the covered range */ ++ if (badblocks_full(bb)) { ++ /* skip the cannot-merge range */ ++ if (((prev + 1) < bb->count) && ++ overlap_behind(bb, &bad, prev + 1) && ++ ((s + sectors) >= BB_END(p[prev + 1]))) { ++ len = BB_END(p[prev + 1]) - s; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* no retry any more */ ++ len = sectors; ++ space_desired = 1; ++ hint = -1; ++ goto update_sectors; ++ } ++ ++ /* cannot merge and there is space in bad table */ ++ if ((prev + 1) < bb->count && ++ overlap_behind(bb, &bad, prev + 1)) ++ bad.len = min_t(sector_t, ++ bad.len, BB_OFFSET(p[prev + 1]) - bad.start); ++ ++ len = insert_at(bb, prev + 1, &bad); ++ bb->count++; ++ added++; ++ hint = prev + 1; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_insert; ++ ++ WARN_ON(sectors < 0); ++ ++ /* Check whether the following already set range can be merged */ ++ if ((prev + 1) < bb->count && ++ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && ++ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && ++ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), ++ BB_ACK(p[prev])); ++ ++ if ((prev + 2) < bb->count) ++ memmove(p + prev + 1, p + prev + 2, ++ (bb->count - (prev + 2)) * 8); ++ bb->count--; ++ } ++ ++ if (space_desired && !badblocks_full(bb)) { ++ s = orig_start; ++ sectors = orig_len; ++ space_desired = 0; ++ if (retried++ < 3) ++ goto re_insert; ++ } ++ ++out: ++ if (added) { ++ set_changed(bb); ++ ++ if (!acknowledged) ++ bb->unacked_exist = 1; ++ else ++ badblocks_update_acked(bb); ++ } ++ ++ write_sequnlock_irqrestore(&bb->lock, flags); ++ ++ if (!added) ++ rv = 1; ++ ++ return rv; ++} ++ ++/* ++ * Clear the bad block range from bad block table which is front overlapped ++ * with the clearing range. The return value is how many sectors from an ++ * already set bad block range are cleared. 
If the whole bad block range is ++ * covered by the clearing range and fully cleared, 'delete' is set as 1 for ++ * the caller to reduce bb->count. ++ */ ++static int front_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *deleted) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int cleared = 0; ++ ++ *deleted = 0; ++ if (s == BB_OFFSET(p[prev])) { ++ if (BB_LEN(p[prev]) > sectors) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors, ++ BB_LEN(p[prev]) - sectors, ++ BB_ACK(p[prev])); ++ cleared = sectors; ++ } else { ++ /* BB_LEN(p[prev]) <= sectors */ ++ cleared = BB_LEN(p[prev]); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ *deleted = 1; ++ } ++ } else if (s > BB_OFFSET(p[prev])) { ++ if (BB_END(p[prev]) <= (s + sectors)) { ++ cleared = BB_END(p[prev]) - s; ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ } else { ++ /* Splitting is handled in front_splitting_clear() */ ++ BUG(); ++ } ++ } ++ ++ return cleared; ++} ++ ++/* ++ * Handle the condition that the clearing range hits middle of an already set ++ * bad block range from bad block table. In this condition the existing bad ++ * block range is split into two after the middle part is cleared. ++ */ ++static int front_splitting_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ u64 end = BB_END(p[prev]); ++ int ack = BB_ACK(p[prev]); ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ ack); ++ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack); ++ return sectors; ++} ++ ++/* Do the exact work to clear bad block range from the bad block table */ ++static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) ++{ ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ int len = 0, cleared = 0; ++ int rv = 0; ++ u64 *p; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ sector_t target; ++ ++ /* When clearing we round the start up and the end down. ++ * This should not matter as the shift should align with ++ * the block size and no rounding should ever be needed. ++ * However it is better the think a block is bad when it ++ * isn't than to think a block is not bad when it is. 
++ */ ++ target = s + sectors; ++ roundup(s, bb->shift); ++ rounddown(target, bb->shift); ++ sectors = target - s; ++ } ++ ++ write_seqlock_irq(&bb->lock); ++ ++ bad.ack = true; ++ p = bb->page; ++ ++re_clear: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* Start before all badblocks */ ++ if (prev < 0) { ++ if (overlap_behind(bb, &bad, 0)) { ++ len = BB_OFFSET(p[0]) - s; ++ hint = prev; ++ } else { ++ len = sectors; ++ } ++ /* ++ * Both situations are to clear non-bad range, ++ * should be treated as successful ++ */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Clear will split a bad record but the table is full */ ++ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + sectors))) { ++ len = sectors; ++ printf("Warn: no space to split for clear\n"); ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if ((BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + bad.len))) { ++ /* Splitting */ ++ if ((bb->count + 1) < MAX_BADBLOCKS) { ++ len = front_splitting_clear(bb, prev, &bad); ++ bb->count += 1; ++ cleared++; ++ } else { ++ /* No space to split, give up */ ++ printf("Warn: no space to split for clear\n"); ++ len = sectors; ++ } ++ } else { ++ int deleted = 0; ++ ++ len = front_clear(bb, prev, &bad, &deleted); ++ bb->count -= deleted; ++ cleared++; ++ hint = prev; ++ } ++ ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Not cover any badblocks range in the table */ ++ len = sectors; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_clear; ++ ++ WARN_ON(sectors < 0); ++ ++ if (cleared) { ++ badblocks_update_acked(bb); ++ set_changed(bb); ++ } ++ ++ write_sequnlock_irq(&bb->lock); ++ ++ if (!cleared) ++ rv = 1; ++ ++ return rv; ++} ++ ++/* Do the exact work to check bad blocks range from the bad block table */ ++static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, ++ sector_t *first_bad, int *bad_sectors) ++{ ++ int unacked_badblocks, acked_badblocks; ++ int prev = -1, hint = -1, set = 0; ++ struct badblocks_context bad; ++ unsigned int seq; ++ int len, rv; ++ u64 *p; ++ ++ WARN_ON(bb->shift < 0 || sectors == 0); ++ ++ if (bb->shift > 0) { ++ sector_t target; ++ ++ /* round the start down, and the end up */ ++ target = s + sectors; ++ rounddown(s, bb->shift); ++ roundup(target, bb->shift); ++ sectors = target - s; ++ } ++ ++retry: ++ seq = read_seqbegin(&bb->lock); ++ ++ p = bb->page; ++ unacked_badblocks = 0; ++ acked_badblocks = 0; ++ ++re_check: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, 
&bad)) { ++ if (BB_ACK(p[prev])) ++ acked_badblocks++; ++ else ++ unacked_badblocks++; ++ ++ if (BB_END(p[prev]) >= (s + sectors)) ++ len = sectors; ++ else ++ len = BB_END(p[prev]) - s; ++ ++ if (set == 0) { ++ *first_bad = BB_OFFSET(p[prev]); ++ *bad_sectors = BB_LEN(p[prev]); ++ set = 1; ++ } ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* not cover any badblocks range in the table */ ++ len = sectors; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_check; ++ ++ WARN_ON(sectors < 0); ++ ++ if (unacked_badblocks > 0) ++ rv = -1; ++ else if (acked_badblocks > 0) ++ rv = 1; ++ else ++ rv = 0; ++ ++ if (read_seqretry(&bb->lock, seq)) ++ goto retry; ++ ++ return rv; ++} ++ ++/** ++ * badblocks_check() - check a given range for bad sectors ++ * @bb: the badblocks structure that holds all badblock information ++ * @s: sector (start) at which to check for badblocks ++ * @sectors: number of sectors to check for badblocks ++ * @first_bad: pointer to store location of the first badblock ++ * @bad_sectors: pointer to store number of badblocks after @first_bad ++ * ++ * We can record which blocks on each device are 'bad' and so just ++ * fail those blocks, or that stripe, rather than the whole device. ++ * Entries in the bad-block table are 64bits wide. This comprises: ++ * Length of bad-range, in sectors: 0-511 for lengths 1-512 ++ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) ++ * A 'shift' can be set so that larger blocks are tracked and ++ * consequently larger devices can be covered. ++ * 'Acknowledged' flag - 1 bit. - the most significant bit. ++ * ++ * Locking of the bad-block table uses a seqlock so badblocks_check ++ * might need to retry if it is very unlucky. ++ * We will sometimes want to check for bad blocks in a bi_end_io function, ++ * so we use the write_seqlock_irq variant. ++ * ++ * When looking for a bad block we specify a range and want to ++ * know if any block in the range is bad. So we binary-search ++ * to the last range that starts at-or-before the given endpoint, ++ * (or "before the sector after the target range") ++ * then see if it ends after the given start. ++ * ++ * Return: ++ * 0: there are no known bad blocks in the range ++ * 1: there are known bad block which are all acknowledged ++ * -1: there are bad blocks which have not yet been acknowledged in metadata. ++ * plus the start/length of the first bad section we overlap. ++ */ ++int badblocks_check(struct badblocks *bb, sector_t s, int sectors, ++ sector_t *first_bad, int *bad_sectors) ++{ ++ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); ++} ++EXPORT_SYMBOL_GPL(badblocks_check); ++ ++/** ++ * badblocks_set() - Add a range of bad blocks to the table. ++ * @bb: the badblocks structure that holds all badblock information ++ * @s: first sector to mark as bad ++ * @sectors: number of sectors to mark as bad ++ * @acknowledged: weather to mark the bad sectors as acknowledged ++ * ++ * This might extend the table, or might contract it if two adjacent ranges ++ * can be merged. We binary-search to find the 'insertion' point, then ++ * decide how best to handle it. 
++ * ++ * Return: ++ * 0: success ++ * 1: failed to set badblocks (out of space) ++ */ ++int badblocks_set(struct badblocks *bb, sector_t s, int sectors, ++ int acknowledged) ++{ ++ return _badblocks_set(bb, s, sectors, acknowledged); ++} ++EXPORT_SYMBOL_GPL(badblocks_set); ++ ++/** ++ * badblocks_clear() - Remove a range of bad blocks to the table. ++ * @bb: the badblocks structure that holds all badblock information ++ * @s: first sector to mark as bad ++ * @sectors: number of sectors to mark as bad ++ * ++ * This may involve extending the table if we spilt a region, ++ * but it must not fail. So if the table becomes full, we just ++ * drop the remove request. ++ * ++ * Return: ++ * 0: success ++ * 1: failed to clear badblocks ++ */ ++int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) ++{ ++ return _badblocks_clear(bb, s, sectors); ++} ++EXPORT_SYMBOL_GPL(badblocks_clear); ++ ++/** ++ * ack_all_badblocks() - Acknowledge all bad blocks in a list. ++ * @bb: the badblocks structure that holds all badblock information ++ * ++ * This only succeeds if ->changed is clear. It is used by ++ * in-kernel metadata updates ++ */ ++void ack_all_badblocks(struct badblocks *bb) ++{ ++ if (bb->page == NULL || bb->changed) ++ /* no point even trying */ ++ return; ++ write_seqlock_irq(&bb->lock); ++ ++ if (bb->changed == 0 && bb->unacked_exist) { ++ u64 *p = bb->page; ++ int i; ++ ++ for (i = 0; i < bb->count ; i++) { ++ if (!BB_ACK(p[i])) { ++ sector_t start = BB_OFFSET(p[i]); ++ int len = BB_LEN(p[i]); ++ ++ p[i] = BB_MAKE(start, len, 1); ++ } ++ } ++ bb->unacked_exist = 0; ++ } ++ write_sequnlock_irq(&bb->lock); ++} ++EXPORT_SYMBOL_GPL(ack_all_badblocks); ++ ++/** ++ * badblocks_show() - sysfs access to bad-blocks list ++ * @bb: the badblocks structure that holds all badblock information ++ * @page: buffer received from sysfs ++ * @unack: weather to show unacknowledged badblocks ++ * ++ * Return: ++ * Length of returned data ++ */ ++ssize_t badblocks_show(struct badblocks *bb, int unack) ++{ ++ size_t len; ++ int i; ++ u64 *p = bb->page; ++ char * _page; ++ int size = 64*4096; ++ unsigned seq; ++ ++ if (bb->shift < 0) ++ return 0; ++ ++ _page = malloc(size); ++ if (!_page) { ++ printf("alloc _page failed\n"); ++ return 0; ++ } ++ memset(_page, 0, size); ++retry: ++ seq = read_seqbegin(&bb->lock); ++ ++ len = 0; ++ i = 0; ++ ++ while (len < size&& i < bb->count) { ++ sector_t s = BB_OFFSET(p[i]); ++ unsigned int length = BB_LEN(p[i]); ++ int ack = BB_ACK(p[i]); ++ ++ i++; ++ ++ if (unack && ack) ++ continue; ++ ++ len += snprintf(_page+len, size - len, "%llu %u\n", ++ (unsigned long long)s << bb->shift, ++ length << bb->shift); ++ } ++ if (unack && len == 0) ++ bb->unacked_exist = 0; ++ ++ printf("%s\n", _page); ++ free(_page); ++ ++ if (read_seqretry(&bb->lock, seq)) ++ goto retry; ++ ++ return len; ++} ++EXPORT_SYMBOL_GPL(badblocks_show); ++ ++/** ++ * badblocks_store() - sysfs access to bad-blocks list ++ * @bb: the badblocks structure that holds all badblock information ++ * @page: buffer received from sysfs ++ * @len: length of data received from sysfs ++ * @unack: weather to show unacknowledged badblocks ++ * ++ * Return: ++ * Length of the buffer processed or -ve error. 
++ */ ++ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, ++ int unack) ++{ ++ unsigned long long sector; ++ int length; ++ char newline; ++ ++ switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { ++ case 3: ++ if (newline != '\n') ++ return -EINVAL; ++ fallthrough; ++ case 2: ++ if (length <= 0) ++ return -EINVAL; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ if (badblocks_set(bb, sector, length, !unack)) ++ return -ENOSPC; ++ else ++ return len; ++} ++EXPORT_SYMBOL_GPL(badblocks_store); ++ ++static int __badblocks_init(struct device *dev, struct badblocks *bb, ++ int enable) ++{ ++ bb->dev = dev; ++ bb->count = 0; ++ if (enable) ++ bb->shift = 0; ++ else ++ bb->shift = -1; ++ if (dev) ++ bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL); ++ else ++ bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL); ++ if (!bb->page) { ++ bb->shift = -1; ++ return -ENOMEM; ++ } ++ seqlock_init(&bb->lock); ++ ++ return 0; ++} ++ ++/** ++ * badblocks_init() - initialize the badblocks structure ++ * @bb: the badblocks structure that holds all badblock information ++ * @enable: weather to enable badblocks accounting ++ * ++ * Return: ++ * 0: success ++ * -ve errno: on error ++ */ ++int badblocks_init(struct badblocks *bb, int enable) ++{ ++ return __badblocks_init(NULL, bb, enable); ++} ++EXPORT_SYMBOL_GPL(badblocks_init); ++ ++int devm_init_badblocks(struct device *dev, struct badblocks *bb) ++{ ++ if (!bb) ++ return -EINVAL; ++ return __badblocks_init(dev, bb, 1); ++} ++EXPORT_SYMBOL_GPL(devm_init_badblocks); ++ ++/** ++ * badblocks_exit() - free the badblocks structure ++ * @bb: the badblocks structure that holds all badblock information ++ */ ++void badblocks_exit(struct badblocks *bb) ++{ ++ if (!bb) ++ return; ++ if (bb->dev) ++ devm_kfree(bb->dev, bb->page); ++ else ++ kfree(bb->page); ++ bb->page = NULL; ++} ++EXPORT_SYMBOL_GPL(badblocks_exit); ++ ++ ++/* ++ * Test case related ++ */ ++char good_sector[512]; ++char bad_unack_sector[512]; ++char bad_acked_sector[512]; ++ ++#define BB_SET 0 ++#define BB_CLN 1 ++ ++unsigned rand_seed = 2; ++ ++char bb_ops[] = {0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1}; ++char bb_ack[] = {1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0}; ++ ++/* disk file lengh is 256MB */ ++#define DISKFILE_SECTORS ((256 << 20) >> 9) ++#define MAX_SET_SIZE (DISKFILE_SECTORS/256) ++#define MAX_CLN_SIZE (DISKFILE_SECTORS/1024) ++ ++#define BUF_LEN (8<<10) ++ ++void write_badblocks_log(struct badblocks *bb, char *dir, unsigned long seq, ++ sector_t bb_start, sector_t bb_len, ++ int ops, int ack) ++{ ++ char path[512]; ++ char buf[8192]; ++ u64 *p = bb->page; ++ int len, size, i; ++ int fd; ++ ++ ++ size = sizeof(buf); ++ memset(buf, 0, sizeof(buf)); ++ len = 0; ++ ++ len += snprintf(buf + len, size - len, "============ %lu ============\n\n", seq); ++ if (ops == BB_SET) ++ len += snprintf(buf + len, size - len, "set: start %llu, len %llu, ack %d\n", ++ bb_start, bb_len, ack); ++ else ++ len += snprintf(buf + len, size - len, "clear: start %llu, len %llu\n", ++ bb_start, bb_len); ++ ++ len += snprintf(buf + len, size - len, "=============================\n\n"); ++ ++ i = 0; ++ while (len < size && i < bb->count) { ++ sector_t s = BB_OFFSET(p[i]); ++ unsigned int length = BB_LEN(p[i]); ++ int ack = BB_ACK(p[i]); ++ ++ i++; ++ ++ len += snprintf(buf + len, size - len, "%llu %u [%u]\n", ++ (unsigned long long)s << bb->shift, ++ length << bb->shift, ++ ack); ++ } ++ ++ snprintf(path, 512, "%s/seq-%.8lu", dir ? 
dir : ".", seq); ++ unlink(path); ++ fd = open(path, O_CREAT|O_RDWR, 0644); ++ if (fd < 0) { ++ printf("fail to create file %s\n", path); ++ return; ++ } ++ write(fd, buf, len); ++ fsync(fd); ++ close(fd); ++} ++ ++ ++int verify_bad_sectors(sector_t start, sector_t len, int expected, int fd) ++{ ++ int ret = 0; ++ char buf[BUF_LEN]; ++ unsigned long offset = start << 9; ++ unsigned long unread = len << 9; ++ ++ if ((start + len) > DISKFILE_SECTORS) ++ printf("Error: invalid verify range: s %llu, l %llu\n, limit %u\n", ++ start, len, DISKFILE_SECTORS); ++ ++ while (unread > 0) { ++ unsigned long read_bytes = min(unread, BUF_LEN); ++ unsigned long i; ++ ssize_t _ret; ++ ++ memset(buf, 0, sizeof(buf)); ++ _ret = pread(fd, buf, read_bytes, offset); ++ if (_ret != read_bytes) { ++ printf("Error: to read %lu bytes, return %lu bytes\n", ++ read_bytes, _ret); ++ } ++ ++ for (i = 0; i < read_bytes; i++) { ++ if (buf[i] != expected) { ++ printf("Unexpected sector value %u (should be %u) at sector %lu" ++ " offset byte %lu\n", ++ buf[i], expected, (offset+i) >> 9, ++ (offset + i) % 512); ++ exit(1); ++ if (ret == 0) ++ ret = -EIO; ++ } ++ } ++ ++ if (ret) ++ goto out; ++ ++ unread -= read_bytes; ++ offset += read_bytes; ++ } ++ ++out: ++ return ret; ++} ++ ++int verify_badblocks_file(struct badblocks *bb, int fd, unsigned long seq) ++{ ++ int ret = 0; ++ sector_t size = DISKFILE_SECTORS; ++ u64 *p = bb->page; ++ int i = 0; ++ unsigned long prev_pos, pos; ++ ++ prev_pos = pos = 0; ++ while ((size > 0) && (i < bb->count)) { ++ sector_t s = BB_OFFSET(p[i]); ++ unsigned int length = BB_LEN(p[i]); ++ int ack = BB_ACK(p[i]); ++ ++ pos = s; ++ ++ /* verify non-bad area */ ++ if (pos > prev_pos) { ++ ret = verify_bad_sectors(prev_pos, pos - prev_pos, 0, fd); ++ if (ret < 0) { ++ printf("%s:%d fail to verify good sectors [%lu, %lu), error: %s\n", ++ __func__, __LINE__, prev_pos, pos, strerror(-ret)); ++ goto out; ++ } ++ ++ size -= (pos - prev_pos); ++ } ++ ++ /* verify bad area */ ++ ret = verify_bad_sectors(pos, length, ack ? 
2 : 1, fd); ++ if (ret < 0) { ++ printf("%s:%d fail to verify bad sectors [%lu, %u) ack %d, error: %s\n", ++ __func__, __LINE__, pos, length, ack, strerror(ret)); ++ goto out; ++ } ++ ++ size -= length; ++ i++; ++ prev_pos = pos + length; ++ } ++ ++ if (i < bb->count) { ++ printf("Error: total %d bad records, verified %d, left %d\n", ++ bb->count, i, bb->count - i); ++ if (size) ++ printf("Error: still have %llu sectors not verified\n", ++ size); ++ ret = -EIO; ++ goto out; ++ } ++ ++ /* verify rested non-bad area */ ++ if (size) { ++ pos = DISKFILE_SECTORS; ++ ret = verify_bad_sectors(prev_pos, pos - prev_pos, 0, fd); ++ if (ret < 0) { ++ printf("%s:%d fail to verify good sectors [%lu, %lu), error: %s\n", ++ __func__, __LINE__, prev_pos, pos, strerror(-ret)); ++ goto out; ++ } ++ } ++ ++ printf("verify badblocks file successfully (seq %lu)\n", seq); ++out: ++ return ret; ++} ++ ++ ++int _write_diskfile(int fd, int ops, ++ sector_t start, sector_t len, int ack) ++{ ++ off_t pos = start << 9; ++ char sector[512]; ++ ++ if ((start + len) > DISKFILE_SECTORS) ++ len = DISKFILE_SECTORS - start; ++ ++ if (len == 0) { ++ printf("Error: write diskfile zero-length at %llu len %llu\n", ++ start, len); ++ return -EINVAL; ++ } ++ ++ if (ops == BB_CLN) { ++ while (len > 0) { ++ pwrite(fd, good_sector, 512, pos); ++ pos += 512; ++ len--; ++ } ++ fsync(fd); ++ return 0; ++ } ++ ++ /* badblocks set */ ++ while (len > 0) { ++ pread(fd, sector, 512, pos); ++ if (!memcmp(sector, good_sector, 512)) { ++ if (ack) ++ pwrite(fd, bad_acked_sector, 512, pos); ++ else ++ pwrite(fd, bad_unack_sector, 512, pos); ++ ++// printf("write %d at sector %lu\n", ack ? 2 : 1, pos >> 9); ++ } else if (!memcmp(sector, bad_unack_sector, 512)) { ++ if (ack) { ++ pwrite(fd, bad_acked_sector, 512, pos); ++// printf("overwrite 2 at unack sector %lu\n", pos >> 9); ++ } else { ++// printf("avoid overwrite already unacked sector %lu\n", pos >> 9); ++ } ++ } else if (!memcmp(sector, bad_acked_sector, 512)) { ++// if (ack) ++// printf("avoid overwrite already acked sector %lu\n", pos >> 9); ++// else ++// printf("cannot overwrite acked sector %lu\n", pos >> 9); ++ } else { ++ printf("Error: unexpected sector at %lu\n", pos >> 9); ++ } ++ ++ pos += 512; ++ len--; ++ } ++ ++ fsync(fd); ++ return 0; ++} ++ ++sector_t fix_writing_length(struct badblocks*bb, int ops, sector_t bb_start, ++ sector_t bb_len, int ack) ++{ ++ sector_t orig_len = bb_len; ++ sector_t ret_len = 0; ++ int prev; ++ struct badblocks_context bad; ++ u64 *p = bb->page; ++ ++ bad.orig_start = bb_start; ++ bad.orig_len = bb_len; ++ bad.start = bb_start; ++ bad.len = bb_len; ++ bad.ack = ack; ++ ++ ++ if (ops == BB_SET) { ++ prev = prev_badblocks(bb, &bad, -1); ++ if (prev < 0) { ++ printf("Unexpected: the set range is not in badblocks table\n"); ++ exit(1); ++ } ++ ++ if (BB_OFFSET(p[prev]) > bb_start || ++ BB_END(p[prev]) <= bb_start || ++ BB_ACK(p[prev]) != ack) { ++ printf("Unexpected: fixing range is not in badblocks table\n"); ++ exit(1); ++ } ++ ++ while (bb_len > 0) { ++ int seg; ++ ++ if (BB_END(p[prev]) >= (bb_start + bb_len)) ++ seg = bb_len; ++ else ++ seg = BB_END(p[prev]) - bb_start; ++ ++ ret_len += seg; ++ bb_start += seg; ++ bb_len -= seg; ++ ++ if (bb_len == 0) ++ break; ++ ++ if ((prev + 1) >= bb->count || ++ BB_END(p[prev]) != BB_OFFSET(p[prev + 1]) || ++ BB_ACK(p[prev]) != BB_ACK(p[prev + 1])) ++ break; ++ prev++; ++ } ++ } else if (ops == BB_CLN) { ++ ret_len = bb_len; ++ ++ } ++ ++ printf("Fix writing bb_len from %llu to %llu\n", orig_len, ret_len); ++ 
return ret_len; ++} ++ ++int write_badblocks_file(struct badblocks *bb, unsigned long seq, int fd) ++{ ++ int ret; ++ sector_t bb_start, bb_len; ++ int ops, random; ++ ++retry: ++ random = rand_r(&rand_seed); ++ ops = bb_ops[random % sizeof(bb_ops)]; ++ random = rand_r(&rand_seed); ++ if (ops == BB_SET) ++ bb_len = random % MAX_SET_SIZE; ++ else ++ bb_len= random % MAX_CLN_SIZE; ++ random = rand_r(&rand_seed); ++ bb_start = random % DISKFILE_SECTORS; ++ if ((bb_start + bb_len) > DISKFILE_SECTORS) ++ bb_len = DISKFILE_SECTORS - bb_start; ++ if (bb_len == 0) { ++ printf("random bb_len is 0, re-generate\n"); ++ goto retry; ++ } ++ ++ ++ if (ops == BB_SET) { ++ int ack; ++ ++ random = rand_r(&rand_seed); ++ ack = bb_ack[random % sizeof(bb_ack)]; ++ ++ bb->changed = 0; ++ ret = badblocks_set(bb, bb_start, bb_len, ack); ++ write_badblocks_log(bb, NULL, seq, bb_start, bb_len, BB_SET, ack); ++ if (ret > 0) { ++ printf("NOTICE: no space or cannot overwwrite badblocks" ++ " for badblocks_set(s: %llu, l: %llu, a: %d).\n" ++ " Manual check might be necessary if\n" ++ " following verification failed.\n", ++ bb_start, bb_len, ack); ++ return 1; ++ } ++ ++ if (badblocks_full(bb) && bb->changed) ++ bb_len = fix_writing_length(bb, ops, bb_start, bb_len, ack); ++ ret = _write_diskfile(fd, ops, bb_start, bb_len, ack); ++ } else { ++ bb->changed = 0; ++ ret = badblocks_clear(bb, bb_start, bb_len); ++ write_badblocks_log(bb, NULL, seq, bb_start, bb_len, BB_CLN, -1); ++ if (ret > 0) { ++ printf("NOTICE: no space for badblocks_clear(s: %llu, l: %llu)\n" ++ " Manual check might be necessary if\n" ++ " following verification failed.\n", ++ bb_start, bb_len); ++ return 1; ++ } ++ ++ ret = _write_diskfile(fd, ops, bb_start, bb_len, -1); ++ } ++ ++ return ret; ++} ++ ++#define MAX_BB_TEST_TRIES (1<<20) ++int do_test(struct badblocks *bb) ++{ ++ int ret = 0; ++ unsigned long seq; ++ char diskfile_name[] = "./dummy_disk_file"; ++ int diskfile_fd = -1; ++ ++ srand(rand_seed); ++ ++ unlink(diskfile_name); ++ diskfile_fd = open(diskfile_name, O_CREAT|O_RDWR, 0644); ++ if (diskfile_fd < 0) { ++ printf("fail to create %s, error %s\n", ++ diskfile_name, strerror(errno)); ++ goto out; ++ } ++ ret = fallocate(diskfile_fd, FALLOC_FL_ZERO_RANGE, 0, DISKFILE_SECTORS << 9); ++ if (ret < 0) { ++ printf("fail to allocate zero-filled file, error %s\n", ++ strerror(errno)); ++ goto out; ++ } ++ ++ for (seq = 1; seq <= MAX_BB_TEST_TRIES; seq++) { ++ ret = write_badblocks_file(bb, seq, diskfile_fd); ++ if (ret < 0) { ++ printf("fail to generate bad blocks for seq %lu, error %s\n", ++ seq, strerror(-ret)); ++ goto out; ++ } ++ ret = verify_badblocks_file(bb, diskfile_fd, seq); ++ if (ret < 0) { ++ printf("fail to verify bad blocks for seq %lu, error %s\n", ++ seq, strerror(-ret)); ++ } ++ } ++ ++out: ++ if (diskfile_fd >= 0) ++ close(diskfile_fd); ++ return ret; ++} ++ ++int main(int argc, char *argv[]) ++{ ++ struct badblocks bblocks; ++ struct badblocks *bb = &bblocks; ++ int i; ++ ++ for (i = 0; i < 512; i++) { ++ good_sector[i] = 0; ++ bad_unack_sector[i] = 1; ++ bad_acked_sector[i] = 2; ++ } ++ ++ memset(bb, 0, sizeof(struct badblocks)); ++ badblocks_init(bb, 1); ++ ++ do_test(bb); ++ ++ badblocks_exit(bb); ++ return 0; ++} +-- +2.31.1 + diff --git a/for-test/jouranl-deadlock/0001-reserve-journal-space.patch b/for-test/jouranl-deadlock/0001-reserve-journal-space.patch new file mode 100644 index 0000000..81af639 --- /dev/null +++ b/for-test/jouranl-deadlock/0001-reserve-journal-space.patch @@ -0,0 +1,369 @@ +From 
120572550c913abcc1054912c8deb29c690ffe93 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 18 Apr 2022 21:55:37 +0800 +Subject: [PATCH 1/2] reserve journal space + +--- + drivers/md/bcache/journal.c | 220 +++++++++++++++++++++++++++++++++--- + drivers/md/bcache/journal.h | 10 ++ + 2 files changed, 214 insertions(+), 16 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 7c2ca52ca3e4..5aac20c71b80 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -166,6 +166,85 @@ reread: left = ca->sb.bucket_size - offset; + return ret; + } + ++static int bch_journal_key_reload(struct cache_set *c) ++{ ++ struct cache *ca = c->cache; ++ struct bkey *k = &c->journal.key; ++ struct journal_device *ja = &ca->journal; ++ struct bio *bio = &ja->bio; ++ struct jset *j, *data = c->journal.w[0].data; ++ unsigned int n = 0, offset = 0, used_blocks = 0; ++ unsigned int len, left; ++ sector_t bucket; ++ struct closure cl; ++ int ret = 0; ++ ++ /* load from the latest journal bucket */ ++ bucket = bucket_to_sector(c, ca->sb.d[ja->cur_idx]); ++ ++ closure_init_stack(&cl); ++ ++ while (offset < ca->sb.bucket_size) { ++reread: ++ left = ca->sb.bucket_size - offset; ++ len = min_t(unsigned int, ++ left, PAGE_SECTORS << JSET_BITS); ++ ++ bio_reset(bio, ca->bdev, REQ_OP_READ); ++ bio->bi_iter.bi_sector = bucket + offset; ++ bio->bi_iter.bi_size = len << 9; ++ ++ bio->bi_end_io = journal_read_endio; ++ bio->bi_private = &cl; ++ bch_bio_map(bio, data); ++ ++ closure_bio_submit(c, bio, &cl); ++ closure_sync(&cl); ++ ++ j = data; ++ while (len) { ++ size_t blocks, bytes = set_bytes(j); ++ ++ if (j->magic != jset_magic(&ca->sb)) ++ goto out; ++ ++ if (bytes > left << 9 || ++ bytes > PAGE_SIZE << JSET_BITS) { ++ pr_err("jset may be correpted: too big"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (bytes > len << 9) ++ goto reread; ++ ++ if (j->csum != csum_set(j)) { ++ pr_err("jset may be corrupted: bad csum"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ blocks = set_blocks(j, block_bytes(ca)); ++ used_blocks += blocks; ++ ++ offset += blocks * ca->sb.block_size; ++ len -= blocks * ca->sb.block_size; ++ j = ((void *) j) + blocks * block_bytes(ca); ++ } ++ } ++out: ++ c->journal.blocks_free = ++ (ca->sb.bucket_size >> c->block_bits) - used_blocks; ++ ++ k->ptr[n++] = MAKE_PTR(0, bucket, ca->sb.nr_this_dev); ++ ++ bkey_init(k); ++ SET_KEY_PTRS(k, n); ++ ++err: ++ return ret; ++} ++ + int bch_journal_read(struct cache_set *c, struct list_head *list) + { + #define read_bucket(b) \ +@@ -279,13 +358,23 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) + + } + ++ if (c->journal.blocks_free != 0) ++ pr_warn("Unexpected blocks_free %u before reload journal key.\n", ++ c->journal.blocks_free); ++ ++ ret = bch_journal_key_reload(c); ++ + out: + if (!list_empty(list)) + c->journal.seq = list_entry(list->prev, + struct journal_replay, + list)->j.seq; + +- return 0; ++ /* Initial value of c->journal.blocks_free should be 0 */ ++ BUG_ON(c->journal.blocks_free != 0); ++ ret = bch_journal_key_reload(c); ++ ++ return ret; + #undef read_bucket + } + +@@ -355,6 +444,9 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) + uint64_t start = i->j.last_seq, end = i->j.seq, n = start; + struct keylist keylist; + ++ /* Mark journal replay started */ ++ s->journal.in_replay = true; ++ + list_for_each_entry(i, list, list) { + BUG_ON(i->pin && atomic_read(i->pin) != 1); + +@@ -396,6 +488,9 @@ int bch_journal_replay(struct cache_set *s, 
struct list_head *list) + pr_info("journal replay done, %i keys in %i entries, seq %llu\n", + keys, entries, end); + err: ++ /* Mark journal replay finished */ ++ s->journal.in_replay = false; ++ + while (!list_empty(list)) { + i = list_first_entry(list, struct journal_replay, list); + list_del(&i->list); +@@ -621,6 +716,18 @@ static void do_journal_discard(struct cache *ca) + } + } + ++static inline bool last_writable_journal_bucket(struct cache_set *c) ++{ ++ struct cache *ca = c->cache; ++ struct journal_device *ja = &ca->journal; ++ ++ if (((ja->cur_idx + 1) % ca->sb.njournal_buckets) != ++ ja->last_idx) ++ return false; ++ ++ return true; ++} ++ + static void journal_reclaim(struct cache_set *c) + { + struct bkey *k = &c->journal.key; +@@ -629,6 +736,8 @@ static void journal_reclaim(struct cache_set *c) + unsigned int next; + struct journal_device *ja = &ca->journal; + atomic_t p __maybe_unused; ++ bool is_last_valid; ++ bool journal_wakeup = true; + + atomic_long_inc(&c->reclaim); + +@@ -646,13 +755,33 @@ static void journal_reclaim(struct cache_set *c) + + do_journal_discard(ca); + +- if (c->journal.blocks_free) ++ is_last_valid = last_writable_journal_bucket(c); ++ ++ /* ++ * This is not the last valid journal bucket, no need to worry ++ * about the reserved journal space. ++ */ ++ if (!is_last_valid && c->journal.blocks_free) ++ goto out; ++ ++ /* ++ * this is the last valid journal bucket, if the free space is ++ * larger than reserved sectors, no need to reclaim more journal ++ * space. Otherwise must try to reclaim one more journal bucket, ++ * to make sure there always are c->journal.reserved sectors ++ * reserved for initialization time usage. ++ */ ++ if (is_last_valid && ++ (c->journal.blocks_free * c->cache->sb.block_size) > ++ c->journal.reserved) + goto out; + + next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + /* No space available on this device */ +- if (next == ja->discard_idx) ++ if (next == ja->discard_idx) { ++ journal_wakeup = false; + goto out; ++ } + + ja->cur_idx = next; + k->ptr[0] = MAKE_PTR(0, +@@ -665,7 +794,7 @@ static void journal_reclaim(struct cache_set *c) + c->journal.blocks_free = ca->sb.bucket_size >> c->block_bits; + + out: +- if (!journal_full(&c->journal)) ++ if (journal_wakeup) + __closure_wake_up(&c->journal.wait); + } + +@@ -825,6 +954,60 @@ static void journal_try_write(struct cache_set *c) + } + } + ++static bool jset_space_available(struct cache_set *c, size_t sectors) ++{ ++ size_t n, reserved; ++ bool last_writable_bucket; ++ ++ n = min_t(size_t, ++ c->journal.blocks_free * c->cache->sb.block_size, ++ PAGE_SECTORS << JSET_BITS); ++ ++ last_writable_bucket = last_writable_journal_bucket(c); ++ ++ if (!last_writable_bucket || c->journal.in_replay) ++ reserved = 0; ++ else ++ reserved = c->journal.reserved; ++ ++ if (sectors <= (n - reserved)) ++ return true; ++ ++ return false; ++} ++ ++static bool journal_space_available(struct cache_set *c, ++ unsigned int nkeys) ++{ ++ /* ++ * XXX: If we were inserting so many keys that they ++ * won't fit in an _empty_ journal write, we'll ++ * deadlock. For now, handle this in ++ * bch_keylist_realloc() - but something to think about. 
++ */ ++ if ((nkeys * sizeof(uint64_t)) > ++ (block_bytes(c->cache) - sizeof(struct jset))) { ++ pr_err("The keys to insert is bigger than an empty journal write.\n"); ++ pr_err("keys in current journal write: %u, keys to insert: %u\n", ++ c->journal.cur->data->keys, nkeys); ++ BUG(); ++ } ++ ++ if (journal_full(&c->journal)) ++ return false; ++ ++ /* ++ * Before flushing current write (without the inserting keys) ++ * to get next empty write, it is still necessary to check ++ * whether there is enough free blocks in current journal bucket ++ * except for the reserved journal space. ++ */ ++ if (jset_space_available(c, 0)) ++ return true; ++ ++ return false; ++} ++ + static struct journal_write *journal_wait_for_write(struct cache_set *c, + unsigned int nkeys) + __acquires(&c->journal.lock) +@@ -844,28 +1027,27 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, + sectors = __set_blocks(w->data, w->data->keys + nkeys, + block_bytes(ca)) * ca->sb.block_size; + +- if (sectors <= min_t(size_t, +- c->journal.blocks_free * ca->sb.block_size, +- PAGE_SECTORS << JSET_BITS)) ++ if (jset_space_available(c, sectors)) + return w; + + if (wait) + closure_wait(&c->journal.wait, &cl); + +- if (!journal_full(&c->journal)) { +- if (wait) +- trace_bcache_journal_entry_full(c); +- ++ if (journal_space_available(c, nkeys)) { + /* +- * XXX: If we were inserting so many keys that they +- * won't fit in an _empty_ journal write, we'll +- * deadlock. For now, handle this in +- * bch_keylist_realloc() - but something to think about. ++ * Flush current non-empty write and try next ++ * empty one updated by journal_write_unlocked(). + */ +- BUG_ON(!w->data->keys); ++ if (wait) ++ trace_bcache_journal_entry_full(c); + + journal_try_write(c); /* unlocks */ + } else { ++ /* ++ * No space to flush current write, try to reclaim ++ * an empty journal bucket and do all things again ++ * in next loop. ++ */ + if (wait) + trace_bcache_journal_full(c); + +@@ -974,5 +1156,11 @@ int bch_journal_alloc(struct cache_set *c) + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS))) + return -ENOMEM; + ++ /* deside how many sectors reserved for jouranl replay */ ++ if (JOURANL_RESERVE < c->cache->sb.bucket_size) ++ j->reserved = JOURANL_RESERVE; ++ else ++ j->reserved = c->cache->sb.bucket_size; ++ + return 0; + } +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index f2ea34d5f431..bcaa4ce458ae 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -105,6 +105,7 @@ struct journal { + spinlock_t lock; + spinlock_t flush_write_lock; + bool btree_flushing; ++ + /* used when waiting because the journal was full */ + struct closure_waitlist wait; + struct closure io; +@@ -119,6 +120,8 @@ struct journal { + BKEY_PADDED(key); + + struct journal_write w[2], *cur; ++ bool in_replay; ++ int reserved; + }; + + /* +@@ -161,6 +164,13 @@ struct journal_device { + #define journal_pin_cmp(c, l, r) \ + (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) + ++/* ++ * Reserve 2 pages space in case journal space is full during ++ * initialization and btree node split happens in journal reply. ++ * If JOURANL_RESERVE > bucket_size, then only reserve 1 bucket. 
++ */ ++#define JOURANL_RESERVE (PAGE_SECTORS * 2) ++ + #define JOURNAL_PIN 20000 + + #define journal_full(j) \ +-- +2.34.1 + diff --git a/for-test/jouranl-deadlock/0002-more-fixes.patch b/for-test/jouranl-deadlock/0002-more-fixes.patch new file mode 100644 index 0000000..c51e16b --- /dev/null +++ b/for-test/jouranl-deadlock/0002-more-fixes.patch @@ -0,0 +1,131 @@ +From df1c455f2b0877ca7dbcec7fa06a0aca8ed825d8 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 21 Apr 2022 16:12:53 +0800 +Subject: [PATCH 2/2] more fixes + +--- + Makefile | 2 +- + drivers/md/bcache/journal.c | 15 ++++++++++----- + drivers/md/bcache/request.c | 2 +- + drivers/md/bcache/super.c | 2 ++ + drivers/md/bcache/util.c | 10 ++++++++-- + 5 files changed, 22 insertions(+), 9 deletions(-) + +diff --git a/Makefile b/Makefile +index 29e273d3f8cc..3abbd83b337c 100644 +--- a/Makefile ++++ b/Makefile +@@ -2,7 +2,7 @@ + VERSION = 5 + PATCHLEVEL = 18 + SUBLEVEL = 0 +-EXTRAVERSION = -rc2 ++EXTRAVERSION = -rc2-bcache-journal + NAME = Superb Owl + + # *DOCUMENTATION* +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 5aac20c71b80..916141c69ec8 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -370,9 +370,10 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) + struct journal_replay, + list)->j.seq; + +- /* Initial value of c->journal.blocks_free should be 0 */ +- BUG_ON(c->journal.blocks_free != 0); +- ret = bch_journal_key_reload(c); ++ if (c->journal.blocks_free == 0) { ++ pr_info("c->journal.blocks_free is 0, reload journal_key\n"); ++ ret = bch_journal_key_reload(c); ++ } + + return ret; + #undef read_bucket +@@ -900,12 +901,12 @@ static void journal_write_unlocked(struct closure *cl) + + bio_reset(bio, ca->bdev, REQ_OP_WRITE | + REQ_SYNC | REQ_META | REQ_PREFLUSH | REQ_FUA); +- bch_bio_map(bio, w->data); + bio->bi_iter.bi_sector = PTR_OFFSET(k, i); + bio->bi_iter.bi_size = sectors << 9; + + bio->bi_end_io = journal_write_endio; + bio->bi_private = w; ++ bch_bio_map(bio, w->data); + + trace_bcache_journal_write(bio, w->data->keys); + bio_list_add(&list, bio); +@@ -1002,9 +1003,12 @@ static bool journal_space_available(struct cache_set *c, + * whether there is enough free blocks in current journal bucket + * except for the reserved journal space. + */ +- if (jset_space_available(c, 0)) ++ if (jset_space_available(c, 0)) { ++ pr_info("there is available jset space\n"); + return true; ++ } + ++ pr_info("NO available jset space\n"); + return false; + } + +@@ -1027,6 +1031,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, + sectors = __set_blocks(w->data, w->data->keys + nkeys, + block_bytes(ca)) * ca->sb.block_size; + ++ pr_info("sectors from __set_blocks(): %lu\n", sectors); + if (jset_space_available(c, sectors)) + return w; + +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index fdd0194f84dd..320fcdfef48e 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -685,7 +685,7 @@ static void do_bio_hook(struct search *s, + { + struct bio *bio = &s->bio.bio; + +- bio_init_clone(bio->bi_bdev, bio, orig_bio, GFP_NOIO); ++ bio_init_clone(orig_bio->bi_bdev, bio, orig_bio, GFP_NOIO); + /* + * bi_end_io can be set separately somewhere else, e.g. 
the + * variants in, +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index bf3de149d3c9..efb9fae4354f 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1077,7 +1077,9 @@ int bch_cached_dev_run(struct cached_dev *dc) + closure_sync(&cl); + } + ++ pr_info("call add_disk(), d->disk: 0x%pK\n", d->disk); + ret = add_disk(d->disk); ++ pr_info("return from add_disk(): %d\n", ret); + if (ret) + goto out; + bd_link_disk_holder(dc->bdev, dc->disk.disk); +diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c +index ae380bc3992e..f3c8b7db43ef 100644 +--- a/drivers/md/bcache/util.c ++++ b/drivers/md/bcache/util.c +@@ -233,8 +233,14 @@ void bch_bio_map(struct bio *bio, void *base) + size_t size = bio->bi_iter.bi_size; + struct bio_vec *bv = bio->bi_io_vec; + +- BUG_ON(!bio->bi_iter.bi_size); +- BUG_ON(bio->bi_vcnt); ++ if (!bio->bi_iter.bi_size) { ++ pr_err("BUG: bio->bi_iter.bi_size is 0\n"); ++ BUG_ON(!bio->bi_iter.bi_size); ++ } ++ if (bio->bi_vcnt) { ++ pr_err("BUG: bio->bi_vcnt: %u\n", bio->bi_vcnt); ++ BUG_ON(bio->bi_vcnt); ++ } + + bv->bv_offset = base ? offset_in_page(base) : 0; + goto start; +-- +2.34.1 + diff --git a/for-test/jouranl-deadlock/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch b/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch index cfe5323..cfe5323 100644 --- a/for-test/jouranl-deadlock/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch +++ b/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch diff --git a/for-test/jouranl-deadlock/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch b/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch index 39b9873..39b9873 100644 --- a/for-test/jouranl-deadlock/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch +++ b/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch diff --git a/for-test/jouranl-deadlock/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch b/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch index 07050e9..07050e9 100644 --- a/for-test/jouranl-deadlock/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch +++ b/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch |
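The sscanf()-based parser in badblocks_store() above accepts "<sector> <length>" with an optional trailing newline and rejects everything else. A minimal user-space sketch of the same acceptance rules; parse_badblocks_input() is a name invented for this sketch, not part of the patch:

#include <stdio.h>

/* Mirrors the sscanf("%llu %d%c") switch in badblocks_store() above. */
static int parse_badblocks_input(const char *page)
{
	unsigned long long sector;
	int length;
	char newline;

	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
	case 3:
		if (newline != '\n')
			return -1;	/* anything but a trailing '\n' is rejected */
		/* fall through */
	case 2:
		if (length <= 0)
			return -1;	/* length must be a positive sector count */
		break;
	default:
		return -1;		/* need at least "<sector> <length>" */
	}
	printf("accepted: sector %llu, %d sectors\n", sector, length);
	return 0;
}

int main(void)
{
	parse_badblocks_input("4096 8\n");	/* accepted */
	parse_badblocks_input("4096 8");	/* accepted, newline is optional */
	parse_badblocks_input("4096 0\n");	/* rejected: non-positive length */
	parse_badblocks_input("4096 8 x");	/* rejected: trailing characters */
	return 0;
}

In the kernel path the accepted case then calls badblocks_set(bb, sector, length, !unack) and maps a full table to -ENOSPC, as the function above shows.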
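ack_all_badblocks() and badblocks_show() above walk bb->page as an array of u64 entries, each packing start sector, length and ack bit into one word. The sketch below reproduces that layout as defined in include/linux/badblocks.h (bit 63 = ack, bits 62..9 = start sector, bits 8..0 = length - 1, so one entry covers at most 512 sectors); the masks are quoted from memory, so treat them as an assumption of the sketch and check the header before relying on them. The demo shows what ack_all_badblocks() effectively does to a single unacknowledged entry:

#include <stdio.h>
#include <stdint.h>

#define BB_LEN_MASK	0x00000000000001FFULL
#define BB_OFFSET_MASK	0x7FFFFFFFFFFFFE00ULL
#define BB_ACK_MASK	0x8000000000000000ULL
#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) \
	(((uint64_t)(a) << 9) | ((uint64_t)(l) - 1) | ((uint64_t)(!!(ack)) << 63))

int main(void)
{
	/* an unacknowledged 8-sector bad range starting at sector 4096 */
	uint64_t e = BB_MAKE(4096, 8, 0);

	printf("start %llu, len %u, ack %d\n",
	       (unsigned long long)BB_OFFSET(e),
	       (unsigned int)BB_LEN(e), BB_ACK(e));

	/* ack_all_badblocks(): rewrite the entry with the same range, ack set */
	if (!BB_ACK(e))
		e = BB_MAKE(BB_OFFSET(e), BB_LEN(e), 1);

	printf("start %llu, len %u, ack %d\n",
	       (unsigned long long)BB_OFFSET(e),
	       (unsigned int)BB_LEN(e), BB_ACK(e));
	return 0;
}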
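The test harness above cross-checks every badblocks_set()/badblocks_clear() call against a 256 MB dummy disk file in which each sector is filled with one repeated byte: 0 for good sectors, 1 for unacknowledged bad sectors, 2 for acknowledged ones (see good_sector[], bad_unack_sector[] and bad_acked_sector[] in main()). A tiny restatement of that convention; expected_byte() is a helper invented for this sketch, while the patch open-codes the mapping in _write_diskfile() and in the verify_bad_sectors(pos, length, ack ? 2 : 1, fd) calls:

#include <assert.h>

/* byte value every sector of a region should hold, per the harness's rules */
static int expected_byte(int is_bad, int acked)
{
	if (!is_bad)
		return 0;		/* good_sector[] pattern */
	return acked ? 2 : 1;		/* bad_acked_sector[] : bad_unack_sector[] */
}

int main(void)
{
	assert(expected_byte(0, 0) == 0);
	assert(expected_byte(1, 0) == 1);
	assert(expected_byte(1, 1) == 2);
	return 0;
}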
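In 0001-reserve-journal-space.patch above, jset_space_available() carries the core idea: the usable space of the current journal bucket is capped at one jset (PAGE_SECTORS << JSET_BITS sectors), and the reserve is subtracted only when the write would land in the last still-writable journal bucket outside of journal replay, because replay is exactly what the reserve is kept for. Below is a standalone model of that decision with the cache-set fields turned into plain parameters; PAGE_SECTORS = 8 and JSET_BITS = 3 are assumptions of the sketch (4 KiB pages, the value used in drivers/md/bcache/journal.h), and the underflow guard is added here, not by the patch:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define PAGE_SECTORS	8	/* assumed: 4 KiB page, 512-byte sectors */
#define JSET_BITS	3	/* assumed: value from bcache journal.h */

/* Would a jset of 'sectors' sectors fit once the reserve is honoured? */
static bool jset_fits(size_t sectors, size_t blocks_free, size_t block_size,
		      size_t reserved, bool last_writable_bucket, bool in_replay)
{
	size_t n = blocks_free * block_size;

	/* a single journal write never exceeds one jset */
	if (n > (size_t)(PAGE_SECTORS << JSET_BITS))
		n = PAGE_SECTORS << JSET_BITS;

	/* the reserve only binds on the last writable bucket, and replay
	 * is allowed to consume it */
	if (!last_writable_bucket || in_replay)
		reserved = 0;

	if (n < reserved)	/* guard added in this sketch only */
		return false;

	return sectors <= n - reserved;
}

int main(void)
{
	/* 8 free blocks of 8 sectors = 64 sectors, 16-sector reserve,
	 * writing into the last writable journal bucket */
	printf("%d\n", jset_fits(40, 8, 8, 16, true, false));	/* 1: fits beside the reserve */
	printf("%d\n", jset_fits(56, 8, 8, 16, true, false));	/* 0: would eat into the reserve */
	printf("%d\n", jset_fits(56, 8, 8, 16, true, true));	/* 1: replay may use the reserve */
	return 0;
}

The same patch sizes the reserve in bch_journal_alloc(): JOURANL_RESERVE, i.e. two pages' worth of sectors, clamped down to the bucket size when buckets are smaller than that.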