author     Coly Li <colyli@suse.de>                 2022-05-22 00:50:52 +0800
committer  Coly Li <colyli@suse.de>                 2022-05-22 00:50:52 +0800
commit     41347a6d6406e1297ae11c7eb003c0b284a25720 (patch)
tree       8cb4c47e1ed9ed66babe5ebde0d684b4f0c92145
parent     995eb52153c879646c1dedb21ff4d2683aa4966d (diff)
update for-test and for-next
70 files changed, 18939 insertions, 16 deletions
diff --git a/for-next/0001-bcache-improve-multithreaded-bch_btree_check.patch b/for-next/0001-bcache-improve-multithreaded-bch_btree_check.patch new file mode 100644 index 0000000..9fb59df --- /dev/null +++ b/for-next/0001-bcache-improve-multithreaded-bch_btree_check.patch @@ -0,0 +1,140 @@ +From ead990f754571c9492943b437014abab6894955c Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 21 May 2022 13:08:58 +0800 +Subject: [PATCH 1/4] bcache: improve multithreaded bch_btree_check() + +Commit 8e7102273f59 ("bcache: make bch_btree_check() to be +multithreaded") makes bch_btree_check() to be much faster when checking +all btree nodes during cache device registration. But it isn't in ideal +shap yet, still can be improved. + +This patch does the following thing to improve current parallel btree +nodes check by multiple threads in bch_btree_check(), +- Add read lock to root node while checking all the btree nodes with + multiple threads. Although currently it is not mandatory but it is + good to have a read lock in code logic. +- Remove local variable 'char name[32]', and generate kernel thread name + string directly when calling kthread_run(). +- Allocate local variable "struct btree_check_state check_state" on the + stack and avoid unnecessary dynamic memory allocation for it. +- Increase check_state->started to count created kernel thread after it + succeeds to create. +- When wait for all checking kernel threads to finish, use wait_event() + to replace wait_event_interruptible(). + +With this change, the code is more clear, and some potential error +conditions are avoided. + +Fixes: 8e7102273f59 ("bcache: make bch_btree_check() to be multithreaded") +Signed-off-by: Coly Li <colyli@suse.de> +Cc: stable@vger.kernel.org +--- + drivers/md/bcache/btree.c | 58 ++++++++++++++++++--------------------- + 1 file changed, 26 insertions(+), 32 deletions(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index ad9f16689419..2362bb8ef6d1 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -2006,8 +2006,7 @@ int bch_btree_check(struct cache_set *c) + int i; + struct bkey *k = NULL; + struct btree_iter iter; +- struct btree_check_state *check_state; +- char name[32]; ++ struct btree_check_state check_state; + + /* check and mark root node keys */ + for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) +@@ -2018,63 +2017,58 @@ int bch_btree_check(struct cache_set *c) + if (c->root->level == 0) + return 0; + +- check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL); +- if (!check_state) +- return -ENOMEM; +- +- check_state->c = c; +- check_state->total_threads = bch_btree_chkthread_nr(); +- check_state->key_idx = 0; +- spin_lock_init(&check_state->idx_lock); +- atomic_set(&check_state->started, 0); +- atomic_set(&check_state->enough, 0); +- init_waitqueue_head(&check_state->wait); ++ check_state.c = c; ++ check_state.total_threads = bch_btree_chkthread_nr(); ++ check_state.key_idx = 0; ++ spin_lock_init(&check_state.idx_lock); ++ atomic_set(&check_state.started, 0); ++ atomic_set(&check_state.enough, 0); ++ init_waitqueue_head(&check_state.wait); + ++ rw_lock(0, c->root, c->root->level); + /* + * Run multiple threads to check btree nodes in parallel, +- * if check_state->enough is non-zero, it means current ++ * if check_state.enough is non-zero, it means current + * running check threads are enough, unncessary to create + * more. 
+ */ +- for (i = 0; i < check_state->total_threads; i++) { +- /* fetch latest check_state->enough earlier */ ++ for (i = 0; i < check_state.total_threads; i++) { ++ /* fetch latest check_state.enough earlier */ + smp_mb__before_atomic(); +- if (atomic_read(&check_state->enough)) ++ if (atomic_read(&check_state.enough)) + break; + +- check_state->infos[i].result = 0; +- check_state->infos[i].state = check_state; +- snprintf(name, sizeof(name), "bch_btrchk[%u]", i); +- atomic_inc(&check_state->started); ++ check_state.infos[i].result = 0; ++ check_state.infos[i].state = &check_state; + +- check_state->infos[i].thread = ++ check_state.infos[i].thread = + kthread_run(bch_btree_check_thread, +- &check_state->infos[i], +- name); +- if (IS_ERR(check_state->infos[i].thread)) { ++ &check_state.infos[i], ++ "bch_btrchk[%d]", i); ++ if (IS_ERR(check_state.infos[i].thread)) { + pr_err("fails to run thread bch_btrchk[%d]\n", i); + for (--i; i >= 0; i--) +- kthread_stop(check_state->infos[i].thread); ++ kthread_stop(check_state.infos[i].thread); + ret = -ENOMEM; + goto out; + } ++ atomic_inc(&check_state.started); + } + + /* + * Must wait for all threads to stop. + */ +- wait_event_interruptible(check_state->wait, +- atomic_read(&check_state->started) == 0); ++ wait_event(check_state.wait, atomic_read(&check_state.started) == 0); + +- for (i = 0; i < check_state->total_threads; i++) { +- if (check_state->infos[i].result) { +- ret = check_state->infos[i].result; ++ for (i = 0; i < check_state.total_threads; i++) { ++ if (check_state.infos[i].result) { ++ ret = check_state.infos[i].result; + goto out; + } + } + + out: +- kfree(check_state); ++ rw_unlock(0, c->root); + return ret; + } + +-- +2.35.3 + diff --git a/for-next/0002-bcache-improve-multithreaded-bch_sectors_dirty_init.patch b/for-next/0002-bcache-improve-multithreaded-bch_sectors_dirty_init.patch new file mode 100644 index 0000000..2a05768 --- /dev/null +++ b/for-next/0002-bcache-improve-multithreaded-bch_sectors_dirty_init.patch @@ -0,0 +1,132 @@ +From 7ff9ba24404e797a53fd44ae4c21b2234d46ca39 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 21 May 2022 14:14:17 +0800 +Subject: [PATCH 2/4] bcache: improve multithreaded bch_sectors_dirty_init() + +Commit b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be +multithreaded") makes bch_sectors_dirty_init() to be much faster +when counting dirty sectors by iterating all dirty keys in the btree. +But it isn't in ideal shape yet, still can be improved. + +This patch does the following changes to improve current parallel dirty +keys iteration on the btree, +- Add read lock to root node when multiple threads iterating the btree, + to prevent the root node gets split by I/Os from other registered + bcache devices. +- Remove local variable "char name[32]" and generate kernel thread name + string directly when calling kthread_run(). +- Allocate "struct bch_dirty_init_state state" directly on stack and + avoid the unnecessary dynamic memory allocation for it. +- Increase &state->started to count created kernel thread after it + succeeds to create. +- When wait for all dirty key counting threads to finish, use + wait_event() to replace wait_event_interruptible(). + +With the above changes, the code is more clear, and some potential error +conditions are avoided. 
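For reference, the first two patches converge on the same worker-thread life cycle. The sketch below shows that pattern in isolation; it is a minimal, hypothetical example rather than the bcache code itself, and demo_state, demo_worker and demo_run are made-up names. The coordinating state lives on the caller's stack, ->started is incremented only after kthread_run() has succeeded, the last worker to finish wakes the coordinator, and the coordinator sleeps in wait_event() rather than wait_event_interruptible(), since returning early would abandon on-stack state that the workers still reference.

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/err.h>

struct demo_state {
	atomic_t		started;	/* number of live workers */
	wait_queue_head_t	wait;		/* coordinator sleeps here */
};

static int demo_worker(void *arg)
{
	struct demo_state *s = arg;

	/* ... per-thread work goes here ... */

	/* The last worker out wakes the coordinator. */
	if (atomic_dec_and_test(&s->started))
		wake_up(&s->wait);
	return 0;
}

static int demo_run(int nr_threads)
{
	struct demo_state s;		/* on stack, no kzalloc()/kfree() */
	struct task_struct *t;
	int i;

	atomic_set(&s.started, 0);
	init_waitqueue_head(&s.wait);

	for (i = 0; i < nr_threads; i++) {
		t = kthread_run(demo_worker, &s, "demo_worker[%d]", i);
		if (IS_ERR(t))
			break;	/* already-created workers finish normally */
		/* Count the thread only after it was actually created. */
		atomic_inc(&s.started);
	}

	/* Uninterruptible: 's' must stay valid until every worker is done. */
	wait_event(s.wait, atomic_read(&s.started) == 0);
	return 0;
}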
+ +Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded") +Signed-off-by: Coly Li <colyli@suse.de> +Cc: stable@vger.kernel.org +--- + drivers/md/bcache/writeback.c | 62 ++++++++++++++--------------------- + 1 file changed, 25 insertions(+), 37 deletions(-) + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 9ee0005874cd..d24c09490f8e 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -948,10 +948,10 @@ void bch_sectors_dirty_init(struct bcache_device *d) + struct btree_iter iter; + struct sectors_dirty_init op; + struct cache_set *c = d->c; +- struct bch_dirty_init_state *state; +- char name[32]; ++ struct bch_dirty_init_state state; + + /* Just count root keys if no leaf node */ ++ rw_lock(0, c->root, c->root->level); + if (c->root->level == 0) { + bch_btree_op_init(&op.op, -1); + op.inode = d->id; +@@ -961,54 +961,42 @@ void bch_sectors_dirty_init(struct bcache_device *d) + for_each_key_filter(&c->root->keys, + k, &iter, bch_ptr_invalid) + sectors_dirty_init_fn(&op.op, c->root, k); ++ rw_unlock(0, c->root); + return; + } + +- state = kzalloc(sizeof(struct bch_dirty_init_state), GFP_KERNEL); +- if (!state) { +- pr_warn("sectors dirty init failed: cannot allocate memory\n"); +- return; +- } +- +- state->c = c; +- state->d = d; +- state->total_threads = bch_btre_dirty_init_thread_nr(); +- state->key_idx = 0; +- spin_lock_init(&state->idx_lock); +- atomic_set(&state->started, 0); +- atomic_set(&state->enough, 0); +- init_waitqueue_head(&state->wait); +- +- for (i = 0; i < state->total_threads; i++) { +- /* Fetch latest state->enough earlier */ ++ state.c = c; ++ state.d = d; ++ state.total_threads = bch_btre_dirty_init_thread_nr(); ++ state.key_idx = 0; ++ spin_lock_init(&state.idx_lock); ++ atomic_set(&state.started, 0); ++ atomic_set(&state.enough, 0); ++ init_waitqueue_head(&state.wait); ++ ++ for (i = 0; i < state.total_threads; i++) { ++ /* Fetch latest state.enough earlier */ + smp_mb__before_atomic(); +- if (atomic_read(&state->enough)) ++ if (atomic_read(&state.enough)) + break; + +- state->infos[i].state = state; +- atomic_inc(&state->started); +- snprintf(name, sizeof(name), "bch_dirty_init[%d]", i); +- +- state->infos[i].thread = +- kthread_run(bch_dirty_init_thread, +- &state->infos[i], +- name); +- if (IS_ERR(state->infos[i].thread)) { ++ state.infos[i].state = &state; ++ state.infos[i].thread = ++ kthread_run(bch_dirty_init_thread, &state.infos[i], ++ "bch_dirtcnt[%d]", i); ++ if (IS_ERR(state.infos[i].thread)) { + pr_err("fails to run thread bch_dirty_init[%d]\n", i); + for (--i; i >= 0; i--) +- kthread_stop(state->infos[i].thread); ++ kthread_stop(state.infos[i].thread); + goto out; + } ++ atomic_inc(&state.started); + } + +- /* +- * Must wait for all threads to stop. +- */ +- wait_event_interruptible(state->wait, +- atomic_read(&state->started) == 0); +- + out: +- kfree(state); ++ /* Must wait for all threads to stop. 
*/ ++ wait_event(state.wait, atomic_read(&state.started) == 0); ++ rw_unlock(0, c->root); + } + + void bch_cached_dev_writeback_init(struct cached_dev *dc) +-- +2.35.3 + diff --git a/for-next/0003-bcache-remove-incremental-dirty-sector-counting-for-.patch b/for-next/0003-bcache-remove-incremental-dirty-sector-counting-for-.patch new file mode 100644 index 0000000..b11b7d4 --- /dev/null +++ b/for-next/0003-bcache-remove-incremental-dirty-sector-counting-for-.patch @@ -0,0 +1,138 @@ +From 8ffcbccd25f7f3edd157e9e2aa78e9b158bebb9b Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 21 May 2022 14:46:03 +0800 +Subject: [PATCH 3/4] bcache: remove incremental dirty sector counting for + bch_sectors_dirty_init() + +After making bch_sectors_dirty_init() being multithreaded, the existing +incremental dirty sector counting in bch_root_node_dirty_init() doesn't +release btree occupation after iterating 500000 (INIT_KEYS_EACH_TIME) +bkeys. Because a read lock is added on btree root node to prevent the +btree to be split during the dirty sectors counting, other I/O requester +has no chance to gain the write lock even restart bcache_btree(). + +That is to say, the incremental dirty sectors counting is incompatible +to the multhreaded bch_sectors_dirty_init(). We have to choose one and +drop another one. + +In my testing, with 512 bytes random writes, I generate 1.2T dirty data +and a btree with 400K nodes. With single thread and incremental dirty +sectors counting, it takes 30+ minites to register the backing device. +And with multithreaded dirty sectors counting, the backing device +registration can be accomplished within 2 minutes. + +The 30+ minutes V.S. 2- minutes difference makes me decide to keep +multithreaded bch_sectors_dirty_init() and drop the incremental dirty +sectors counting. This is what this patch does. + +But INIT_KEYS_EACH_TIME is kept, in sectors_dirty_init_fn() the CPU +will be released by cond_resched() after every INIT_KEYS_EACH_TIME keys +iterated. This is to avoid the watchdog reports a bogus soft lockup +warning. 
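The scheduling behaviour the patch keeps is captured by the small sketch below. Only the every-500000-keys cond_resched() mirrors the patch; DEMO_KEYS_EACH_TIME and demo_count_keys() are illustrative names. Instead of aborting the btree walk with -EAGAIN and restarting it later, the walker stays in the tree and merely yields the CPU periodically, which is enough to keep the soft-lockup watchdog quiet.

#include <linux/sched.h>

#define DEMO_KEYS_EACH_TIME	500000

static void demo_count_keys(unsigned long nr_keys)
{
	unsigned long count = 0;

	while (count < nr_keys) {
		/* ... account one dirty key ... */
		count++;
		if (!(count % DEMO_KEYS_EACH_TIME))
			cond_resched();	/* yield the CPU now and then */
	}
}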
+ +Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded") +Signed-off-by: Coly Li <colyli@suse.de> +Cc: stable@vger.kernel.org +--- + drivers/md/bcache/writeback.c | 41 +++++++++++------------------------ + 1 file changed, 13 insertions(+), 28 deletions(-) + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index d24c09490f8e..75b71199800d 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -805,13 +805,11 @@ static int bch_writeback_thread(void *arg) + + /* Init */ + #define INIT_KEYS_EACH_TIME 500000 +-#define INIT_KEYS_SLEEP_MS 100 + + struct sectors_dirty_init { + struct btree_op op; + unsigned int inode; + size_t count; +- struct bkey start; + }; + + static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, +@@ -827,11 +825,8 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, + KEY_START(k), KEY_SIZE(k)); + + op->count++; +- if (atomic_read(&b->c->search_inflight) && +- !(op->count % INIT_KEYS_EACH_TIME)) { +- bkey_copy_key(&op->start, k); +- return -EAGAIN; +- } ++ if (!(op->count % INIT_KEYS_EACH_TIME)) ++ cond_resched(); + + return MAP_CONTINUE; + } +@@ -846,24 +841,16 @@ static int bch_root_node_dirty_init(struct cache_set *c, + bch_btree_op_init(&op.op, -1); + op.inode = d->id; + op.count = 0; +- op.start = KEY(op.inode, 0, 0); +- +- do { +- ret = bcache_btree(map_keys_recurse, +- k, +- c->root, +- &op.op, +- &op.start, +- sectors_dirty_init_fn, +- 0); +- if (ret == -EAGAIN) +- schedule_timeout_interruptible( +- msecs_to_jiffies(INIT_KEYS_SLEEP_MS)); +- else if (ret < 0) { +- pr_warn("sectors dirty init failed, ret=%d!\n", ret); +- break; +- } +- } while (ret == -EAGAIN); ++ ++ ret = bcache_btree(map_keys_recurse, ++ k, ++ c->root, ++ &op.op, ++ &KEY(op.inode, 0, 0), ++ sectors_dirty_init_fn, ++ 0); ++ if (ret < 0) ++ pr_warn("sectors dirty init failed, ret=%d!\n", ret); + + return ret; + } +@@ -907,7 +894,6 @@ static int bch_dirty_init_thread(void *arg) + goto out; + } + skip_nr--; +- cond_resched(); + } + + if (p) { +@@ -917,7 +903,6 @@ static int bch_dirty_init_thread(void *arg) + + p = NULL; + prev_idx = cur_idx; +- cond_resched(); + } + + out: +@@ -956,11 +941,11 @@ void bch_sectors_dirty_init(struct bcache_device *d) + bch_btree_op_init(&op.op, -1); + op.inode = d->id; + op.count = 0; +- op.start = KEY(op.inode, 0, 0); + + for_each_key_filter(&c->root->keys, + k, &iter, bch_ptr_invalid) + sectors_dirty_init_fn(&op.op, c->root, k); ++ + rw_unlock(0, c->root); + return; + } +-- +2.35.3 + diff --git a/for-next/0004-bcache-avoid-journal-no-space-deadlock-by-reserving-.patch b/for-next/0004-bcache-avoid-journal-no-space-deadlock-by-reserving-.patch new file mode 100644 index 0000000..aabe732 --- /dev/null +++ b/for-next/0004-bcache-avoid-journal-no-space-deadlock-by-reserving-.patch @@ -0,0 +1,148 @@ +From 27029e1e8f064bc8541308c807d3ee579d86811d Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 21 May 2022 22:55:46 +0800 +Subject: [PATCH 4/4] bcache: avoid journal no-space deadlock by reserving 1 + journal bucket + +The journal no-space deadlock was reported time to time. Such deadlock +can happen in the following situation. + +When all journal buckets are fully filled by active jset with heavy +write I/O load, the cache set registration (after a reboot) will load +all active jsets and inserting them into the btree again (which is +called journal replay). 
If a journaled bkey is inserted into a btree +node and results btree node split, new journal request might be +triggered. For example, the btree grows one more level after the node +split, then the root node record in cache device super block will be +upgrade by bch_journal_meta() from bch_btree_set_root(). But there is no +space in journal buckets, the journal replay has to wait for new journal +bucket to be reclaimed after at least one journal bucket replayed. This +is one example that how the journal no-space deadlock happens. + +The solution to avoid the deadlock is to reserve 1 journal bucket in +run time, and only permit the reserved journal bucket to be used during +cache set registration procedure for things like journal replay. Then +the journal space will never be fully filled, there is no chance for +journal no-space deadlock to happen anymore. + +This patch adds a new member "bool do_reserve" in struct journal, it is +inititalized to 0 (false) when struct journal is allocated, and set to +1 (true) by bch_journal_space_reserve() when all initialization done in +run_cache_set(). In the run time when journal_reclaim() tries to +allocate a new journal bucket, free_journal_buckets() is called to check +whether there are enough free journal buckets to use. If there is only +1 free journal bucket and journal->do_reserve is 1 (true), the last +bucket is reserved and free_journal_buckets() will return 0 to indicate +no free journal bucket. Then journal_reclaim() will give up, and try +next time to see whetheer there is free journal bucket to allocate. By +this method, there is always 1 jouranl bucket reserved in run time. + +During the cache set registration, journal->do_reserve is 0 (false), so +the reserved journal bucket can be used to avoid the no-space deadlock. 
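The arithmetic behind the reservation is small enough to show on its own. In the sketch below, free_slots() is an illustrative stand-in for free_journal_buckets(), pulled out of its cache_set context; the formula itself mirrors the patch. The journal buckets form a ring, so the function measures the distance from cur_idx forward to discard_idx, holds one slot back unconditionally, and holds a second slot back once do_reserve has been switched on by bch_journal_space_reserve(). For example, with 8 journal buckets and cur_idx == discard_idx the whole ring is counted as free (n = 8): the function returns 7 during registration, while do_reserve is still false, but only 6 at run time.

#include <linux/types.h>

static unsigned int free_slots(unsigned int njournal_buckets,
			       unsigned int cur_idx,
			       unsigned int discard_idx,
			       bool do_reserve)
{
	unsigned int n;

	/* Ring distance from cur_idx forward to discard_idx. */
	if (cur_idx >= discard_idx)
		n = njournal_buckets + discard_idx - cur_idx;
	else
		n = discard_idx - cur_idx;

	/* Hold back one slot, plus one more once do_reserve is set. */
	if (n > (1 + do_reserve))
		return n - (1 + do_reserve);

	return 0;
}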
+ +Reported-by: Nikhil Kshirsagar <nkshirsagar@gmail.com> +Signed-off-by: Coly Li <colyli@suse.de> +Cc: stable@vger.kernel.org +--- + drivers/md/bcache/journal.c | 31 ++++++++++++++++++++++++++----- + drivers/md/bcache/journal.h | 2 ++ + drivers/md/bcache/super.c | 1 + + 3 files changed, 29 insertions(+), 5 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index df5347ea450b..e5da469a4235 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -405,6 +405,11 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) + return ret; + } + ++void bch_journal_space_reserve(struct journal *j) ++{ ++ j->do_reserve = true; ++} ++ + /* Journalling */ + + static void btree_flush_write(struct cache_set *c) +@@ -621,12 +626,30 @@ static void do_journal_discard(struct cache *ca) + } + } + ++static unsigned int free_journal_buckets(struct cache_set *c) ++{ ++ struct journal *j = &c->journal; ++ struct cache *ca = c->cache; ++ struct journal_device *ja = &c->cache->journal; ++ unsigned int n; ++ ++ /* In case njournal_buckets is not power of 2 */ ++ if (ja->cur_idx >= ja->discard_idx) ++ n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx; ++ else ++ n = ja->discard_idx - ja->cur_idx; ++ ++ if (n > (1 + j->do_reserve)) ++ return n - (1 + j->do_reserve); ++ ++ return 0; ++} ++ + static void journal_reclaim(struct cache_set *c) + { + struct bkey *k = &c->journal.key; + struct cache *ca = c->cache; + uint64_t last_seq; +- unsigned int next; + struct journal_device *ja = &ca->journal; + atomic_t p __maybe_unused; + +@@ -649,12 +672,10 @@ static void journal_reclaim(struct cache_set *c) + if (c->journal.blocks_free) + goto out; + +- next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; +- /* No space available on this device */ +- if (next == ja->discard_idx) ++ if (!free_journal_buckets(c)) + goto out; + +- ja->cur_idx = next; ++ ja->cur_idx = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + k->ptr[0] = MAKE_PTR(0, + bucket_to_sector(c, ca->sb.d[ja->cur_idx]), + ca->sb.nr_this_dev); +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index f2ea34d5f431..cd316b4a1e95 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -105,6 +105,7 @@ struct journal { + spinlock_t lock; + spinlock_t flush_write_lock; + bool btree_flushing; ++ bool do_reserve; + /* used when waiting because the journal was full */ + struct closure_waitlist wait; + struct closure io; +@@ -182,5 +183,6 @@ int bch_journal_replay(struct cache_set *c, struct list_head *list); + + void bch_journal_free(struct cache_set *c); + int bch_journal_alloc(struct cache_set *c); ++void bch_journal_space_reserve(struct journal *j); + + #endif /* _BCACHE_JOURNAL_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index bf3de149d3c9..2bb55278d22d 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2128,6 +2128,7 @@ static int run_cache_set(struct cache_set *c) + + flash_devs_run(c); + ++ bch_journal_space_reserve(&c->journal); + set_bit(CACHE_SET_RUNNING, &c->flags); + return 0; + err: +-- +2.35.3 + diff --git a/for-next/nvmpg-bcache-btree/draft/0001-bcache-add-initial-data-structures-for-nvm-pages.patch b/for-next/nvmpg-bcache-btree/draft/0001-bcache-add-initial-data-structures-for-nvm-pages.patch new file mode 100644 index 0000000..fba652d --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0001-bcache-add-initial-data-structures-for-nvm-pages.patch @@ -0,0 +1,343 @@ +From 
d5ca176bc66727740baa4c80ba1349ba25dc95f7 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 26 Jul 2021 00:26:28 +0800 +Subject: [PATCH 01/13] bcache: add initial data structures for nvm pages + +This patch initializes the prototype data structures for nvm pages +allocator, + +- struct bch_nvmpg_sb + This is the super block allocated on each nvdimm namespace for the nvm +pages allocator. A nvdimm pages allocator set may have multiple name- +spaces, bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this +namespace belongs to. + +- struct bch_nvmpg_header + This is a table for all heads of all allocation record lists. An allo- +cation record list traces all page(s) allocated from nvdimm namespace(s) +to a specific requester (identified by uuid). After system reboot, a +requester can retrieve all previously allocated nvdimm pages from its +record list by a pre-defined uuid. + +- struct bch_nvmpg_head + This is a head of an allocation record list. Each nvdimm pages +requester (typically it's a driver) has and only has one allocation +record list, and an allocated nvdimm page only belongs to a specific +allocation record list. Member uuid[] will be set as the requester's +uuid, e.g. for bcache it is the cache set uuid. Member label is not +mandatory, it is a human-readable string for debug purpose. The nvm +offset format pointers recs_offset[] point to the location of actual +allocator record lists on each namespace of the nvdimm pages allocator +set. Each per namespace record list is represented by the following +struct bch_nvmpg_recs. + +- struct bch_nvmpg_recs + This structure represents a requester's allocation record list. Member +uuid is same value as the uuid of its corresponding struct +bch_nvmpg_head. Member recs[] is a table of struct bch_pgalloc_rec +objects to trace all allocated nvmdimm pages. If the table recs[] is +full, the nvmpg format offset is a pointer points to the next struct +bch_nvmpg_recs object, nvm pages allocator will look for available free +allocation record there. All the linked struct bch_nvmpg_recs objects +compose a requester's alloction record list which is headed by the above +struct bch_nvmpg_head. + +- struct bch_nvmpg_recs + This structure records a range of allocated nvdimm pages. Member pgoff +is offset in unit of page size of this allocation range. Member order +indicates size of the allocation range by (1 << order) in unit of page +size. Because the nvdimm pages allocator set may have multiple nvdimm +namespaces, member ns_id is used to identify which namespace the pgoff +belongs to. + - Bits 0 - 51: pgoff - is pages offset of the allocated pages. + - Bits 52 - 57: order - allocaed size in page_size * order-of-2 + - Bits 58 - 60: ns_id - identify which namespace the pages stays on + - Bits 61 - 63: reserved. +Since each of the allocated nvm pages are power of 2, using 6 bits to +represent allocated size can have (1<<(1<<64) - 1) * PAGE_SIZE maximum +value. It can be a 76 bits width range size in byte for 4KB page size, +which is large enough currently. + +All the structure members having _offset suffix are in a special fomat. +E.g. bch_nvmpg_sb.{sb_offset, pages_offset, set_header_offset}, +bch_nvmpg_head.recs_offset, bch_nvmpg_recs.{head_offset, next_offset}, +the offset value is 64bit, the most significant 3 bits are used to +identify which namespace this offset belongs to, and the rested 61 bits +are actual offset inside the namespace. 
Following patches will have +helper routines to do the conversion between memory pointer and offset. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Ying Huang <ying.huang@intel.com> +--- + drivers/md/bcache/nvmpg_format.h | 253 +++++++++++++++++++++++++++++++ + 1 file changed, 253 insertions(+) + create mode 100644 drivers/md/bcache/nvmpg_format.h + +diff --git a/drivers/md/bcache/nvmpg_format.h b/drivers/md/bcache/nvmpg_format.h +new file mode 100644 +index 000000000000..e9eb6371fd78 +--- /dev/null ++++ b/drivers/md/bcache/nvmpg_format.h +@@ -0,0 +1,253 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++ ++#ifndef _NVMPG_FORMAT_H ++#define _NVMPG_FORMAT_H ++ ++/* ++ * Bcache on NVDIMM data structures ++ */ ++ ++/* ++ * - struct bch_nvmpg_sb ++ * This is the super block allocated on each nvdimm namespace for the nvm ++ * pages allocator. A nvdimm pages allocator set may have multiple namespaces, ++ * bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this name space ++ * belongs to. ++ * ++ * - struct bch_nvmpg_header ++ * This is a table for all heads of all allocation record lists. An allo- ++ * cation record list traces all page(s) allocated from nvdimm namespace(s) to ++ * a specific requester (identified by uuid). After system reboot, a requester ++ * can retrieve all previously allocated nvdimm pages from its record list by a ++ * pre-defined uuid. ++ * ++ * - struct bch_nvmpg_head ++ * This is a head of an allocation record list. Each nvdimm pages requester ++ * (typically it's a driver) has and only has one allocation record list, and ++ * an allocated nvdimm page only bedlones to a specific allocation record list. ++ * Member uuid[] will be set as the requester's uuid, e.g. for bcache it is the ++ * cache set uuid. Member label is not mandatory, it is a human-readable string ++ * for debug purpose. The nvm offset format pointers recs_offset[] point to the ++ * location of actual allocator record lists on each name space of the nvdimm ++ * pages allocator set. Each per name space record list is represented by the ++ * following struct bch_nvmpg_recs. ++ * ++ * - struct bch_nvmpg_recs ++ * This structure represents a requester's allocation record list. Member uuid ++ * is same value as the uuid of its corresponding struct bch_nvmpg_head. Member ++ * recs[] is a table of struct bch_pgalloc_rec objects to trace all allocated ++ * nvmdimm pages. If the table recs[] is full, the nvmpg format offset is a ++ * pointer points to the next struct bch_nvmpg_recs object, nvm pages allocator ++ * will look for available free allocation record there. All the linked ++ * struct bch_nvmpg_recs objects compose a requester's alloction record list ++ * which is headed by the above struct bch_nvmpg_head. ++ * ++ * - struct bch_nvmpg_rec ++ * This structure records a range of allocated nvdimm pages. Member pgoff is ++ * offset in unit of page size of this allocation range. Member order indicates ++ * size of the allocation range by (1 << order) in unit of page size. Because ++ * the nvdimm pages allocator set may have multiple nvdimm name spaces, member ++ * ns_id is used to identify which name space the pgoff belongs to. ++ * ++ * All allocation record lists are stored on the first initialized nvdimm name- ++ * space (ns_id 0). 
The meta data default layout of nvm pages allocator on ++ * namespace 0 is, ++ * ++ * 0 +---------------------------------+ ++ * | | ++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET ++ * | bch_nvmpg_sb | ++ * 8KB +---------------------------------+ <-- BCH_NVMPG_RECLIST_HEAD_OFFSET ++ * | bch_nvmpg_header | ++ * | | ++ * 16KB +---------------------------------+ <-- BCH_NVMPG_SYSRECS_OFFSET ++ * | bch_nvmpg_recs | ++ * | (nvm pages internal usage) | ++ * 24KB +---------------------------------+ ++ * | | ++ * | | ++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START ++ * | allocable nvm pages | ++ * | for buddy allocator | ++ * end +---------------------------------+ ++ * ++ * ++ * ++ * Meta data default layout on rested nvdimm namespaces, ++ * ++ * 0 +---------------------------------+ ++ * | | ++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET ++ * | bch_nvmpg_sb | ++ * 8KB +---------------------------------+ ++ * | | ++ * | | ++ * | | ++ * | | ++ * | | ++ * | | ++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START ++ * | allocable nvm pages | ++ * | for buddy allocator | ++ * end +---------------------------------+ ++ * ++ * ++ * - The nvmpg offset format pointer ++ * All member names ending with _offset in this header are nvmpg offset ++ * format pointer. The offset format is, ++ * [highest 3 bits: ns_id] ++ * [rested 61 bits: offset in No. ns_id namespace] ++ * ++ * The above offset is byte unit, the procedure to reference a nvmpg offset ++ * format pointer is, ++ * 1) Identify the namespace related in-memory structure by ns_id from the ++ * highest 3 bits of offset value. ++ * 2) Get the DAX mapping base address from the in-memory structure. ++ * 3) Calculate the actual memory address on nvdimm by plusing the DAX base ++ * address with offset value in rested low 61 bits. ++ * All related in-memory structure and conversion routines don't belong to ++ * user space api, they are defined by nvm-pages allocator code in ++ * drivers/md/bcache/nvm-pages.{c,h} ++ * ++ */ ++ ++#include <linux/types.h> ++ ++/* In sectors */ ++#define BCH_NVMPG_SB_OFFSET 4096 ++#define BCH_NVMPG_START (16 << 20) ++ ++#define BCH_NVMPG_LBL_SIZE 32 ++#define BCH_NVMPG_NS_MAX 8 ++ ++#define BCH_NVMPG_RECLIST_HEAD_OFFSET (8<<10) ++#define BCH_NVMPG_SYSRECS_OFFSET (16<<10) ++ ++#define BCH_NVMPG_SB_VERSION 0 ++#define BCH_NVMPG_SB_VERSION_MAX 0 ++ ++static const __u8 bch_nvmpg_magic[] = { ++ 0x17, 0xbd, 0x53, 0x7f, 0x1b, 0x23, 0xd6, 0x83, ++ 0x46, 0xa4, 0xf8, 0x28, 0x17, 0xda, 0xec, 0xa9 }; ++static const __u8 bch_nvmpg_recs_magic[] = { ++ 0x39, 0x25, 0x3f, 0xf7, 0x27, 0x17, 0xd0, 0xb9, ++ 0x10, 0xe6, 0xd2, 0xda, 0x38, 0x68, 0x26, 0xae }; ++ ++/* takes 64bit width */ ++struct bch_nvmpg_rec { ++ union { ++ struct { ++ __u64 pgoff:52; ++ __u64 order:6; ++ __u64 ns_id:3; ++ __u64 reserved:3; ++ }; ++ __u64 _v; ++ }; ++}; ++ ++struct bch_nvmpg_recs { ++ union { ++ struct { ++ /* ++ * A nvmpg offset format pointer to ++ * struct bch_nvmpg_head ++ */ ++ __u64 head_offset; ++ /* ++ * A nvmpg offset format pointer to ++ * struct bch_nvm_pgalloc_recs which contains ++ * the next recs[] array. 
++ */ ++ __u64 next_offset; ++ __u8 magic[16]; ++ __u8 uuid[16]; ++ __u32 size; ++ __u32 used; ++ __u64 _pad[4]; ++ struct bch_nvmpg_rec recs[]; ++ }; ++ __u8 pad[8192]; ++ }; ++}; ++ ++#define BCH_NVMPG_MAX_RECS \ ++ ((sizeof(struct bch_nvmpg_recs) - \ ++ offsetof(struct bch_nvmpg_recs, recs)) / \ ++ sizeof(struct bch_nvmpg_rec)) ++ ++#define BCH_NVMPG_HD_STAT_FREE 0x0 ++#define BCH_NVMPG_HD_STAT_ALLOC 0x1 ++struct bch_nvmpg_head { ++ __u8 uuid[16]; ++ __u8 label[BCH_NVMPG_LBL_SIZE]; ++ __u32 state; ++ __u32 flags; ++ /* ++ * Array of offset values from the nvmpg offset format ++ * pointers, each of the pointer points to a per-namespace ++ * struct bch_nvmpg_recs. ++ */ ++ __u64 recs_offset[BCH_NVMPG_NS_MAX]; ++}; ++ ++/* heads[0] is always for nvm_pages internal usage */ ++struct bch_nvmpg_set_header { ++ union { ++ struct { ++ __u32 size; ++ __u32 used; ++ __u64 _pad[4]; ++ struct bch_nvmpg_head heads[]; ++ }; ++ __u8 pad[8192]; ++ }; ++}; ++ ++#define BCH_NVMPG_MAX_HEADS \ ++ ((sizeof(struct bch_nvmpg_set_header) - \ ++ offsetof(struct bch_nvmpg_set_header, heads)) / \ ++ sizeof(struct bch_nvmpg_head)) ++ ++/* The on-media bit order is local CPU order */ ++struct bch_nvmpg_sb { ++ __u64 csum; ++ __u64 sb_offset; ++ __u64 ns_start; ++ __u64 version; ++ __u8 magic[16]; ++ __u8 uuid[16]; ++ __u32 page_size; ++ __u32 total_ns; ++ __u32 this_ns; ++ union { ++ __u8 set_uuid[16]; ++ __u64 set_magic; ++ }; ++ ++ __u64 flags; ++ __u64 seq; ++ ++ __u64 feature_compat; ++ __u64 feature_incompat; ++ __u64 feature_ro_compat; ++ ++ /* For allocable nvm pages from buddy systems */ ++ __u64 pages_offset; ++ __u64 pages_total; ++ ++ __u64 pad[8]; ++ ++ /* ++ * A nvmpg offset format pointer, it points ++ * to struct bch_nvmpg_set_header which is ++ * stored only on the first name space. ++ */ ++ __u64 set_header_offset; ++ ++ /* Just for csum_set() */ ++ __u32 keys; ++ __u64 d[0]; ++}; ++ ++#endif /* _NVMPG_FORMAT_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0002-bcache-initialize-the-nvm-pages-allocator.patch b/for-next/nvmpg-bcache-btree/draft/0002-bcache-initialize-the-nvm-pages-allocator.patch new file mode 100644 index 0000000..485a6e0 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0002-bcache-initialize-the-nvm-pages-allocator.patch @@ -0,0 +1,543 @@ +From d0a096b054485476b6788ae2a071c036dcffc248 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Mon, 26 Jul 2021 10:33:30 +0800 +Subject: [PATCH 02/13] bcache: initialize the nvm pages allocator + +This patch define the prototype data structures in memory and +initializes the nvm pages allocator. + +The nvm address space which is managed by this allocator can consist of +many nvm namespaces, and some namespaces can compose into one nvm set, +like cache set. For this initial implementation, only one set can be +supported. + +The users of this nvm pages allocator need to call register_namespace() +to register the nvdimm device (like /dev/pmemX) into this allocator as +the instance of struct nvm_namespace. 
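The "nvmpg offset format" pointers defined in the previous patch and consumed here by bch_nvmpg_offset_to_ptr()/bch_nvmpg_ptr_to_offset() pack a namespace id and a byte offset into one 64-bit value. The sketch below shows that encoding on its own, using the same 3-bit/61-bit split as the BCH_NVMPG_* macros in nvmpg.h below; the demo_* names are illustrative. Dereferencing such a pointer then amounts to looking up the namespace's DAX base address by ns_id and adding the low 61 bits, which is what bch_nvmpg_offset_to_ptr() does.

#include <linux/types.h>

#define DEMO_NS_ID_BITS		3
#define DEMO_OFFSET_BITS	61
#define DEMO_NS_ID_MASK		((1UL << DEMO_NS_ID_BITS) - 1)
#define DEMO_OFFSET_MASK	((1UL << DEMO_OFFSET_BITS) - 1)

/* Pack a namespace id and an in-namespace byte offset into one value. */
static inline u64 demo_encode(unsigned int ns_id, u64 offset)
{
	return ((u64)(ns_id & DEMO_NS_ID_MASK) << DEMO_OFFSET_BITS) |
	       (offset & DEMO_OFFSET_MASK);
}

/* Top 3 bits select the namespace. */
static inline unsigned int demo_ns_id(u64 nvmpg_offset)
{
	return (nvmpg_offset >> DEMO_OFFSET_BITS) & DEMO_NS_ID_MASK;
}

/* Low 61 bits are the byte offset inside that namespace. */
static inline u64 demo_offset(u64 nvmpg_offset)
{
	return nvmpg_offset & DEMO_OFFSET_MASK;
}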
+ +Reported-by: Randy Dunlap <rdunlap@infradead.org> +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/Kconfig | 10 ++ + drivers/md/bcache/Makefile | 1 + + drivers/md/bcache/nvmpg.c | 341 +++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 97 +++++++++++ + drivers/md/bcache/super.c | 3 + + 5 files changed, 452 insertions(+) + create mode 100644 drivers/md/bcache/nvmpg.c + create mode 100644 drivers/md/bcache/nvmpg.h + +diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig +index cf3e8096942a..4a7c13e882bb 100644 +--- a/drivers/md/bcache/Kconfig ++++ b/drivers/md/bcache/Kconfig +@@ -36,3 +36,13 @@ config BCACHE_ASYNC_REGISTRATION + device path into this file will returns immediately and the real + registration work is handled in kernel work queue in asynchronous + way. ++ ++config BCACHE_NVM_PAGES ++ bool "NVDIMM support for bcache (EXPERIMENTAL)" ++ depends on BCACHE ++ depends on 64BIT ++ depends on LIBNVDIMM ++ depends on DAX ++ help ++ Allocate/release NV-memory pages for bcache and provide allocated pages ++ for each requestor after system reboot. +diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile +index 5b87e59676b8..276b33be5ad5 100644 +--- a/drivers/md/bcache/Makefile ++++ b/drivers/md/bcache/Makefile +@@ -5,3 +5,4 @@ obj-$(CONFIG_BCACHE) += bcache.o + bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ + io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o features.o ++bcache-$(CONFIG_BCACHE_NVM_PAGES) += nvmpg.o +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +new file mode 100644 +index 000000000000..be006a91e8bb +--- /dev/null ++++ b/drivers/md/bcache/nvmpg.c +@@ -0,0 +1,341 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * Nvdimm page-buddy allocator ++ * ++ * Copyright (c) 2021, Intel Corporation. ++ * Copyright (c) 2021, Qiaowei Ren <qiaowei.ren@intel.com>. ++ * Copyright (c) 2021, Jianpeng Ma <jianpeng.ma@intel.com>. 
++ */ ++ ++#include "bcache.h" ++#include "nvmpg.h" ++ ++#include <linux/slab.h> ++#include <linux/list.h> ++#include <linux/mutex.h> ++#include <linux/dax.h> ++#include <linux/pfn_t.h> ++#include <linux/libnvdimm.h> ++#include <linux/mm_types.h> ++#include <linux/err.h> ++#include <linux/pagemap.h> ++#include <linux/bitmap.h> ++#include <linux/blkdev.h> ++ ++struct bch_nvmpg_set *global_nvmpg_set; ++ ++void *bch_nvmpg_offset_to_ptr(unsigned long offset) ++{ ++ int ns_id = BCH_NVMPG_GET_NS_ID(offset); ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[ns_id]; ++ ++ if (offset == 0) ++ return NULL; ++ ++ ns_id = BCH_NVMPG_GET_NS_ID(offset); ++ ns = global_nvmpg_set->ns_tbl[ns_id]; ++ ++ if (ns) ++ return (void *)(ns->base_addr + BCH_NVMPG_GET_OFFSET(offset)); ++ ++ pr_err("Invalid ns_id %u\n", ns_id); ++ return NULL; ++} ++ ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) ++{ ++ int ns_id = ns->ns_id; ++ unsigned long offset = (unsigned long)(ptr - ns->base_addr); ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ ++static void release_ns_tbl(struct bch_nvmpg_set *set) ++{ ++ int i; ++ struct bch_nvmpg_ns *ns; ++ ++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { ++ ns = set->ns_tbl[i]; ++ if (ns) { ++ fs_put_dax(ns->dax_dev); ++ blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC); ++ set->ns_tbl[i] = NULL; ++ set->attached_ns--; ++ kfree(ns); ++ } ++ } ++ ++ if (set->attached_ns) ++ pr_err("unexpected attached_ns: %u\n", set->attached_ns); ++} ++ ++static void release_nvmpg_set(struct bch_nvmpg_set *set) ++{ ++ release_ns_tbl(set); ++ kfree(set); ++} ++ ++/* Namespace 0 contains all meta data of the nvmpg allocation set */ ++static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) ++{ ++ struct bch_nvmpg_set_header *set_header; ++ ++ if (ns->ns_id != 0) { ++ pr_err("unexpected ns_id %u for first nvmpg namespace.\n", ++ ns->ns_id); ++ return -EINVAL; ++ } ++ ++ set_header = bch_nvmpg_offset_to_ptr(ns->sb->set_header_offset); ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ global_nvmpg_set->set_header = set_header; ++ global_nvmpg_set->heads_size = set_header->size; ++ global_nvmpg_set->heads_used = set_header->used; ++ mutex_unlock(&global_nvmpg_set->lock); ++ ++ return 0; ++} ++ ++static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) ++{ ++ struct bch_nvmpg_sb *sb = ns->sb; ++ int rc = 0; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ ++ if (global_nvmpg_set->ns_tbl[sb->this_ns]) { ++ pr_err("ns_id %u already attached.\n", ns->ns_id); ++ rc = -EEXIST; ++ goto unlock; ++ } ++ ++ if (ns->ns_id != 0) { ++ pr_err("unexpected ns_id %u for first namespace.\n", ns->ns_id); ++ rc = -EINVAL; ++ goto unlock; ++ } ++ ++ if (global_nvmpg_set->attached_ns > 0) { ++ pr_err("multiple namespace attaching not supported yet\n"); ++ rc = -EOPNOTSUPP; ++ goto unlock; ++ } ++ ++ if ((global_nvmpg_set->attached_ns + 1) > sb->total_ns) { ++ pr_err("namespace counters error: attached %u > total %u\n", ++ global_nvmpg_set->attached_ns, ++ global_nvmpg_set->total_ns); ++ rc = -EINVAL; ++ goto unlock; ++ } ++ ++ memcpy(global_nvmpg_set->set_uuid, sb->set_uuid, 16); ++ global_nvmpg_set->ns_tbl[sb->this_ns] = ns; ++ global_nvmpg_set->attached_ns++; ++ global_nvmpg_set->total_ns = sb->total_ns; ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++ return rc; ++} ++ ++static int read_nvdimm_meta_super(struct block_device *bdev, ++ struct bch_nvmpg_ns *ns) ++{ ++ struct page *page; ++ struct bch_nvmpg_sb *sb; ++ uint64_t expected_csum = 0; ++ int r; ++ ++ page = 
read_cache_page_gfp(bdev->bd_inode->i_mapping, ++ BCH_NVMPG_SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); ++ ++ if (IS_ERR(page)) ++ return -EIO; ++ ++ sb = (struct bch_nvmpg_sb *) ++ (page_address(page) + offset_in_page(BCH_NVMPG_SB_OFFSET)); ++ ++ r = -EINVAL; ++ expected_csum = csum_set(sb); ++ if (expected_csum != sb->csum) { ++ pr_info("csum is not match with expected one\n"); ++ goto put_page; ++ } ++ ++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) { ++ pr_info("invalid bch_nvmpg_magic\n"); ++ goto put_page; ++ } ++ ++ if (sb->sb_offset != ++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_SB_OFFSET)) { ++ pr_info("invalid superblock offset 0x%llx\n", sb->sb_offset); ++ goto put_page; ++ } ++ ++ r = -EOPNOTSUPP; ++ if (sb->total_ns != 1) { ++ pr_info("multiple name space not supported yet.\n"); ++ goto put_page; ++ } ++ ++ ++ r = 0; ++ /* Necessary for DAX mapping */ ++ ns->page_size = sb->page_size; ++ ns->pages_total = sb->pages_total; ++ ++put_page: ++ put_page(page); ++ return r; ++} ++ ++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) ++{ ++ struct bch_nvmpg_ns *ns = NULL; ++ struct bch_nvmpg_sb *sb = NULL; ++ char buf[BDEVNAME_SIZE]; ++ struct block_device *bdev; ++ pgoff_t pgoff; ++ int id, err; ++ char *path; ++ long dax_ret = 0; ++ ++ path = kstrndup(dev_path, 512, GFP_KERNEL); ++ if (!path) { ++ pr_err("kstrndup failed\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ bdev = blkdev_get_by_path(strim(path), ++ FMODE_READ|FMODE_WRITE|FMODE_EXEC, ++ global_nvmpg_set); ++ if (IS_ERR(bdev)) { ++ pr_err("get %s error: %ld\n", dev_path, PTR_ERR(bdev)); ++ kfree(path); ++ return ERR_PTR(PTR_ERR(bdev)); ++ } ++ ++ err = -ENOMEM; ++ ns = kzalloc(sizeof(struct bch_nvmpg_ns), GFP_KERNEL); ++ if (!ns) ++ goto bdput; ++ ++ err = -EIO; ++ if (read_nvdimm_meta_super(bdev, ns)) { ++ pr_err("%s read nvdimm meta super block failed.\n", ++ bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EOPNOTSUPP; ++ ns->dax_dev = fs_dax_get_by_bdev(bdev); ++ if (!ns->dax_dev) { ++ pr_err("can't get dax device by %s\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ if (!dax_supported(ns->dax_dev, bdev, ns->page_size, 0, ++ bdev_nr_sectors(bdev))) { ++ pr_err("%s don't support DAX\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EINVAL; ++ if (bdev_dax_pgoff(bdev, 0, ns->page_size, &pgoff)) { ++ pr_err("invalid offset of %s\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EINVAL; ++ id = dax_read_lock(); ++ dax_ret = dax_direct_access(ns->dax_dev, pgoff, ns->pages_total, ++ &ns->base_addr, &ns->start_pfn); ++ if (dax_ret <= 0) { ++ pr_err("dax_direct_access error\n"); ++ dax_read_unlock(id); ++ goto free_ns; ++ } ++ ++ if (dax_ret < ns->pages_total) { ++ pr_warn("mapped range %ld is less than ns->pages_total %lu\n", ++ dax_ret, ns->pages_total); ++ } ++ dax_read_unlock(id); ++ ++ sb = (struct bch_nvmpg_sb *)(ns->base_addr + BCH_NVMPG_SB_OFFSET); ++ ++ err = -EINVAL; ++ /* Check magic again to make sure DAX mapping is correct */ ++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) { ++ pr_err("invalid bch_nvmpg_magic after DAX mapping\n"); ++ goto free_ns; ++ } ++ ++ if ((global_nvmpg_set->attached_ns > 0) && ++ memcmp(sb->set_uuid, global_nvmpg_set->set_uuid, 16)) { ++ pr_err("set uuid does not match with ns_id %u\n", ns->ns_id); ++ goto free_ns; ++ } ++ ++ if (sb->set_header_offset != ++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_RECLIST_HEAD_OFFSET)) { ++ pr_err("Invalid header offset: this_ns %u, ns_id %llu, offset 0x%llx\n", ++ sb->this_ns, ++ 
BCH_NVMPG_GET_NS_ID(sb->set_header_offset), ++ BCH_NVMPG_GET_OFFSET(sb->set_header_offset)); ++ goto free_ns; ++ } ++ ++ ns->page_size = sb->page_size; ++ ns->pages_offset = sb->pages_offset; ++ ns->pages_total = sb->pages_total; ++ ns->sb = sb; ++ ns->free = 0; ++ ns->bdev = bdev; ++ ns->set = global_nvmpg_set; ++ ++ err = attach_nvmpg_set(ns); ++ if (err < 0) ++ goto free_ns; ++ ++ mutex_init(&ns->lock); ++ ++ err = init_nvmpg_set_header(ns); ++ if (err < 0) ++ goto free_ns; ++ ++ kfree(path); ++ return ns; ++ ++free_ns: ++ fs_put_dax(ns->dax_dev); ++ kfree(ns); ++bdput: ++ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC); ++ kfree(path); ++ return ERR_PTR(err); ++} ++EXPORT_SYMBOL_GPL(bch_register_namespace); ++ ++int __init bch_nvmpg_init(void) ++{ ++ global_nvmpg_set = kzalloc(sizeof(*global_nvmpg_set), GFP_KERNEL); ++ if (!global_nvmpg_set) ++ return -ENOMEM; ++ ++ global_nvmpg_set->total_ns = 0; ++ mutex_init(&global_nvmpg_set->lock); ++ ++ pr_info("bcache nvm init\n"); ++ return 0; ++} ++ ++void bch_nvmpg_exit(void) ++{ ++ release_nvmpg_set(global_nvmpg_set); ++ pr_info("bcache nvm exit\n"); ++} +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +new file mode 100644 +index 000000000000..698c890b2d15 +--- /dev/null ++++ b/drivers/md/bcache/nvmpg.h +@@ -0,0 +1,97 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _BCACHE_NVM_PAGES_H ++#define _BCACHE_NVM_PAGES_H ++ ++#include <linux/libnvdimm.h> ++ ++#include "nvmpg_format.h" ++ ++/* ++ * Bcache NVDIMM in memory data structures ++ */ ++ ++/* ++ * The following three structures in memory records which page(s) allocated ++ * to which owner. After reboot from power failure, they will be initialized ++ * based on nvm pages superblock in NVDIMM device. ++ */ ++struct bch_nvmpg_ns { ++ struct bch_nvmpg_sb *sb; ++ void *base_addr; ++ ++ unsigned char uuid[16]; ++ int ns_id; ++ unsigned int page_size; ++ unsigned long free; ++ unsigned long pages_offset; ++ unsigned long pages_total; ++ pfn_t start_pfn; ++ ++ struct dax_device *dax_dev; ++ struct block_device *bdev; ++ struct bch_nvmpg_set *set; ++ ++ struct mutex lock; ++}; ++ ++/* ++ * A set of namespaces. Currently only one set can be supported. 
++ */ ++struct bch_nvmpg_set { ++ unsigned char set_uuid[16]; ++ ++ int heads_size; ++ int heads_used; ++ struct bch_nvmpg_set_header *set_header; ++ ++ struct bch_nvmpg_ns *ns_tbl[BCH_NVMPG_NS_MAX]; ++ int total_ns; ++ int attached_ns; ++ ++ struct mutex lock; ++}; ++ ++#define BCH_NVMPG_NS_ID_BITS 3 ++#define BCH_NVMPG_OFFSET_BITS 61 ++#define BCH_NVMPG_NS_ID_MASK ((1UL<<BCH_NVMPG_NS_ID_BITS) - 1) ++#define BCH_NVMPG_OFFSET_MASK ((1UL<<BCH_NVMPG_OFFSET_BITS) - 1) ++ ++#define BCH_NVMPG_GET_NS_ID(offset) \ ++ (((offset) >> BCH_NVMPG_OFFSET_BITS) & BCH_NVMPG_NS_ID_MASK) ++ ++#define BCH_NVMPG_GET_OFFSET(offset) ((offset) & BCH_NVMPG_OFFSET_MASK) ++ ++#define BCH_NVMPG_OFFSET(ns_id, offset) \ ++ ((((ns_id) & BCH_NVMPG_NS_ID_MASK) << BCH_NVMPG_OFFSET_BITS) | \ ++ ((offset) & BCH_NVMPG_OFFSET_MASK)) ++ ++/* Indicate which field in bch_nvmpg_sb to be updated */ ++#define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ ++ ++void *bch_nvmpg_offset_to_ptr(unsigned long offset); ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); ++int bch_nvmpg_init(void); ++void bch_nvmpg_exit(void); ++ ++#else ++ ++static inline struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) ++{ ++ return NULL; ++} ++ ++static inline int bch_nvmpg_init(void) ++{ ++ return 0; ++} ++ ++static inline void bch_nvmpg_exit(void) { } ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++#endif /* _BCACHE_NVM_PAGES_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index dc35f6e1d8d3..841d08e50191 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -14,6 +14,7 @@ + #include "request.h" + #include "writeback.h" + #include "features.h" ++#include "nvmpg.h" + + #include <linux/blkdev.h> + #include <linux/pagemap.h> +@@ -2811,6 +2812,7 @@ static void bcache_exit(void) + { + bch_debug_exit(); + bch_request_exit(); ++ bch_nvmpg_exit(); + if (bcache_kobj) + kobject_put(bcache_kobj); + if (bcache_wq) +@@ -2909,6 +2911,7 @@ static int __init bcache_init(void) + + bch_debug_init(); + closure_debug_init(); ++ bch_nvmpg_init(); + + bcache_is_reboot = false; + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0003-bcache-initialization-of-the-buddy.patch b/for-next/nvmpg-bcache-btree/draft/0003-bcache-initialization-of-the-buddy.patch new file mode 100644 index 0000000..395f285 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0003-bcache-initialization-of-the-buddy.patch @@ -0,0 +1,359 @@ +From c9977c3fd9e238ac5a8a684de5a8dc5c8a4462e2 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 19:45:57 +0800 +Subject: [PATCH 03/13] bcache: initialization of the buddy + +This nvm pages allocator will implement the simple buddy allocator to +anage the nvm address space. This patch initializes this buddy allocator +for new namespace. + +the unit of alloc/free of the buddy allocator is page. DAX device has +their struct page(in dram or PMEM). + + struct { /* ZONE_DEVICE pages */ + /** @pgmap: Points to the hosting device page map. */ + struct dev_pagemap *pgmap; + void *zone_device_data; + /* + * ZONE_DEVICE private pages are counted as being + * mapped so the next 3 words hold the mapping, index, + * and private fields from the source anonymous or + * page cache page while the page is migrated to device + * private memory. 
+ * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also + * use the mapping, index, and private fields when + * pmem backed DAX files are mapped. + */ + }; + +ZONE_DEVICE pages only use pgmap. Other 4 words[16/32 bytes] don't use. +So the second/third word will be used as 'struct list_head ' which list +in buddy. The fourth word(that is normal struct page::index) store pgoff +which the page-offset in the dax device. And the fifth word (that is +normal struct page::private) store order of buddy. page_type will be used +to store buddy flags. + +Reported-by: kernel test robot <lkp@intel.com> +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 212 +++++++++++++++++++++++++++++++++++++- + drivers/md/bcache/nvmpg.h | 12 +++ + 2 files changed, 221 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index be006a91e8bb..b51073588f65 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -50,6 +50,36 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) + return BCH_NVMPG_OFFSET(ns_id, offset); + } + ++static struct page *bch_nvmpg_va_to_pg(void *addr) ++{ ++ return virt_to_page(addr); ++} ++ ++static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff) ++{ ++ return ns->base_addr + (pgoff << PAGE_SHIFT); ++} ++ ++static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) ++{ ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id]; ++ pgoff_t pgoff = r->pgoff; ++ ++ return bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++} ++ ++static inline void reserve_nvmpg_pages(struct bch_nvmpg_ns *ns, ++ pgoff_t pgoff, u64 nr) ++{ ++ while (nr > 0) { ++ unsigned int num = nr > UINT_MAX ? 
UINT_MAX : nr; ++ ++ bitmap_set(ns->pages_bitmap, pgoff, num); ++ nr -= num; ++ pgoff += num; ++ } ++} ++ + static void release_ns_tbl(struct bch_nvmpg_set *set) + { + int i; +@@ -58,6 +88,10 @@ static void release_ns_tbl(struct bch_nvmpg_set *set) + for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { + ns = set->ns_tbl[i]; + if (ns) { ++ kvfree(ns->pages_bitmap); ++ if (ns->recs_bitmap) ++ bitmap_free(ns->recs_bitmap); ++ + fs_put_dax(ns->dax_dev); + blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC); + set->ns_tbl[i] = NULL; +@@ -76,10 +110,73 @@ static void release_nvmpg_set(struct bch_nvmpg_set *set) + kfree(set); + } + ++static int validate_recs(int ns_id, ++ struct bch_nvmpg_head *head, ++ struct bch_nvmpg_recs *recs) ++{ ++ if (memcmp(recs->magic, bch_nvmpg_recs_magic, 16)) { ++ pr_err("Invalid bch_nvmpg_recs magic\n"); ++ return -EINVAL; ++ } ++ ++ if (memcmp(recs->uuid, head->uuid, 16)) { ++ pr_err("Invalid bch_nvmpg_recs uuid\n"); ++ return -EINVAL; ++ } ++ ++ if (recs->head_offset != ++ bch_nvmpg_ptr_to_offset(global_nvmpg_set->ns_tbl[ns_id], head)) { ++ pr_err("Invalid recs head_offset\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int reserve_nvmpg_recs(struct bch_nvmpg_recs *recs) ++{ ++ int i, used = 0; ++ ++ for (i = 0; i < recs->size; i++) { ++ struct bch_nvmpg_rec *r = &recs->recs[i]; ++ struct bch_nvmpg_ns *ns; ++ struct page *page; ++ void *addr; ++ ++ if (r->pgoff == 0) ++ continue; ++ ++ ns = global_nvmpg_set->ns_tbl[r->ns_id]; ++ addr = bch_nvmpg_rec_to_ptr(r); ++ if (addr < ns->base_addr) { ++ pr_err("Invalid recorded address\n"); ++ return -EINVAL; ++ } ++ ++ /* init struct page: index/private */ ++ page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(page, r->order); ++ page->index = r->pgoff; ++ ++ reserve_nvmpg_pages(ns, r->pgoff, 1L << r->order); ++ used++; ++ } ++ ++ if (used != recs->used) { ++ pr_err("used %d doesn't match recs->used %d\n", ++ used, recs->used); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ + /* Namespace 0 contains all meta data of the nvmpg allocation set */ + static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_set_header *set_header; ++ struct bch_nvmpg_recs *sys_recs; ++ int i, j, used = 0, rc = 0; + + if (ns->ns_id != 0) { + pr_err("unexpected ns_id %u for first nvmpg namespace.\n", +@@ -93,9 +190,83 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + global_nvmpg_set->set_header = set_header; + global_nvmpg_set->heads_size = set_header->size; + global_nvmpg_set->heads_used = set_header->used; ++ ++ /* Reserve the used space from buddy allocator */ ++ reserve_nvmpg_pages(ns, 0, div_u64(ns->pages_offset, ns->page_size)); ++ ++ sys_recs = ns->base_addr + BCH_NVMPG_SYSRECS_OFFSET; ++ for (i = 0; i < set_header->size; i++) { ++ struct bch_nvmpg_head *head; ++ ++ head = &set_header->heads[i]; ++ if (head->state == BCH_NVMPG_HD_STAT_FREE) ++ continue; ++ ++ used++; ++ if (used > global_nvmpg_set->heads_size) { ++ pr_err("used heads %d > heads size %d.\n", ++ used, global_nvmpg_set->heads_size); ++ goto unlock; ++ } ++ ++ for (j = 0; j < BCH_NVMPG_NS_MAX; j++) { ++ struct bch_nvmpg_recs *recs; ++ ++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[j]); ++ ++ /* Iterate the recs list */ ++ while (recs) { ++ rc = validate_recs(j, head, recs); ++ if (rc < 0) ++ goto unlock; ++ ++ rc = reserve_nvmpg_recs(recs); ++ if (rc < 0) ++ goto unlock; ++ ++ bitmap_set(ns->recs_bitmap, recs - sys_recs, 1); ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++ } ++ } ++unlock: + 
mutex_unlock(&global_nvmpg_set->lock); ++ return rc; ++} + +- return 0; ++static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) ++{ ++ unsigned int start, end, pages; ++ int i; ++ struct page *page; ++ pgoff_t pgoff_start; ++ ++ bitmap_for_each_clear_region(ns->pages_bitmap, ++ start, end, 0, ns->pages_total) { ++ pgoff_start = start; ++ pages = end - start; ++ ++ while (pages) { ++ void *addr; ++ ++ for (i = BCH_MAX_ORDER - 1; i >= 0; i--) { ++ if ((pgoff_start % (1L << i) == 0) && ++ (pages >= (1L << i))) ++ break; ++ } ++ ++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start); ++ page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(page, i); ++ page->index = pgoff_start; ++ __SetPageBuddy(page); ++ list_add((struct list_head *)&page->zone_device_data, ++ &ns->free_area[i]); ++ ++ pgoff_start += 1L << i; ++ pages -= 1L << i; ++ } ++ } + } + + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) +@@ -200,7 +371,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + char buf[BDEVNAME_SIZE]; + struct block_device *bdev; + pgoff_t pgoff; +- int id, err; ++ int id, i, err; + char *path; + long dax_ret = 0; + +@@ -304,13 +475,48 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + + mutex_init(&ns->lock); + ++ /* ++ * parameters of bitmap_set/clear are unsigned int. ++ * Given currently size of nvm is far from exceeding this limit, ++ * so only add a WARN_ON message. ++ */ ++ WARN_ON(BITS_TO_LONGS(ns->pages_total) > UINT_MAX); ++ ns->pages_bitmap = kvcalloc(BITS_TO_LONGS(ns->pages_total), ++ sizeof(unsigned long), GFP_KERNEL); ++ if (!ns->pages_bitmap) { ++ err = -ENOMEM; ++ goto clear_ns_nr; ++ } ++ ++ if (ns->sb->this_ns == 0) { ++ ns->recs_bitmap = ++ bitmap_zalloc(BCH_MAX_PGALLOC_RECS, GFP_KERNEL); ++ if (ns->recs_bitmap == NULL) { ++ err = -ENOMEM; ++ goto free_pages_bitmap; ++ } ++ } ++ ++ for (i = 0; i < BCH_MAX_ORDER; i++) ++ INIT_LIST_HEAD(&ns->free_area[i]); ++ + err = init_nvmpg_set_header(ns); + if (err < 0) +- goto free_ns; ++ goto free_recs_bitmap; ++ ++ if (ns->sb->this_ns == 0) ++ /* init buddy allocator */ ++ bch_nvmpg_init_free_space(ns); + + kfree(path); + return ns; + ++free_recs_bitmap: ++ bitmap_free(ns->recs_bitmap); ++free_pages_bitmap: ++ kvfree(ns->pages_bitmap); ++clear_ns_nr: ++ global_nvmpg_set->ns_tbl[sb->this_ns] = NULL; + free_ns: + fs_put_dax(ns->dax_dev); + kfree(ns); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 698c890b2d15..55778d4db7da 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -11,6 +11,8 @@ + * Bcache NVDIMM in memory data structures + */ + ++#define BCH_MAX_ORDER 20 ++ + /* + * The following three structures in memory records which page(s) allocated + * to which owner. 
After reboot from power failure, they will be initialized +@@ -28,6 +30,11 @@ struct bch_nvmpg_ns { + unsigned long pages_total; + pfn_t start_pfn; + ++ unsigned long *pages_bitmap; ++ struct list_head free_area[BCH_MAX_ORDER]; ++ ++ unsigned long *recs_bitmap; ++ + struct dax_device *dax_dev; + struct block_device *bdev; + struct bch_nvmpg_set *set; +@@ -69,6 +76,11 @@ struct bch_nvmpg_set { + /* Indicate which field in bch_nvmpg_sb to be updated */ + #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ + ++#define BCH_MAX_PGALLOC_RECS \ ++ (min_t(unsigned int, 64, \ ++ (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ ++ sizeof(struct bch_nvmpg_recs))) ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset); + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch b/for-next/nvmpg-bcache-btree/draft/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch new file mode 100644 index 0000000..9667099 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch @@ -0,0 +1,309 @@ +From 8d0370253021430d3e59b084ce242a32410a51c0 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Wed, 4 Aug 2021 22:41:20 +0800 +Subject: [PATCH 04/13] bcache: bch_nvmpg_alloc_pages() of the buddy + +This patch implements bch_nvmpg_alloc_pages() of the nvm pages buddy +allocator. Functionally it is similar to the kernel page buddy allocator, +but with the following differences: +a: it takes an owner_uuid parameter which records the owner information, +and makes that information persistent. +b: it does not take GFP_* style flags; all allocations are treated equally. +c: it does not trigger other operations such as swap or page reclaim. + +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 222 ++++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 9 ++ + 2 files changed, 231 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index b51073588f65..8c0e827a98cd 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -42,6 +42,11 @@ void *bch_nvmpg_offset_to_ptr(unsigned long offset) + return NULL; + } + ++static unsigned long bch_nvmpg_offset_to_pgoff(unsigned long nvmpg_offset) ++{ ++ return BCH_NVMPG_GET_OFFSET(nvmpg_offset) >> PAGE_SHIFT; ++} ++ + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) + { + int ns_id = ns->ns_id; +@@ -60,6 +65,15 @@ static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff) + return ns->base_addr + (pgoff << PAGE_SHIFT); + } + ++static unsigned long bch_nvmpg_pgoff_to_offset(struct bch_nvmpg_ns *ns, ++ pgoff_t pgoff) ++{ ++ int ns_id = ns->ns_id; ++ unsigned long offset = pgoff << PAGE_SHIFT; ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ + static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) + { + struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id]; +@@ -269,6 +283,214 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + } + } + ++ ++/* Look up the head for uuid; if not found, create one when create == true */ ++static struct bch_nvmpg_head *find_nvmpg_head(const char *uuid, bool create) ++{ ++ struct bch_nvmpg_set_header *set_header =
global_nvmpg_set->set_header; ++ struct bch_nvmpg_head *head = NULL; ++ int i; ++ ++ if (set_header == NULL) ++ goto out; ++ ++ for (i = 0; i < set_header->size; i++) { ++ struct bch_nvmpg_head *h = &set_header->heads[i]; ++ ++ if (h->state != BCH_NVMPG_HD_STAT_ALLOC) ++ continue; ++ ++ if (!memcmp(uuid, h->uuid, 16)) { ++ head = h; ++ break; ++ } ++ } ++ ++ if (!head && create) { ++ u32 used = set_header->used; ++ ++ if (set_header->size > used) { ++ head = &set_header->heads[used]; ++ memset(head, 0, sizeof(struct bch_nvmpg_head)); ++ head->state = BCH_NVMPG_HD_STAT_ALLOC; ++ memcpy(head->uuid, uuid, 16); ++ global_nvmpg_set->heads_used++; ++ set_header->used++; ++ } else ++ pr_info("No free bch_nvmpg_head\n"); ++ } ++ ++out: ++ return head; ++} ++ ++static struct bch_nvmpg_recs *find_empty_nvmpg_recs(void) ++{ ++ unsigned int start; ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[0]; ++ struct bch_nvmpg_recs *recs; ++ ++ start = bitmap_find_next_zero_area(ns->recs_bitmap, ++ BCH_MAX_PGALLOC_RECS, 0, 1, 0); ++ if (start > BCH_MAX_PGALLOC_RECS) { ++ pr_info("No free struct bch_nvmpg_recs\n"); ++ return NULL; ++ } ++ ++ bitmap_set(ns->recs_bitmap, start, 1); ++ recs = (struct bch_nvmpg_recs *) ++ bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET) ++ + start; ++ ++ memset(recs, 0, sizeof(struct bch_nvmpg_recs)); ++ return recs; ++} ++ ++ ++static struct bch_nvmpg_recs *find_nvmpg_recs(struct bch_nvmpg_ns *ns, ++ struct bch_nvmpg_head *head, ++ bool create) ++{ ++ int ns_id = ns->sb->this_ns; ++ struct bch_nvmpg_recs *prev_recs = NULL, *recs = NULL; ++ ++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[ns_id]); ++ ++ /* If create=false, we return recs[nr] */ ++ if (!create) ++ return recs; ++ ++ /* ++ * If create=true, it mean we need a empty struct bch_nvmpg_rec ++ * So we should find non-empty struct bch_nvmpg_recs or alloc ++ * new struct bch_nvmpg_recs. 
And return this bch_nvmpg_recs ++ */ ++ while (recs && (recs->used == recs->size)) { ++ prev_recs = recs; ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++ ++ /* Found empty struct bch_nvmpg_recs */ ++ if (recs) ++ return recs; ++ ++ /* Need alloc new struct bch_nvmpg_recs */ ++ recs = find_empty_nvmpg_recs(); ++ if (recs) { ++ unsigned long offset; ++ ++ recs->next_offset = 0; ++ recs->head_offset = bch_nvmpg_ptr_to_offset(ns, head); ++ memcpy(recs->magic, bch_nvmpg_recs_magic, 16); ++ memcpy(recs->uuid, head->uuid, 16); ++ recs->size = BCH_NVMPG_MAX_RECS; ++ recs->used = 0; ++ ++ offset = bch_nvmpg_ptr_to_offset(ns, recs); ++ if (prev_recs) ++ prev_recs->next_offset = offset; ++ else ++ head->recs_offset[ns_id] = offset; ++ } ++ ++ return recs; ++} ++ ++static void add_nvmpg_rec(struct bch_nvmpg_ns *ns, ++ struct bch_nvmpg_recs *recs, ++ unsigned long nvmpg_offset, ++ int order) ++{ ++ int i, ns_id; ++ unsigned long pgoff; ++ ++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset); ++ ns_id = ns->sb->this_ns; ++ ++ for (i = 0; i < recs->size; i++) { ++ if (recs->recs[i].pgoff == 0) { ++ recs->recs[i].pgoff = pgoff; ++ recs->recs[i].order = order; ++ recs->recs[i].ns_id = ns_id; ++ recs->used++; ++ break; ++ } ++ } ++ BUG_ON(i == recs->size); ++} ++ ++ ++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) ++{ ++ unsigned long nvmpg_offset = 0; ++ struct bch_nvmpg_head *head; ++ int n, o; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ head = find_nvmpg_head(uuid, true); ++ ++ if (!head) { ++ pr_err("Cannot find bch_nvmpg_recs by uuid.\n"); ++ goto unlock; ++ } ++ ++ for (n = 0; n < global_nvmpg_set->total_ns; n++) { ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[n]; ++ ++ if (!ns || (ns->free < (1L << order))) ++ continue; ++ ++ for (o = order; o < BCH_MAX_ORDER; o++) { ++ struct list_head *list; ++ struct page *page, *buddy_page; ++ ++ if (list_empty(&ns->free_area[o])) ++ continue; ++ ++ list = ns->free_area[o].next; ++ page = container_of((void *)list, struct page, ++ zone_device_data); ++ ++ list_del(list); ++ ++ while (o != order) { ++ void *addr; ++ pgoff_t pgoff; ++ ++ pgoff = page->index + (1L << (o - 1)); ++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++ buddy_page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(buddy_page, o - 1); ++ buddy_page->index = pgoff; ++ __SetPageBuddy(buddy_page); ++ list_add((struct list_head *)&buddy_page->zone_device_data, ++ &ns->free_area[o - 1]); ++ o--; ++ } ++ ++ set_page_private(page, order); ++ __ClearPageBuddy(page); ++ ns->free -= 1L << order; ++ nvmpg_offset = bch_nvmpg_pgoff_to_offset(ns, page->index); ++ break; ++ } ++ ++ if (o < BCH_MAX_ORDER) { ++ struct bch_nvmpg_recs *recs; ++ ++ recs = find_nvmpg_recs(ns, head, true); ++ /* ToDo: handle pgalloc_recs==NULL */ ++ add_nvmpg_rec(ns, recs, nvmpg_offset, order); ++ break; ++ } ++ } ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++ return nvmpg_offset; ++} ++EXPORT_SYMBOL_GPL(bch_nvmpg_alloc_pages); ++ + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_sb *sb = ns->sb; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 55778d4db7da..d03f3241b45a 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -76,6 +76,9 @@ struct bch_nvmpg_set { + /* Indicate which field in bch_nvmpg_sb to be updated */ + #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ + ++#define BCH_PGOFF_TO_KVADDR(pgoff) \ ++ ((void *)((unsigned long)(pgoff) << PAGE_SHIFT)) ++ + #define BCH_MAX_PGALLOC_RECS \ + (min_t(unsigned 
int, 64, \ + (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ +@@ -89,6 +92,7 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); + int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); ++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + + #else + +@@ -104,6 +108,11 @@ static inline int bch_nvmpg_init(void) + + static inline void bch_nvmpg_exit(void) { } + ++static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) ++{ ++ return 0; ++} ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch b/for-next/nvmpg-bcache-btree/draft/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch new file mode 100644 index 0000000..0f8454f --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch @@ -0,0 +1,252 @@ +From f0165caac63639c6bbc9bfa2182500ecebdb6bf9 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 19:06:35 +0800 +Subject: [PATCH 05/13] bcache: bch_nvmpg_free_pages() of the buddy allocator + +This patch implements the bch_nvmpg_free_pages() of the buddy allocator. + +The difference between this and page-buddy-free: +it need owner_uuid to free owner allocated pages, and must +persistent after free. + +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 165 ++++++++++++++++++++++++++++++++++++-- + drivers/md/bcache/nvmpg.h | 3 + + 2 files changed, 161 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 8c0e827a98cd..7b86f08c219a 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -248,6 +248,57 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + return rc; + } + ++static void __free_space(struct bch_nvmpg_ns *ns, unsigned long nvmpg_offset, ++ int order) ++{ ++ unsigned long add_pages = (1L << order); ++ pgoff_t pgoff; ++ struct page *page; ++ void *va; ++ ++ if (nvmpg_offset == 0) { ++ pr_err("free pages on offset 0\n"); ++ return; ++ } ++ ++ page = bch_nvmpg_va_to_pg(bch_nvmpg_offset_to_ptr(nvmpg_offset)); ++ WARN_ON((!page) || (page->private != order)); ++ pgoff = page->index; ++ ++ while (order < BCH_MAX_ORDER - 1) { ++ struct page *buddy_page; ++ ++ pgoff_t buddy_pgoff = pgoff ^ (1L << order); ++ pgoff_t parent_pgoff = pgoff & ~(1L << order); ++ ++ if ((parent_pgoff + (1L << (order + 1)) > ns->pages_total)) ++ break; ++ ++ va = bch_nvmpg_pgoff_to_ptr(ns, buddy_pgoff); ++ buddy_page = bch_nvmpg_va_to_pg(va); ++ WARN_ON(!buddy_page); ++ ++ if (PageBuddy(buddy_page) && (buddy_page->private == order)) { ++ list_del((struct list_head *)&buddy_page->zone_device_data); ++ __ClearPageBuddy(buddy_page); ++ pgoff = parent_pgoff; ++ order++; ++ continue; ++ } ++ break; ++ } ++ ++ va = bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++ page = bch_nvmpg_va_to_pg(va); ++ WARN_ON(!page); ++ list_add((struct list_head *)&page->zone_device_data, ++ &ns->free_area[order]); ++ page->index = pgoff; ++ set_page_private(page, order); ++ __SetPageBuddy(page); ++ ns->free 
+= add_pages; ++} ++ + static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + { + unsigned int start, end, pages; +@@ -261,21 +312,19 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + pages = end - start; + + while (pages) { +- void *addr; +- + for (i = BCH_MAX_ORDER - 1; i >= 0; i--) { + if ((pgoff_start % (1L << i) == 0) && + (pages >= (1L << i))) + break; + } + +- addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start); +- page = bch_nvmpg_va_to_pg(addr); ++ page = bch_nvmpg_va_to_pg( ++ bch_nvmpg_pgoff_to_ptr(ns, pgoff_start)); + set_page_private(page, i); + page->index = pgoff_start; +- __SetPageBuddy(page); +- list_add((struct list_head *)&page->zone_device_data, +- &ns->free_area[i]); ++ ++ /* In order to update ns->free */ ++ __free_space(ns, pgoff_start, i); + + pgoff_start += 1L << i; + pages -= 1L << i; +@@ -491,6 +540,107 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + } + EXPORT_SYMBOL_GPL(bch_nvmpg_alloc_pages); + ++static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns) ++{ ++ return ns->base_addr + (ns->pages_total << PAGE_SHIFT); ++} ++ ++static inline bool in_nvmpg_ns_range(struct bch_nvmpg_ns *ns, ++ void *start_addr, void *end_addr) ++{ ++ return (start_addr >= ns->base_addr) && (end_addr < nvm_end_addr(ns)); ++} ++ ++static int remove_nvmpg_rec(struct bch_nvmpg_recs *recs, int ns_id, ++ unsigned long nvmpg_offset, int order) ++{ ++ struct bch_nvmpg_head *head; ++ struct bch_nvmpg_recs *prev_recs, *sys_recs; ++ struct bch_nvmpg_ns *ns; ++ unsigned long pgoff; ++ int i; ++ ++ ns = global_nvmpg_set->ns_tbl[0]; ++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset); ++ ++ head = bch_nvmpg_offset_to_ptr(recs->head_offset); ++ prev_recs = recs; ++ sys_recs = bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET); ++ while (recs) { ++ for (i = 0; i < recs->size; i++) { ++ struct bch_nvmpg_rec *rec = &(recs->recs[i]); ++ ++ if ((rec->pgoff == pgoff) && (rec->ns_id == ns_id)) { ++ WARN_ON(rec->order != order); ++ rec->_v = 0; ++ recs->used--; ++ ++ if (recs->used == 0) { ++ int recs_pos = recs - sys_recs; ++ ++ if (recs == prev_recs) ++ head->recs_offset[ns_id] = ++ recs->next_offset; ++ else ++ prev_recs->next_offset = ++ recs->next_offset; ++ ++ recs->next_offset = 0; ++ recs->head_offset = 0; ++ ++ bitmap_clear(ns->recs_bitmap, recs_pos, 1); ++ } ++ goto out; ++ } ++ } ++ prev_recs = recs; ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++out: ++ return (recs ? 
0 : -ENOENT); ++} ++ ++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, ++ const char *uuid) ++{ ++ struct bch_nvmpg_ns *ns; ++ struct bch_nvmpg_head *head; ++ struct bch_nvmpg_recs *recs; ++ int r; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ ++ ns = global_nvmpg_set->ns_tbl[BCH_NVMPG_GET_NS_ID(nvmpg_offset)]; ++ if (!ns) { ++ pr_err("can't find namespace by given kaddr from namespace\n"); ++ goto unlock; ++ } ++ ++ head = find_nvmpg_head(uuid, false); ++ if (!head) { ++ pr_err("can't found bch_nvmpg_head by uuid\n"); ++ goto unlock; ++ } ++ ++ recs = find_nvmpg_recs(ns, head, false); ++ if (!recs) { ++ pr_err("can't find bch_nvmpg_recs by uuid\n"); ++ goto unlock; ++ } ++ ++ r = remove_nvmpg_rec(recs, ns->sb->this_ns, nvmpg_offset, order); ++ if (r < 0) { ++ pr_err("can't find bch_nvmpg_rec\n"); ++ goto unlock; ++ } ++ ++ __free_space(ns, nvmpg_offset, order); ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++} ++EXPORT_SYMBOL_GPL(bch_nvmpg_free_pages); ++ + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_sb *sb = ns->sb; +@@ -687,6 +837,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + ns->pages_offset = sb->pages_offset; + ns->pages_total = sb->pages_total; + ns->sb = sb; ++ /* increase by __free_space() */ + ns->free = 0; + ns->bdev = bdev; + ns->set = global_nvmpg_set; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index d03f3241b45a..e089936e7f13 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -93,6 +93,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); + int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); ++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + + #else + +@@ -113,6 +114,8 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return 0; + } + ++static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { } ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch b/for-next/nvmpg-bcache-btree/draft/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch new file mode 100644 index 0000000..9195841 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch @@ -0,0 +1,67 @@ +From 10a097e1408174b0fe3f029c37d7d512662a4582 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 21:06:03 +0800 +Subject: [PATCH 06/13] bcache: get recs list head for allocated pages by + specific uuid + +This patch implements bch_get_nvmpg_head() of the buddy allocator +to be used to get recs list head for allocated pages by specific +uuid. Then the requester (owner) can find all previous allocated +nvdimm pages by iterating the recs list. 
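+
+A minimal sketch of such an iteration (walk_owner_recs() below is a
+hypothetical helper, not part of this series; it assumes the caller
+prevents concurrent allocation or free for this owner):
+
+  static void walk_owner_recs(const char *owner_uuid)
+  {
+          struct bch_nvmpg_head *head = bch_get_nvmpg_head(owner_uuid);
+          int i, j;
+
+          if (!head)
+                  return; /* this owner never allocated any pages */
+
+          for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
+                  struct bch_nvmpg_recs *recs =
+                          bch_nvmpg_offset_to_ptr(head->recs_offset[i]);
+
+                  while (recs) {
+                          for (j = 0; j < recs->size; j++) {
+                                  struct bch_nvmpg_rec *rec = &recs->recs[j];
+
+                                  if (rec->pgoff == 0)
+                                          continue; /* unused slot */
+                                  /* rec->ns_id, rec->pgoff and rec->order
+                                   * describe one persistent allocation */
+                          }
+                          recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
+                  }
+          }
+  }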
+ +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 6 ++++++ + drivers/md/bcache/nvmpg.h | 6 ++++++ + 2 files changed, 12 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 7b86f08c219a..e4642e591f23 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -540,6 +540,12 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + } + EXPORT_SYMBOL_GPL(bch_nvmpg_alloc_pages); + ++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) ++{ ++ return find_nvmpg_head(uuid, false); ++} ++EXPORT_SYMBOL_GPL(bch_get_nvmpg_head); ++ + static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns) + { + return ns->base_addr + (ns->pages_total << PAGE_SHIFT); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index e089936e7f13..2361cabf18be 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -94,6 +94,7 @@ int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); ++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); + + #else + +@@ -116,6 +117,11 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + + static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { } + ++static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) ++{ ++ return NULL; ++} ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch b/for-next/nvmpg-bcache-btree/draft/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch new file mode 100644 index 0000000..f240531 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch @@ -0,0 +1,48 @@ +From 1faf072bef28470d4d90e6ec5c42981b4b881ec0 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 25 Jun 2021 00:17:02 +0800 +Subject: [PATCH 07/13] bcache: use bucket index to set GC_MARK_METADATA for + journal buckets in bch_btree_gc_finish() + +Currently the meta data bucket locations on cache device are reserved +after the meta data stored on NVDIMM pages, to keep the meta data layout +consistent for now. So these buckets are still marked as meta data +by SET_GC_MARK() in bch_btree_gc_finish(). + +When BCH_FEATURE_INCOMPAT_NVDIMM_META is set, sb.d[] stores linear +addresses of NVDIMM pages and no longer bucket indexes. Therefore we +should avoid looking up bucket indexes from sb.d[], and directly use the +bucket indexes from ca->sb.first_bucket to (ca->sb.first_bucket + +ca->sb.njournal_buckets) for setting the gc mark of the journal buckets.
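+
+As a sketch, the change in bch_btree_gc_finish() amounts to the following
+(both loops are taken from the hunk in this patch; the old one is only
+valid while sb.d[] holds bucket indexes):
+
+  /* old: sb.d[k] was a bucket index, so this used to work */
+  for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
+          SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
+
+  /* new: mark the fixed journal bucket index range directly */
+  for (i = ca->sb.first_bucket;
+       i < ca->sb.first_bucket + ca->sb.njournal_buckets; i++)
+          SET_GC_MARK(ca->buckets + i, GC_MARK_METADATA);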
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/btree.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 93b67b8d31c3..f7f844c321c3 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -1761,8 +1761,10 @@ static void bch_btree_gc_finish(struct cache_set *c) + ca = c->cache; + ca->invalidate_needs_gc = 0; + +- for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) +- SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); ++ /* Range [first_bucket, first_bucket + keys) is for journal buckets */ ++ for (i = ca->sb.first_bucket; ++ i < ca->sb.first_bucket + ca->sb.njournal_buckets; i++) ++ SET_GC_MARK(ca->buckets + i, GC_MARK_METADATA); + + for (k = ca->prio_buckets; + k < ca->prio_buckets + prio_buckets(ca) * 2; k++) +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch b/for-next/nvmpg-bcache-btree/draft/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch new file mode 100644 index 0000000..794e12a --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch @@ -0,0 +1,60 @@ +From 497259154b1f79bfdaf967b21109521b301af534 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 25 Jun 2021 00:18:31 +0800 +Subject: [PATCH 08/13] bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into + incompat feature set + +This patch adds BCH_FEATURE_INCOMPAT_NVDIMM_META (value 0x0004) into the +incompat feature set. When this bit is set by bcache-tools, it indicates +bcache meta data should be stored on specific NVDIMM meta device. + +The bcache meta data mainly includes journal and btree nodes, when this +bit is set in incompat feature set, bcache will ask the nvm-pages +allocator for NVDIMM space to store the meta data. 
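+
+Later patches gate their NVDIMM code paths on this bit through the helper
+generated by BCH_FEATURE_INCOMPAT_FUNCS(); a rough usage sketch:
+
+  if (bch_has_feature_nvdimm_meta(&ca->sb)) {
+          /* meta data (journal, btree nodes) lives on NVDIMM pages
+           * obtained from the nvm-pages allocator */
+  } else {
+          /* legacy path: meta data stays on the cache block device */
+  }
+
+When the kernel is built without CONFIG_BCACHE_NVM_PAGES, the bit is not
+part of BCH_FEATURE_INCOMPAT_SUPP, so a cache device formatted with it is
+rejected in read_super() as carrying an unknown incompat feature.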
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/features.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h +index 09161b89c63e..fab92678be76 100644 +--- a/drivers/md/bcache/features.h ++++ b/drivers/md/bcache/features.h +@@ -18,11 +18,19 @@ + #define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET 0x0001 + /* real bucket size is (1 << bucket_size) */ + #define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE 0x0002 ++/* store bcache meta data on nvdimm */ ++#define BCH_FEATURE_INCOMPAT_NVDIMM_META 0x0004 + + #define BCH_FEATURE_COMPAT_SUPP 0 + #define BCH_FEATURE_RO_COMPAT_SUPP 0 ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++#define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ ++ BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE| \ ++ BCH_FEATURE_INCOMPAT_NVDIMM_META) ++#else + #define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ + BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE) ++#endif + + #define BCH_HAS_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_compat & (mask)) +@@ -90,6 +98,7 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \ + + BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET); + BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE); ++BCH_FEATURE_INCOMPAT_FUNCS(nvdimm_meta, NVDIMM_META); + + static inline bool bch_has_unknown_compat_features(struct cache_sb *sb) + { +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch b/for-next/nvmpg-bcache-btree/draft/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch new file mode 100644 index 0000000..c8020e4 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch @@ -0,0 +1,255 @@ +From a0220c3b0138d021975ef1d5e29e07217626ff9e Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 21 Oct 2021 21:39:18 +0800 +Subject: [PATCH 09/13] bcache: initialize bcache journal for NVDIMM meta + device + +The nvm-pages allocator may store and index the NVDIMM pages allocated +for bcache journal. This patch adds the initialization to store bcache +journal space on NVDIMM pages if BCH_FEATURE_INCOMPAT_NVDIMM_META bit is +set by bcache-tools. + +If BCH_FEATURE_INCOMPAT_NVDIMM_META is set, get_nvdimm_journal_space() +will return the nvmpg_offset of NVDIMM pages for bcache journal, +- If there is previously allocated space, find it from nvm-pages owner + list and return to bch_journal_init(). +- If there is no previously allocated space, require a new NVDIMM range + from the nvm-pages allocator, and return it to bch_journal_init(). + +And in bch_journal_init(), keys in sb.d[] store the corresponding nvmpg +offset from NVDIMM into sb.d[i].ptr[0] where 'i' is the bucket index to +iterate all journal buckets. + +Later when bcache journaling code stores the journaling jset, the target +NVDIMM nvmpg offset stored (and updated) in sb.d[i].ptr[0] can be used +to calculate the linear address in memory copy from DRAM pages into +NVDIMM pages. 
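+
+A compact sketch of the resulting bookkeeping (simplified from
+__bch_journal_nvdimm_init() and the write path in the next patch; jnl_base
+is the nvmpg offset returned by bch_nvmpg_alloc_pages() for the whole
+journal area, and error handling is omitted):
+
+  /* one contiguous NVDIMM allocation backs all journal buckets */
+  for (i = 0; i < ca->sb.keys; i++)
+          ca->sb.d[i] = jnl_base + bucket_bytes(ca) * i;
+
+  /* a jset is later stored by deriving the linear address from the
+   * nvmpg offset and doing a flushing memcpy instead of a bio */
+  memcpy_flushcache(bch_nvmpg_offset_to_ptr(ca->sb.d[ja->cur_idx]),
+                    w->data, sectors << 9);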
+ +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 113 ++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/journal.h | 2 +- + drivers/md/bcache/nvmpg.c | 9 +++ + drivers/md/bcache/nvmpg.h | 1 + + drivers/md/bcache/super.c | 18 +++--- + 5 files changed, 132 insertions(+), 11 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 61bd79babf7a..d887557c718e 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -9,6 +9,8 @@ + #include "btree.h" + #include "debug.h" + #include "extents.h" ++#include "nvmpg.h" ++#include "features.h" + + #include <trace/events/bcache.h> + +@@ -982,3 +984,114 @@ int bch_journal_alloc(struct cache_set *c) + + return 0; + } ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static unsigned long find_journal_nvmpg_base(struct bch_nvmpg_head *nvmpg_head, ++ struct cache *ca) ++{ ++ unsigned long jnl_offset, jnl_pgoff, jnl_ns_id; ++ unsigned long ret_offset = 0; ++ int i; ++ ++ jnl_offset = (unsigned long)ca->sb.d[0]; ++ jnl_ns_id = BCH_NVMPG_GET_NS_ID(jnl_offset); ++ jnl_pgoff = BCH_NVMPG_GET_OFFSET(jnl_offset) >> PAGE_SHIFT; ++ ++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { ++ struct bch_nvmpg_recs *recs; ++ struct bch_nvmpg_rec *rec; ++ unsigned long recs_offset = 0; ++ int j; ++ ++ recs_offset = nvmpg_head->recs_offset[i]; ++ recs = bch_nvmpg_offset_to_ptr(recs_offset); ++ while (recs) { ++ for (j = 0; j < recs->size; j++) { ++ rec = &recs->recs[j]; ++ if ((rec->pgoff != jnl_pgoff) || ++ (rec->ns_id != jnl_ns_id)) ++ continue; ++ ++ ret_offset = jnl_offset; ++ goto out; ++ } ++ recs_offset = recs->next_offset; ++ recs = bch_nvmpg_offset_to_ptr(recs_offset); ++ } ++ } ++ ++out: ++ return ret_offset; ++} ++ ++static unsigned long get_journal_nvmpg_space(struct cache *ca) ++{ ++ struct bch_nvmpg_head *head = NULL; ++ unsigned long nvmpg_offset; ++ int order; ++ ++ head = bch_get_nvmpg_head(ca->sb.set_uuid); ++ if (head) { ++ nvmpg_offset = find_journal_nvmpg_base(head, ca); ++ if (nvmpg_offset) ++ goto found; ++ } ++ ++ order = ilog2((ca->sb.bucket_size * ++ ca->sb.njournal_buckets) / PAGE_SECTORS); ++ nvmpg_offset = bch_nvmpg_alloc_pages(order, ca->sb.set_uuid); ++ if (nvmpg_offset) ++ memset(bch_nvmpg_offset_to_ptr(nvmpg_offset), ++ 0, (1 << order) * PAGE_SIZE); ++found: ++ return nvmpg_offset; ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++static int __bch_journal_nvdimm_init(struct cache *ca) ++{ ++ int ret = -1; ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ int i; ++ unsigned long jnl_base = 0; ++ ++ jnl_base = get_journal_nvmpg_space(ca); ++ if (!jnl_base) { ++ pr_err("Failed to get journal space from nvdimm\n"); ++ goto out; ++ } ++ ++ /* Iniialized and reloaded from on-disk super block already */ ++ if (ca->sb.d[0] != 0) ++ goto out; ++ ++ for (i = 0; i < ca->sb.keys; i++) ++ ca->sb.d[i] = jnl_base + (bucket_bytes(ca) * i); ++ ++ ret = 0; ++out: ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++ return ret; ++} ++ ++ ++int bch_journal_init(struct cache_set *c) ++{ ++ int i, ret = 0; ++ struct cache *ca = c->cache; ++ ++ ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, ++ 2, SB_JOURNAL_BUCKETS); ++ ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) { ++ for (i = 0; i < ca->sb.keys; i++) ++ ca->sb.d[i] = ca->sb.first_bucket + i; ++ } else ++ ret = 
__bch_journal_nvdimm_init(ca); ++ ++ return ret; ++} +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index f2ea34d5f431..e3a7fa5a8fda 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -179,7 +179,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list); + void bch_journal_meta(struct cache_set *c, struct closure *cl); + int bch_journal_read(struct cache_set *c, struct list_head *list); + int bch_journal_replay(struct cache_set *c, struct list_head *list); +- ++int bch_journal_init(struct cache_set *c); + void bch_journal_free(struct cache_set *c); + int bch_journal_alloc(struct cache_set *c); + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index e4642e591f23..142ad41e9c15 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -24,6 +24,15 @@ + + struct bch_nvmpg_set *global_nvmpg_set; + ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id) ++{ ++ if ((ns_id >= 0) && (ns_id < BCH_NVMPG_NS_MAX)) ++ return global_nvmpg_set->ns_tbl[ns_id]; ++ ++ pr_emerg("Invalid ns_id: %d\n", ns_id); ++ return NULL; ++} ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset) + { + int ns_id = BCH_NVMPG_GET_NS_ID(offset); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 2361cabf18be..f7b7177cced3 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -95,6 +95,7 @@ void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id); + + #else + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 841d08e50191..990d5d6fe199 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -147,9 +147,11 @@ static const char *read_super_common(struct cache_sb *sb, struct block_device * + goto err; + + err = "Journal buckets not sequential"; +- for (i = 0; i < sb->keys; i++) +- if (sb->d[i] != sb->first_bucket + i) +- goto err; ++ if (!bch_has_feature_nvdimm_meta(sb)) { ++ for (i = 0; i < sb->keys; i++) ++ if (sb->d[i] != sb->first_bucket + i) ++ goto err; ++ } + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) +@@ -2061,14 +2063,10 @@ static int run_cache_set(struct cache_set *c) + if (bch_journal_replay(c, &journal)) + goto err; + } else { +- unsigned int j; +- + pr_notice("invalidating existing data\n"); +- ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, +- 2, SB_JOURNAL_BUCKETS); +- +- for (j = 0; j < ca->sb.keys; j++) +- ca->sb.d[j] = ca->sb.first_bucket + j; ++ err = "error initializing journal"; ++ if (bch_journal_init(c)) ++ goto err; + + bch_initial_gc_finish(c); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch b/for-next/nvmpg-bcache-btree/draft/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch new file mode 100644 index 0000000..6e105c6 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch @@ -0,0 +1,231 @@ +From a86e90383059c6d2a6972931127180b1fa174fbb Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:45:23 +0800 +Subject: [PATCH 10/13] bcache: support storing bcache journal into NVDIMM meta + device + +This patch implements two methods to store bcache journal 
to, +1) __journal_write_unlocked() for block interface device + The latency method to compose bio and issue the jset bio to cache + device (e.g. SSD). c->journal.key.ptr[0] indicates the LBA on cache + device to store the journal jset. +2) __journal_nvdimm_write_unlocked() for memory interface NVDIMM + Use memory interface to access NVDIMM pages and store the jset by + memcpy_flushcache(). c->journal.key.ptr[0] indicates the linear + address from the NVDIMM pages to store the journal jset. + +For legacy configuration without NVDIMM meta device, journal I/O is +handled by __journal_write_unlocked() with existing code logic. If the +NVDIMM meta device is used (by bcache-tools), the journal I/O will +be handled by __journal_nvdimm_write_unlocked() and go into the NVDIMM +pages. + +And when NVDIMM meta device is used, sb.d[] stores the linear addresses +from NVDIMM pages (no more bucket index), in journal_reclaim() the +journaling location in c->journal.key.ptr[0] should also be updated by +linear address from NVDIMM pages (no more LBA combined by sectors offset +and bucket index). + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 120 +++++++++++++++++++++++++----------- + drivers/md/bcache/super.c | 3 +- + 2 files changed, 85 insertions(+), 38 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index d887557c718e..7d5c5ed18890 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -596,6 +596,8 @@ static void do_journal_discard(struct cache *ca) + return; + } + ++ BUG_ON(bch_has_feature_nvdimm_meta(&ca->sb)); ++ + switch (atomic_read(&ja->discard_in_flight)) { + case DISCARD_IN_FLIGHT: + return; +@@ -661,9 +663,16 @@ static void journal_reclaim(struct cache_set *c) + goto out; + + ja->cur_idx = next; +- k->ptr[0] = MAKE_PTR(0, +- bucket_to_sector(c, ca->sb.d[ja->cur_idx]), +- ca->sb.nr_this_dev); ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ k->ptr[0] = MAKE_PTR(0, ++ bucket_to_sector(c, ca->sb.d[ja->cur_idx]), ++ ca->sb.nr_this_dev); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ k->ptr[0] = (unsigned long)bch_nvmpg_offset_to_ptr( ++ ca->sb.d[ja->cur_idx]); ++#endif ++ + atomic_long_inc(&c->reclaimed_journal_buckets); + + bkey_init(k); +@@ -729,46 +738,21 @@ static void journal_write_unlock(struct closure *cl) + spin_unlock(&c->journal.lock); + } + +-static void journal_write_unlocked(struct closure *cl) ++ ++static void __journal_write_unlocked(struct cache_set *c) + __releases(c->journal.lock) + { +- struct cache_set *c = container_of(cl, struct cache_set, journal.io); +- struct cache *ca = c->cache; +- struct journal_write *w = c->journal.cur; + struct bkey *k = &c->journal.key; +- unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * +- ca->sb.block_size; +- ++ struct journal_write *w = c->journal.cur; ++ struct closure *cl = &c->journal.io; ++ struct cache *ca = c->cache; + struct bio *bio; + struct bio_list list; ++ unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * ++ ca->sb.block_size; + + bio_list_init(&list); + +- if (!w->need_write) { +- closure_return_with_destructor(cl, journal_write_unlock); +- return; +- } else if (journal_full(&c->journal)) { +- journal_reclaim(c); +- spin_unlock(&c->journal.lock); +- +- 
btree_flush_write(c); +- continue_at(cl, journal_write, bch_journal_wq); +- return; +- } +- +- c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); +- +- w->data->btree_level = c->root->level; +- +- bkey_copy(&w->data->btree_root, &c->root->key); +- bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); +- +- w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; +- w->data->magic = jset_magic(&ca->sb); +- w->data->version = BCACHE_JSET_VERSION; +- w->data->last_seq = last_seq(&c->journal); +- w->data->csum = csum_set(w->data); +- + for (i = 0; i < KEY_PTRS(k); i++) { + ca = c->cache; + bio = &ca->journal.bio; +@@ -793,7 +777,6 @@ static void journal_write_unlocked(struct closure *cl) + + ca->journal.seq[ca->journal.cur_idx] = w->data->seq; + } +- + /* If KEY_PTRS(k) == 0, this jset gets lost in air */ + BUG_ON(i == 0); + +@@ -805,6 +788,71 @@ static void journal_write_unlocked(struct closure *cl) + + while ((bio = bio_list_pop(&list))) + closure_bio_submit(c, bio, cl); ++} ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static void __journal_nvdimm_write_unlocked(struct cache_set *c) ++ __releases(c->journal.lock) ++{ ++ struct journal_write *w = c->journal.cur; ++ struct cache *ca = c->cache; ++ unsigned int sectors; ++ ++ sectors = set_blocks(w->data, block_bytes(ca)) * ca->sb.block_size; ++ atomic_long_add(sectors, &ca->meta_sectors_written); ++ ++ memcpy_flushcache((void *)c->journal.key.ptr[0], w->data, sectors << 9); ++ ++ c->journal.key.ptr[0] += sectors << 9; ++ ca->journal.seq[ca->journal.cur_idx] = w->data->seq; ++ ++ atomic_dec_bug(&fifo_back(&c->journal.pin)); ++ bch_journal_next(&c->journal); ++ journal_reclaim(c); ++ ++ spin_unlock(&c->journal.lock); ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++static void journal_write_unlocked(struct closure *cl) ++{ ++ struct cache_set *c = container_of(cl, struct cache_set, journal.io); ++ struct cache *ca = c->cache; ++ struct journal_write *w = c->journal.cur; ++ ++ if (!w->need_write) { ++ closure_return_with_destructor(cl, journal_write_unlock); ++ return; ++ } else if (journal_full(&c->journal)) { ++ journal_reclaim(c); ++ spin_unlock(&c->journal.lock); ++ ++ btree_flush_write(c); ++ continue_at(cl, journal_write, bch_journal_wq); ++ return; ++ } ++ ++ c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); ++ ++ w->data->btree_level = c->root->level; ++ ++ bkey_copy(&w->data->btree_root, &c->root->key); ++ bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); ++ ++ w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; ++ w->data->magic = jset_magic(&ca->sb); ++ w->data->version = BCACHE_JSET_VERSION; ++ w->data->last_seq = last_seq(&c->journal); ++ w->data->csum = csum_set(w->data); ++ ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ __journal_write_unlocked(c); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ __journal_nvdimm_write_unlocked(c); ++#endif + + continue_at(cl, journal_write_done, NULL); + } +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 990d5d6fe199..42fd99406c60 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1672,7 +1672,7 @@ void bch_cache_set_release(struct kobject *kobj) + static void cache_set_free(struct closure *cl) + { + struct cache_set *c = container_of(cl, struct cache_set, cl); +- struct cache *ca; ++ struct cache *ca = c->cache; + + debugfs_remove(c->debug); + +@@ -1684,7 +1684,6 @@ static void cache_set_free(struct closure *cl) + bch_bset_sort_state_free(&c->sort); + free_pages((unsigned long) c->uuids, 
ilog2(meta_bucket_pages(&c->cache->sb))); + +- ca = c->cache; + if (ca) { + ca->set = NULL; + c->cache = NULL; +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch b/for-next/nvmpg-bcache-btree/draft/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch new file mode 100644 index 0000000..49ed5be --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch @@ -0,0 +1,181 @@ +From 29b95828f4804806bac44a795cba09ddc0cc0da0 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:54:12 +0800 +Subject: [PATCH 11/13] bcache: read jset from NVDIMM pages for journal replay + +This patch implements two methods to read jset from media for journal +replay, +- __jnl_rd_bkt() for block device + This is the legacy method to read jset via block device interface. +- __jnl_rd_nvm_bkt() for NVDIMM + This is the method to read jset from NVDIMM memory interface, a.k.a + memcopy() from NVDIMM pages to DRAM pages. + +If BCH_FEATURE_INCOMPAT_NVDIMM_META is set in incompat feature set, +during running cache set, journal_read_bucket() will read the journal +content from NVDIMM by __jnl_rd_nvm_bkt(). The linear addresses of +NVDIMM pages to read jset are stored in sb.d[SB_JOURNAL_BUCKETS], which +were initialized and maintained in previous runs of the cache set. + +A thing should be noticed is, when bch_journal_read() is called, the +linear address of NVDIMM pages is not loaded and initialized yet, it +is necessary to call __bch_journal_nvdimm_init() before reading the jset +from NVDIMM pages. + +The code comments added in journal_read_bucket() is noticed by kernel +test robot and Dan Carpenter, it explains why it is safe to only check +!bch_has_feature_nvdimm_meta() condition in the if() statement when +CONFIG_BCACHE_NVM_PAGES is not configured. To avoid confusion from the +bogus warning message from static checking tool. 
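+
+As a sketch, the per-bucket read in journal_read_bucket() now picks the
+method by the feature bit (simplified from the hunk below):
+
+  if (!bch_has_feature_nvdimm_meta(&ca->sb))
+          /* legacy: read the jset with a bio from the cache device */
+          j = __jnl_rd_bkt(ca, bucket_index, len, offset, &cl);
+  else
+          /* NVDIMM: sb.d[bucket_index] is an nvmpg offset; memcpy()
+           * the jset out of the mapped pages into journal.w[0].data */
+          j = __jnl_rd_nvm_bkt(ca, bucket_index, len, offset);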
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reported-by: kernel test robot <lkp@intel.com> +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 88 ++++++++++++++++++++++++++++++------- + 1 file changed, 71 insertions(+), 17 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 7d5c5ed18890..902992be9191 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -34,18 +34,60 @@ static void journal_read_endio(struct bio *bio) + closure_put(cl); + } + ++static struct jset *__jnl_rd_bkt(struct cache *ca, unsigned int bkt_idx, ++ unsigned int len, unsigned int offset, ++ struct closure *cl) ++{ ++ sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bkt_idx]); ++ struct bio *bio = &ca->journal.bio; ++ struct jset *data = ca->set->journal.w[0].data; ++ ++ bio_reset(bio); ++ bio->bi_iter.bi_sector = bucket + offset; ++ bio_set_dev(bio, ca->bdev); ++ bio->bi_iter.bi_size = len << 9; ++ ++ bio->bi_end_io = journal_read_endio; ++ bio->bi_private = cl; ++ bio_set_op_attrs(bio, REQ_OP_READ, 0); ++ bch_bio_map(bio, data); ++ ++ closure_bio_submit(ca->set, bio, cl); ++ closure_sync(cl); ++ ++ /* Indeed journal.w[0].data */ ++ return data; ++} ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static struct jset *__jnl_rd_nvm_bkt(struct cache *ca, unsigned int bkt_idx, ++ unsigned int len, unsigned int offset) ++{ ++ void *jset_addr; ++ struct jset *data; ++ ++ jset_addr = bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9); ++ data = ca->set->journal.w[0].data; ++ ++ memcpy(data, jset_addr, len << 9); ++ ++ /* Indeed journal.w[0].data */ ++ return data; ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ + static int journal_read_bucket(struct cache *ca, struct list_head *list, + unsigned int bucket_index) + { + struct journal_device *ja = &ca->journal; +- struct bio *bio = &ja->bio; + + struct journal_replay *i; +- struct jset *j, *data = ca->set->journal.w[0].data; ++ struct jset *j; + struct closure cl; + unsigned int len, left, offset = 0; + int ret = 0; +- sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); + + closure_init_stack(&cl); + +@@ -55,26 +97,27 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, + reread: left = ca->sb.bucket_size - offset; + len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); + +- bio_reset(bio); +- bio->bi_iter.bi_sector = bucket + offset; +- bio_set_dev(bio, ca->bdev); +- bio->bi_iter.bi_size = len << 9; +- +- bio->bi_end_io = journal_read_endio; +- bio->bi_private = &cl; +- bio_set_op_attrs(bio, REQ_OP_READ, 0); +- bch_bio_map(bio, data); +- +- closure_bio_submit(ca->set, bio, &cl); +- closure_sync(&cl); ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ j = __jnl_rd_bkt(ca, bucket_index, len, offset, &cl); ++ /* ++ * If CONFIG_BCACHE_NVM_PAGES is not defined, the feature bit ++ * BCH_FEATURE_INCOMPAT_NVDIMM_META won't in incompatible ++ * support feature set, a cache device format with feature bit ++ * BCH_FEATURE_INCOMPAT_NVDIMM_META will fail much earlier in ++ * read_super() by bch_has_unknown_incompat_features(). ++ * Therefore when CONFIG_BCACHE_NVM_PAGES is not define, it is ++ * safe to ignore the bch_has_feature_nvdimm_meta() condition. 
++ */ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ j = __jnl_rd_nvm_bkt(ca, bucket_index, len, offset); ++#endif + + /* This function could be simpler now since we no longer write + * journal entries that overlap bucket boundaries; this means + * the start of a bucket will always have a valid journal entry + * if it has any journal entries at all. + */ +- +- j = data; + while (len) { + struct list_head *where; + size_t blocks, bytes = set_bytes(j); +@@ -170,6 +213,8 @@ reread: left = ca->sb.bucket_size - offset; + return ret; + } + ++static int __bch_journal_nvdimm_init(struct cache *ca); ++ + int bch_journal_read(struct cache_set *c, struct list_head *list) + { + #define read_bucket(b) \ +@@ -188,6 +233,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) + unsigned int i, l, r, m; + uint64_t seq; + ++ /* ++ * Linear addresses of NVDIMM pages for journaling is not ++ * initialized yet, do it before read jset from NVDIMM pages. ++ */ ++ if (bch_has_feature_nvdimm_meta(&ca->sb)) { ++ if (__bch_journal_nvdimm_init(ca) < 0) ++ return -ENXIO; ++ } ++ + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch b/for-next/nvmpg-bcache-btree/draft/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch new file mode 100644 index 0000000..e35c696 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch @@ -0,0 +1,84 @@ +From 286f425617ba71c2ff30930d010e0808dc41d953 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:55:25 +0800 +Subject: [PATCH 12/13] bcache: add sysfs interface register_nvdimm_meta to + register NVDIMM meta device + +This patch adds a sysfs interface register_nvdimm_meta to register +NVDIMM meta device. The sysfs interface file only shows up when +CONFIG_BCACHE_NVM_PAGES=y. 
Then a NVDIMM name space formatted by +bcache-tools can be registered into bcache by e.g., + echo /dev/pmem0 > /sys/fs/bcache/register_nvdimm_meta + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/super.c | 29 +++++++++++++++++++++++++++++ + 1 file changed, 29 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 42fd99406c60..723791250070 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2398,10 +2398,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, size_t size); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++static ssize_t register_nvdimm_meta(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, size_t size); ++#endif + + kobj_attribute_write(register, register_bcache); + kobj_attribute_write(register_quiet, register_bcache); + kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++kobj_attribute_write(register_nvdimm_meta, register_nvdimm_meta); ++#endif + + static bool bch_is_open_backing(dev_t dev) + { +@@ -2515,6 +2523,24 @@ static void register_device_async(struct async_reg_args *args) + queue_delayed_work(system_wq, &args->reg_work, 10); + } + ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++static ssize_t register_nvdimm_meta(struct kobject *k, struct kobj_attribute *attr, ++ const char *buffer, size_t size) ++{ ++ ssize_t ret = size; ++ ++ struct bch_nvmpg_ns *ns = bch_register_namespace(buffer); ++ ++ if (IS_ERR(ns)) { ++ pr_err("register nvdimm namespace %s for meta device failed.\n", ++ buffer); ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++#endif ++ + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size) + { +@@ -2857,6 +2883,9 @@ static int __init bcache_init(void) + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ &ksysfs_register_nvdimm_meta.attr, ++#endif + &ksysfs_pendings_cleanup.attr, + NULL + }; +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-btree/draft/0013-store-btree-node-in-nvdimm.patch b/for-next/nvmpg-bcache-btree/draft/0013-store-btree-node-in-nvdimm.patch new file mode 100644 index 0000000..18fdf37 --- /dev/null +++ b/for-next/nvmpg-bcache-btree/draft/0013-store-btree-node-in-nvdimm.patch @@ -0,0 +1,489 @@ +From b0344cea65a7c816dbad1d4684a96dca929d8344 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 21 Oct 2021 22:54:20 +0800 +Subject: [PATCH 13/13] store btree node in nvdimm + +--- + drivers/md/bcache/alloc.c | 67 +++++++++++++++--- + drivers/md/bcache/bcache.h | 3 +- + drivers/md/bcache/bcache_ondisk.h | 2 +- + drivers/md/bcache/btree.c | 114 ++++++++++++++++++++++++++++-- + drivers/md/bcache/nvmpg.c | 50 +++++++++++++ + drivers/md/bcache/nvmpg.h | 52 ++++++++++++++ + drivers/md/bcache/super.c | 3 +- + 7 files changed, 273 insertions(+), 18 deletions(-) + +diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c +index 097577ae3c47..9bdd6ee9e886 100644 +--- a/drivers/md/bcache/alloc.c ++++ b/drivers/md/bcache/alloc.c +@@ -63,6 +63,7 @@ + + #include 
"bcache.h" + #include "btree.h" ++#include "nvmpg.h" + + #include <linux/blkdev.h> + #include <linux/kthread.h> +@@ -477,12 +478,28 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b) + } + } + ++void __bch_nvmpg_bucket_free(struct cache_set *c, struct bkey *k) ++{ ++ int order; ++ unsigned long nvmpg_offset; ++ ++ order = ilog2(c->cache->sb.bucket_size / PAGE_SECTORS); ++ nvmpg_offset = bkey_offset_to_nvmpg_offset(PTR_OFFSET(k, 0)); ++ bch_nvmpg_free_pages(nvmpg_offset, order, c->set_uuid); ++} ++ + void bch_bucket_free(struct cache_set *c, struct bkey *k) + { + unsigned int i; + ++ if (KEY_NVMPG(k)) { ++ __bch_nvmpg_bucket_free(c, k); ++ return; ++ } ++ + for (i = 0; i < KEY_PTRS(k); i++) + __bch_bucket_free(c->cache, PTR_BUCKET(c, k, i)); ++ return; + } + + int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, +@@ -517,15 +534,31 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + return -1; + } + +-int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, +- struct bkey *k, bool wait) ++int __bch_nvmpg_bucket_alloc(struct cache_set *c, struct bkey *k) + { +- int ret; ++ struct cache *ca; ++ unsigned long nvmpg_offset, bkey_offset; ++ int order; + +- mutex_lock(&c->bucket_lock); +- ret = __bch_bucket_alloc_set(c, reserve, k, wait); +- mutex_unlock(&c->bucket_lock); +- return ret; ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) ++ return -1; ++ ++ lockdep_assert_held(&c->bucket_lock); ++ ++ order = ilog2(ca->sb.bucket_size / PAGE_SECTORS); ++ nvmpg_offset = bch_nvmpg_alloc_pages(order, c->set_uuid); ++ if (!nvmpg_offset) ++ goto err; ++ ++ bkey_offset = nvmpg_offset_to_bkey_offset(nvmpg_offset); ++ ++ bkey_init(k); ++ k->ptr[0] = MAKE_PTR(0, bkey_offset, ca->sb.nr_this_dev); ++ ++ SET_KEY_PTRS(k, 1); ++ return 0; ++err: ++ return -1; + } + + /* Sector allocator */ +@@ -537,6 +570,23 @@ struct open_bucket { + BKEY_PADDED(key); + }; + ++int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, ++ struct bkey *k, bool wait, int bucket_type) ++{ ++ int ret; ++ ++ if (bucket_type == BCH_DATA_BUCKET) { ++ mutex_lock(&c->bucket_lock); ++ ret = __bch_bucket_alloc_set(c, reserve, k, wait); ++ mutex_unlock(&c->bucket_lock); ++ } else { ++ ret = __bch_nvmpg_bucket_alloc(c, k); ++ } ++ ++ return ret; ++} ++ ++ + /* + * We keep multiple buckets open for writes, and try to segregate different + * write streams for better cache utilization: first we try to segregate flash +@@ -631,7 +681,8 @@ bool bch_alloc_sectors(struct cache_set *c, + + spin_unlock(&c->data_bucket_lock); + +- if (bch_bucket_alloc_set(c, watermark, &alloc.key, wait)) ++ if (bch_bucket_alloc_set(c, watermark, &alloc.key, ++ wait, BCH_DATA_BUCKET)) + return false; + + spin_lock(&c->data_bucket_lock); +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 9ed9c955add7..d54c3c3d8d7e 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -979,11 +979,12 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait); + int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + struct bkey *k, bool wait); + int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, +- struct bkey *k, bool wait); ++ struct bkey *k, bool wait, int bucket_type); + bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, + unsigned int sectors, unsigned int write_point, + unsigned int write_prio, bool wait); + bool bch_cached_dev_error(struct cached_dev *dc); ++int __bch_nvmpg_bucket_alloc(struct 
cache_set *c, struct bkey *k); + + __printf(2, 3) + bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); +diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h +index 97413586195b..6c890f632197 100644 +--- a/drivers/md/bcache/bcache_ondisk.h ++++ b/drivers/md/bcache/bcache_ondisk.h +@@ -45,7 +45,7 @@ static inline void SET_##name(struct bkey *k, unsigned int i, __u64 v) \ + KEY_FIELD(KEY_PTRS, high, 60, 3) + KEY_FIELD(__PAD0, high, 58, 2) + KEY_FIELD(KEY_CSUM, high, 56, 2) +-KEY_FIELD(__PAD1, high, 55, 1) ++KEY_FIELD(KEY_NVMPG, high, 55, 1) + KEY_FIELD(KEY_DIRTY, high, 36, 1) + + KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS) +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index f7f844c321c3..b8854905b93e 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -25,6 +25,8 @@ + #include "btree.h" + #include "debug.h" + #include "extents.h" ++#include "features.h" ++#include "nvmpg.h" + + #include <linux/slab.h> + #include <linux/bitops.h> +@@ -240,14 +242,12 @@ static void btree_node_read_endio(struct bio *bio) + closure_put(cl); + } + +-static void bch_btree_node_read(struct btree *b) ++static void __bch_btree_node_read(struct btree *b) + { + uint64_t start_time = local_clock(); + struct closure cl; + struct bio *bio; + +- trace_bcache_btree_read(b); +- + closure_init_stack(&cl); + + bio = bch_bbio_alloc(b->c); +@@ -278,6 +278,35 @@ static void bch_btree_node_read(struct btree *b) + PTR_BUCKET_NR(b->c, &b->key, 0)); + } + ++static void __bch_nvmpg_btree_node_read(struct btree *b) ++{ ++ uint64_t start_time = local_clock(); ++ void *ptr; ++ ++ ptr = bkey_offset_to_nvmpg_ptr(PTR_OFFSET(&b->key, 0)); ++ memcpy(b->keys.set[0].data, ptr, KEY_SIZE(&b->key) << 9); ++ ++ if (btree_node_io_error(b)) ++ goto err; ++ ++ bch_btree_node_read_done(b); ++ bch_time_stats_update(&b->c->btree_read_time, start_time); ++ ++err: ++ bch_cache_set_error(b->c, ++ "io error reading NVDIMM pages at 0x%p\n", ptr); ++} ++ ++static void bch_btree_node_read(struct btree *b) ++{ ++ trace_bcache_btree_read(b); ++ ++ if (!KEY_NVMPG(&b->key)) ++ __bch_btree_node_read(b); ++ else ++ __bch_nvmpg_btree_node_read(b); ++} ++ + static void btree_complete_write(struct btree *b, struct btree_write *w) + { + if (w->prio_blocked && +@@ -335,7 +364,7 @@ static void btree_node_write_endio(struct bio *bio) + closure_put(cl); + } + +-static void do_btree_node_write(struct btree *b) ++static void __do_btree_node_write(struct btree *b) + { + struct closure *cl = &b->io; + struct bset *i = btree_bset_last(b); +@@ -400,6 +429,68 @@ static void do_btree_node_write(struct btree *b) + } + } + ++static void btree_nvmpg_complete_write(struct btree *b, struct btree_write *w) ++{ ++ atomic_sub(w->prio_blocked, &b->c->prio_blocked); ++ ++ if (w->journal) { ++ atomic_dec_bug(w->journal); ++ __closure_wake_up(&b->c->journal.wait); ++ } ++ ++ w->prio_blocked = 0; ++ w->journal = NULL; ++} ++ ++static void btree_nvmpg_node_write_done(struct closure *cl) ++{ ++ struct btree *b = container_of(cl, struct btree, io); ++ struct btree_write *w = btree_prev_write(b); ++ ++ btree_nvmpg_complete_write(b, w); ++ ++ if (btree_node_dirty(b)) ++ queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); ++ ++ closure_return_with_destructor(cl, btree_node_write_unlock); ++} ++ ++static void __do_nvmpg_btree_node_write(struct btree *b) ++{ ++ struct closure *cl = &b->io; ++ struct bset *i = btree_bset_last(b); ++ unsigned long nvmpg_offset; ++ void *nvmpg_ptr; ++ ++ i->version = 
BCACHE_BSET_VERSION; ++ i->csum = btree_csum_set(b, i); ++ ++ BUG_ON(b->bio); ++ ++ /* Calculate location to write */ ++ nvmpg_offset = bkey_offset_to_nvmpg_offset(PTR_OFFSET(&b->key, 0)); ++ nvmpg_offset += roundup(set_bytes(i), block_bytes(b->c->cache)); ++ nvmpg_ptr = bch_nvmpg_offset_to_ptr(nvmpg_offset); ++ ++ memcpy_flushcache(nvmpg_ptr, i, ++ roundup(set_bytes(i), block_bytes(b->c->cache)) << 9); ++ ++ /* Update b->key to the wriitten location */ ++ SET_PTR_OFFSET(&b->key, 0, ++ nvmpg_offset_to_bkey_offset(nvmpg_offset)); ++ ++ closure_sync(cl); ++ continue_at_nobarrier(cl, btree_nvmpg_node_write_done, NULL); ++} ++ ++static void do_btree_node_write(struct btree *b) ++{ ++ if (!KEY_NVMPG(&b->key)) ++ __do_btree_node_write(b); ++ else ++ __do_nvmpg_btree_node_write(b); ++} ++ + void __bch_btree_node_write(struct btree *b, struct closure *parent) + { + struct bset *i = btree_bset_last(b); +@@ -1094,10 +1185,19 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + + mutex_lock(&c->bucket_lock); + retry: +- if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) ++ /* ++ * If nvdimm_meta feature is enabled, try to allocate btree ++ * node from NVDIMM pages and set KEY_NVMPG bit successfully. ++ */ ++ if (bch_has_feature_nvdimm_meta(&(c->cache->sb))) ++ __bch_nvmpg_bucket_alloc(c, &k.key); ++ ++ if (!KEY_NVMPG(&k.key) && ++ __bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) + goto err; + +- bkey_put(c, &k.key); ++ if (!KEY_NVMPG(&k.key)) ++ bkey_put(c, &k.key); + SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); + + b = mca_alloc(c, op, &k.key, level); +@@ -1118,7 +1218,7 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + trace_bcache_btree_node_alloc(b); + return b; + err_free: +- bch_bucket_free(c, &k.key); ++ bch_bucket_free(c, &k.key); + err: + mutex_unlock(&c->bucket_lock); + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 142ad41e9c15..12d67e535854 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -91,6 +91,56 @@ static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) + return bch_nvmpg_pgoff_to_ptr(ns, pgoff); + } + ++static void bug_on_bkey_offset_limit(unsigned long sector) ++{ ++ if (sector >= ((1UL << BCH_BKEY_OFFSET_BITS) - 1)) { ++ pr_err("Invalid NVDIMM offset: too large as 0x%lx\n", ++ sector); ++ pr_err("Such condition should never happen. 
Panic.\n"); ++ BUG(); ++ } ++} ++ ++int bkey_offset_to_nvmpg_ns_id(unsigned long bkey_offset) ++{ ++ return (bkey_offset >> BCH_BKEY_OFFSET_BITS) & ++ BCH_BKEY_OFFSET_NS_ID_MASK; ++} ++ ++unsigned long bkey_offset_to_nvmpg_offset(unsigned long bkey_offset) ++{ ++ int ns_id; ++ unsigned long offset; ++ ++ ns_id = (bkey_offset >> BCH_BKEY_OFFSET_BITS) & ++ BCH_BKEY_OFFSET_NS_ID_MASK; ++ ++ offset = (bkey_offset & BCH_BKEY_OFFSET_MASK) << 9; ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ ++unsigned long nvmpg_offset_to_bkey_offset(unsigned long nvmpg_offset) ++{ ++ int ns_id; ++ unsigned long sector; ++ ++ ns_id = BCH_NVMPG_GET_NS_ID(nvmpg_offset); ++ sector = BCH_NVMPG_GET_OFFSET(nvmpg_offset) >> 9; ++ bug_on_bkey_offset_limit(sector); ++ ++ return ((sector & BCH_BKEY_OFFSET_MASK) | ++ ((ns_id & BCH_BKEY_OFFSET_NS_ID_MASK) << BCH_BKEY_OFFSET_BITS)); ++} ++ ++void *bkey_offset_to_nvmpg_ptr(unsigned long bkey_offset) ++{ ++ unsigned long nvmpg_offset; ++ ++ nvmpg_offset = bkey_offset_to_nvmpg_offset(bkey_offset); ++ return bch_nvmpg_offset_to_ptr(nvmpg_offset); ++} ++ + static inline void reserve_nvmpg_pages(struct bch_nvmpg_ns *ns, + pgoff_t pgoff, u64 nr) + { +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index f7b7177cced3..7f6d8e6f9dff 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -84,6 +84,21 @@ struct bch_nvmpg_set { + (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ + sizeof(struct bch_nvmpg_recs))) + ++ ++/* For bkey PTR_OFFSET to nvmpg namespace ID and offset convertion. ++ * ++ * PTR_OFFSET is 43 bits, the most significant 3 bits are for ++ * namespace ID. Rested 40 bits are for per-namespace offset ++ * in sectors. ++ */ ++#define BCH_BKEY_OFFSET_NS_ID_BITS 3 ++#define BCH_BKEY_OFFSET_NS_ID_MASK ((1UL<<BCH_BKEY_OFFSET_NS_ID_BITS) - 1) ++#define BCH_BKEY_OFFSET_BITS 40 ++#define BCH_BKEY_OFFSET_MASK ((1UL<<BCH_BKEY_OFFSET_BITS) - 1) ++ ++#define BCH_DATA_BUCKET 0 ++#define BCH_META_BUCKET 1 ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset); + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + +@@ -96,6 +111,12 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); + struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id); ++void *bkey_offset_to_nvmpg_ptr(unsigned long bkey_offset); ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id); ++unsigned long nvmpg_offset_to_bkey_offset(unsigned long nvmpg_offset); ++unsigned long bkey_offset_to_nvmpg_offset(unsigned long bkey_offset); ++void *bch_nvmpg_offset_to_ptr(unsigned long offset); ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + + #else + +@@ -123,6 +144,37 @@ static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) + return NULL; + } + ++static inline void *bkey_offset_to_nvmpg_ptr(unsigned long bkey_offset) ++{ ++ return NULL; ++} ++ ++static inline struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id) ++{ ++ return NULL; ++} ++ ++static inline unsigned long nvmpg_offset_to_bkey_offset(unsigned long nvmpg_offset) ++{ ++ return 0; ++} ++ ++static inline unsigned long bkey_offset_to_nvmpg_offset(unsigned long bkey_offset) ++{ ++ return 0; ++} ++ ++static inline void *bch_nvmpg_offset_to_ptr(unsigned long offset) ++{ ++ return NULL; ++} ++ ++static inline unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) ++{ ++ return 0; 
++} ++ ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 723791250070..64b517e8136a 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -512,7 +512,8 @@ static int __uuid_write(struct cache_set *c) + closure_init_stack(&cl); + lockdep_assert_held(&bch_register_lock); + +- if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true)) ++ if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, ++ true, BCH_META_BUCKET)) + return 1; + + size = meta_bucket_pages(&ca->sb) * PAGE_SECTORS; +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0001-bcache-add-initial-data-structures-for-nvm-pages.patch b/for-next/nvmpg-bcache-journaling-v13/old/0001-bcache-add-initial-data-structures-for-nvm-pages.patch new file mode 100644 index 0000000..a3700f6 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0001-bcache-add-initial-data-structures-for-nvm-pages.patch @@ -0,0 +1,343 @@ +From bbb3b719dfc6070a5807bf6494f858e9e2f4f609 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 26 Jul 2021 00:26:28 +0800 +Subject: [PATCH 01/12] bcache: add initial data structures for nvm pages + +This patch initializes the prototype data structures for nvm pages +allocator, + +- struct bch_nvmpg_sb + This is the super block allocated on each nvdimm namespace for the nvm +pages allocator. A nvdimm pages allocator set may have multiple name- +spaces, bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this +namespace belongs to. + +- struct bch_nvmpg_header + This is a table for all heads of all allocation record lists. An allo- +cation record list traces all page(s) allocated from nvdimm namespace(s) +to a specific requester (identified by uuid). After system reboot, a +requester can retrieve all previously allocated nvdimm pages from its +record list by a pre-defined uuid. + +- struct bch_nvmpg_head + This is a head of an allocation record list. Each nvdimm pages +requester (typically it's a driver) has and only has one allocation +record list, and an allocated nvdimm page only belongs to a specific +allocation record list. Member uuid[] will be set as the requester's +uuid, e.g. for bcache it is the cache set uuid. Member label is not +mandatory, it is a human-readable string for debug purpose. The nvm +offset format pointers recs_offset[] point to the location of actual +allocator record lists on each namespace of the nvdimm pages allocator +set. Each per namespace record list is represented by the following +struct bch_nvmpg_recs. + +- struct bch_nvmpg_recs + This structure represents a requester's allocation record list. Member +uuid is same value as the uuid of its corresponding struct +bch_nvmpg_head. Member recs[] is a table of struct bch_pgalloc_rec +objects to trace all allocated nvmdimm pages. If the table recs[] is +full, the nvmpg format offset is a pointer points to the next struct +bch_nvmpg_recs object, nvm pages allocator will look for available free +allocation record there. All the linked struct bch_nvmpg_recs objects +compose a requester's allocation record list which is headed by the +above struct bch_nvmpg_head. + +- struct bch_nvmpg_rec + This structure records a range of allocated nvdimm pages. Member pgoff +is offset in unit of page size of this allocation range. Member order +indicates size of the allocation range by (1 << order) in unit of page +size. 
Because the nvdimm pages allocator set may have multiple nvdimm +namespaces, member ns_id is used to identify which namespace the pgoff +belongs to. + - Bits 0 - 51: pgoff - is pages offset of the allocated pages. + - Bits 52 - 57: order - allocated size in page_size * order-of-2 + - Bits 58 - 60: ns_id - identify which namespace the pages stays on + - Bits 61 - 63: reserved. +Since each of the allocated nvm pages are power of 2, using 6 bits to +represent allocated size can have (1<<(1<<64) - 1) * PAGE_SIZE maximum +value. It can be a 76 bits width range size in byte for 4KB page size, +which is large enough currently. + +All the structure members having _offset suffix are in a special format. +E.g. bch_nvmpg_sb.{sb_offset, pages_offset, set_header_offset}, +bch_nvmpg_head.recs_offset, bch_nvmpg_recs.{head_offset, next_offset}, +the offset value is 64bit, the most significant 3 bits are used to +identify which namespace this offset belongs to, and the rested 61 bits +are actual offset inside the namespace. Following patches will have +helper routines to do the conversion between memory pointer and offset. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Ying Huang <ying.huang@intel.com> +--- + drivers/md/bcache/nvmpg_format.h | 253 +++++++++++++++++++++++++++++++ + 1 file changed, 253 insertions(+) + create mode 100644 drivers/md/bcache/nvmpg_format.h + +diff --git a/drivers/md/bcache/nvmpg_format.h b/drivers/md/bcache/nvmpg_format.h +new file mode 100644 +index 000000000000..e9eb6371fd78 +--- /dev/null ++++ b/drivers/md/bcache/nvmpg_format.h +@@ -0,0 +1,253 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++ ++#ifndef _NVMPG_FORMAT_H ++#define _NVMPG_FORMAT_H ++ ++/* ++ * Bcache on NVDIMM data structures ++ */ ++ ++/* ++ * - struct bch_nvmpg_sb ++ * This is the super block allocated on each nvdimm namespace for the nvm ++ * pages allocator. A nvdimm pages allocator set may have multiple namespaces, ++ * bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this name space ++ * belongs to. ++ * ++ * - struct bch_nvmpg_header ++ * This is a table for all heads of all allocation record lists. An allo- ++ * cation record list traces all page(s) allocated from nvdimm namespace(s) to ++ * a specific requester (identified by uuid). After system reboot, a requester ++ * can retrieve all previously allocated nvdimm pages from its record list by a ++ * pre-defined uuid. ++ * ++ * - struct bch_nvmpg_head ++ * This is a head of an allocation record list. Each nvdimm pages requester ++ * (typically it's a driver) has and only has one allocation record list, and ++ * an allocated nvdimm page only bedlones to a specific allocation record list. ++ * Member uuid[] will be set as the requester's uuid, e.g. for bcache it is the ++ * cache set uuid. Member label is not mandatory, it is a human-readable string ++ * for debug purpose. The nvm offset format pointers recs_offset[] point to the ++ * location of actual allocator record lists on each name space of the nvdimm ++ * pages allocator set. Each per name space record list is represented by the ++ * following struct bch_nvmpg_recs. ++ * ++ * - struct bch_nvmpg_recs ++ * This structure represents a requester's allocation record list. 
Member uuid ++ * is same value as the uuid of its corresponding struct bch_nvmpg_head. Member ++ * recs[] is a table of struct bch_pgalloc_rec objects to trace all allocated ++ * nvmdimm pages. If the table recs[] is full, the nvmpg format offset is a ++ * pointer points to the next struct bch_nvmpg_recs object, nvm pages allocator ++ * will look for available free allocation record there. All the linked ++ * struct bch_nvmpg_recs objects compose a requester's allocation record list ++ * which is headed by the above struct bch_nvmpg_head. ++ * ++ * - struct bch_nvmpg_rec ++ * This structure records a range of allocated nvdimm pages. Member pgoff is ++ * offset in unit of page size of this allocation range. Member order indicates ++ * size of the allocation range by (1 << order) in unit of page size. Because ++ * the nvdimm pages allocator set may have multiple nvdimm name spaces, member ++ * ns_id is used to identify which name space the pgoff belongs to. ++ * ++ * All allocation record lists are stored on the first initialized nvdimm name- ++ * space (ns_id 0). The meta data default layout of nvm pages allocator on ++ * namespace 0 is, ++ * ++ * 0 +---------------------------------+ ++ * | | ++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET ++ * | bch_nvmpg_sb | ++ * 8KB +---------------------------------+ <-- BCH_NVMPG_RECLIST_HEAD_OFFSET ++ * | bch_nvmpg_header | ++ * | | ++ * 16KB +---------------------------------+ <-- BCH_NVMPG_SYSRECS_OFFSET ++ * | bch_nvmpg_recs | ++ * | (nvm pages internal usage) | ++ * 24KB +---------------------------------+ ++ * | | ++ * | | ++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START ++ * | allocable nvm pages | ++ * | for buddy allocator | ++ * end +---------------------------------+ ++ * ++ * ++ * ++ * Meta data default layout on rested nvdimm namespaces, ++ * ++ * 0 +---------------------------------+ ++ * | | ++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET ++ * | bch_nvmpg_sb | ++ * 8KB +---------------------------------+ ++ * | | ++ * | | ++ * | | ++ * | | ++ * | | ++ * | | ++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START ++ * | allocable nvm pages | ++ * | for buddy allocator | ++ * end +---------------------------------+ ++ * ++ * ++ * - The nvmpg offset format pointer ++ * All member names ending with _offset in this header are nvmpg offset ++ * format pointer. The offset format is, ++ * [highest 3 bits: ns_id] ++ * [rested 61 bits: offset in No. ns_id namespace] ++ * ++ * The above offset is byte unit, the procedure to reference a nvmpg offset ++ * format pointer is, ++ * 1) Identify the namespace related in-memory structure by ns_id from the ++ * highest 3 bits of offset value. ++ * 2) Get the DAX mapping base address from the in-memory structure. ++ * 3) Calculate the actual memory address on nvdimm by plusing the DAX base ++ * address with offset value in rested low 61 bits. 
++ * All related in-memory structure and conversion routines don't belong to ++ * user space api, they are defined by nvm-pages allocator code in ++ * drivers/md/bcache/nvm-pages.{c,h} ++ * ++ */ ++ ++#include <linux/types.h> ++ ++/* In sectors */ ++#define BCH_NVMPG_SB_OFFSET 4096 ++#define BCH_NVMPG_START (16 << 20) ++ ++#define BCH_NVMPG_LBL_SIZE 32 ++#define BCH_NVMPG_NS_MAX 8 ++ ++#define BCH_NVMPG_RECLIST_HEAD_OFFSET (8<<10) ++#define BCH_NVMPG_SYSRECS_OFFSET (16<<10) ++ ++#define BCH_NVMPG_SB_VERSION 0 ++#define BCH_NVMPG_SB_VERSION_MAX 0 ++ ++static const __u8 bch_nvmpg_magic[] = { ++ 0x17, 0xbd, 0x53, 0x7f, 0x1b, 0x23, 0xd6, 0x83, ++ 0x46, 0xa4, 0xf8, 0x28, 0x17, 0xda, 0xec, 0xa9 }; ++static const __u8 bch_nvmpg_recs_magic[] = { ++ 0x39, 0x25, 0x3f, 0xf7, 0x27, 0x17, 0xd0, 0xb9, ++ 0x10, 0xe6, 0xd2, 0xda, 0x38, 0x68, 0x26, 0xae }; ++ ++/* takes 64bit width */ ++struct bch_nvmpg_rec { ++ union { ++ struct { ++ __u64 pgoff:52; ++ __u64 order:6; ++ __u64 ns_id:3; ++ __u64 reserved:3; ++ }; ++ __u64 _v; ++ }; ++}; ++ ++struct bch_nvmpg_recs { ++ union { ++ struct { ++ /* ++ * A nvmpg offset format pointer to ++ * struct bch_nvmpg_head ++ */ ++ __u64 head_offset; ++ /* ++ * A nvmpg offset format pointer to ++ * struct bch_nvm_pgalloc_recs which contains ++ * the next recs[] array. ++ */ ++ __u64 next_offset; ++ __u8 magic[16]; ++ __u8 uuid[16]; ++ __u32 size; ++ __u32 used; ++ __u64 _pad[4]; ++ struct bch_nvmpg_rec recs[]; ++ }; ++ __u8 pad[8192]; ++ }; ++}; ++ ++#define BCH_NVMPG_MAX_RECS \ ++ ((sizeof(struct bch_nvmpg_recs) - \ ++ offsetof(struct bch_nvmpg_recs, recs)) / \ ++ sizeof(struct bch_nvmpg_rec)) ++ ++#define BCH_NVMPG_HD_STAT_FREE 0x0 ++#define BCH_NVMPG_HD_STAT_ALLOC 0x1 ++struct bch_nvmpg_head { ++ __u8 uuid[16]; ++ __u8 label[BCH_NVMPG_LBL_SIZE]; ++ __u32 state; ++ __u32 flags; ++ /* ++ * Array of offset values from the nvmpg offset format ++ * pointers, each of the pointer points to a per-namespace ++ * struct bch_nvmpg_recs. ++ */ ++ __u64 recs_offset[BCH_NVMPG_NS_MAX]; ++}; ++ ++/* heads[0] is always for nvm_pages internal usage */ ++struct bch_nvmpg_set_header { ++ union { ++ struct { ++ __u32 size; ++ __u32 used; ++ __u64 _pad[4]; ++ struct bch_nvmpg_head heads[]; ++ }; ++ __u8 pad[8192]; ++ }; ++}; ++ ++#define BCH_NVMPG_MAX_HEADS \ ++ ((sizeof(struct bch_nvmpg_set_header) - \ ++ offsetof(struct bch_nvmpg_set_header, heads)) / \ ++ sizeof(struct bch_nvmpg_head)) ++ ++/* The on-media bit order is local CPU order */ ++struct bch_nvmpg_sb { ++ __u64 csum; ++ __u64 sb_offset; ++ __u64 ns_start; ++ __u64 version; ++ __u8 magic[16]; ++ __u8 uuid[16]; ++ __u32 page_size; ++ __u32 total_ns; ++ __u32 this_ns; ++ union { ++ __u8 set_uuid[16]; ++ __u64 set_magic; ++ }; ++ ++ __u64 flags; ++ __u64 seq; ++ ++ __u64 feature_compat; ++ __u64 feature_incompat; ++ __u64 feature_ro_compat; ++ ++ /* For allocable nvm pages from buddy systems */ ++ __u64 pages_offset; ++ __u64 pages_total; ++ ++ __u64 pad[8]; ++ ++ /* ++ * A nvmpg offset format pointer, it points ++ * to struct bch_nvmpg_set_header which is ++ * stored only on the first name space. 
++ */ ++ __u64 set_header_offset; ++ ++ /* Just for csum_set() */ ++ __u32 keys; ++ __u64 d[0]; ++}; ++ ++#endif /* _NVMPG_FORMAT_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0002-bcache-initialize-the-nvm-pages-allocator.patch b/for-next/nvmpg-bcache-journaling-v13/old/0002-bcache-initialize-the-nvm-pages-allocator.patch new file mode 100644 index 0000000..ff4445c --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0002-bcache-initialize-the-nvm-pages-allocator.patch @@ -0,0 +1,542 @@ +From a13fa68537fa67df106e366c0e1cd35d4e715feb Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Mon, 26 Jul 2021 10:33:30 +0800 +Subject: [PATCH 02/12] bcache: initialize the nvm pages allocator + +This patch define the prototype data structures in memory and +initializes the nvm pages allocator. + +The nvm address space which is managed by this allocator can consist of +many nvm namespaces, and some namespaces can compose into one nvm set, +like cache set. For this initial implementation, only one set can be +supported. + +The users of this nvm pages allocator need to call register_namespace() +to register the nvdimm device (like /dev/pmemX) into this allocator as +the instance of struct nvm_namespace. + +Reported-by: Randy Dunlap <rdunlap@infradead.org> +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/Kconfig | 10 ++ + drivers/md/bcache/Makefile | 1 + + drivers/md/bcache/nvmpg.c | 340 +++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 97 +++++++++++ + drivers/md/bcache/super.c | 3 + + 5 files changed, 451 insertions(+) + create mode 100644 drivers/md/bcache/nvmpg.c + create mode 100644 drivers/md/bcache/nvmpg.h + +diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig +index cf3e8096942a..4a7c13e882bb 100644 +--- a/drivers/md/bcache/Kconfig ++++ b/drivers/md/bcache/Kconfig +@@ -36,3 +36,13 @@ config BCACHE_ASYNC_REGISTRATION + device path into this file will returns immediately and the real + registration work is handled in kernel work queue in asynchronous + way. ++ ++config BCACHE_NVM_PAGES ++ bool "NVDIMM support for bcache (EXPERIMENTAL)" ++ depends on BCACHE ++ depends on 64BIT ++ depends on LIBNVDIMM ++ depends on DAX ++ help ++ Allocate/release NV-memory pages for bcache and provide allocated pages ++ for each requestor after system reboot. +diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile +index 5b87e59676b8..276b33be5ad5 100644 +--- a/drivers/md/bcache/Makefile ++++ b/drivers/md/bcache/Makefile +@@ -5,3 +5,4 @@ obj-$(CONFIG_BCACHE) += bcache.o + bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ + io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o features.o ++bcache-$(CONFIG_BCACHE_NVM_PAGES) += nvmpg.o +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +new file mode 100644 +index 000000000000..1dd321e4c280 +--- /dev/null ++++ b/drivers/md/bcache/nvmpg.c +@@ -0,0 +1,340 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Nvdimm page-buddy allocator ++ * ++ * Copyright (c) 2021, Intel Corporation. ++ * Copyright (c) 2021, Qiaowei Ren <qiaowei.ren@intel.com>. ++ * Copyright (c) 2021, Jianpeng Ma <jianpeng.ma@intel.com>. 
++ */ ++ ++#include "bcache.h" ++#include "nvmpg.h" ++ ++#include <linux/slab.h> ++#include <linux/list.h> ++#include <linux/mutex.h> ++#include <linux/dax.h> ++#include <linux/pfn_t.h> ++#include <linux/libnvdimm.h> ++#include <linux/mm_types.h> ++#include <linux/err.h> ++#include <linux/pagemap.h> ++#include <linux/bitmap.h> ++#include <linux/blkdev.h> ++ ++struct bch_nvmpg_set *global_nvmpg_set; ++ ++void *bch_nvmpg_offset_to_ptr(unsigned long offset) ++{ ++ int ns_id = BCH_NVMPG_GET_NS_ID(offset); ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[ns_id]; ++ ++ if (offset == 0) ++ return NULL; ++ ++ ns_id = BCH_NVMPG_GET_NS_ID(offset); ++ ns = global_nvmpg_set->ns_tbl[ns_id]; ++ ++ if (ns) ++ return (void *)(ns->base_addr + BCH_NVMPG_GET_OFFSET(offset)); ++ ++ pr_err("Invalid ns_id %u\n", ns_id); ++ return NULL; ++} ++ ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) ++{ ++ int ns_id = ns->ns_id; ++ unsigned long offset = (unsigned long)(ptr - ns->base_addr); ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ ++static void release_ns_tbl(struct bch_nvmpg_set *set) ++{ ++ int i; ++ struct bch_nvmpg_ns *ns; ++ ++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { ++ ns = set->ns_tbl[i]; ++ if (ns) { ++ fs_put_dax(ns->dax_dev); ++ blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); ++ set->ns_tbl[i] = NULL; ++ set->attached_ns--; ++ kfree(ns); ++ } ++ } ++ ++ if (set->attached_ns) ++ pr_err("unexpected attached_ns: %u\n", set->attached_ns); ++} ++ ++static void release_nvmpg_set(struct bch_nvmpg_set *set) ++{ ++ release_ns_tbl(set); ++ kfree(set); ++} ++ ++/* Namespace 0 contains all meta data of the nvmpg allocation set */ ++static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) ++{ ++ struct bch_nvmpg_set_header *set_header; ++ ++ if (ns->ns_id != 0) { ++ pr_err("unexpected ns_id %u for first nvmpg namespace.\n", ++ ns->ns_id); ++ return -EINVAL; ++ } ++ ++ set_header = bch_nvmpg_offset_to_ptr(ns->sb->set_header_offset); ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ global_nvmpg_set->set_header = set_header; ++ global_nvmpg_set->heads_size = set_header->size; ++ global_nvmpg_set->heads_used = set_header->used; ++ mutex_unlock(&global_nvmpg_set->lock); ++ ++ return 0; ++} ++ ++static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) ++{ ++ struct bch_nvmpg_sb *sb = ns->sb; ++ int rc = 0; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ ++ if (global_nvmpg_set->ns_tbl[sb->this_ns]) { ++ pr_err("ns_id %u already attached.\n", ns->ns_id); ++ rc = -EEXIST; ++ goto unlock; ++ } ++ ++ if (ns->ns_id != 0) { ++ pr_err("unexpected ns_id %u for first namespace.\n", ns->ns_id); ++ rc = -EINVAL; ++ goto unlock; ++ } ++ ++ if (global_nvmpg_set->attached_ns > 0) { ++ pr_err("multiple namespace attaching not supported yet\n"); ++ rc = -EOPNOTSUPP; ++ goto unlock; ++ } ++ ++ if ((global_nvmpg_set->attached_ns + 1) > sb->total_ns) { ++ pr_err("namespace counters error: attached %u > total %u\n", ++ global_nvmpg_set->attached_ns, ++ global_nvmpg_set->total_ns); ++ rc = -EINVAL; ++ goto unlock; ++ } ++ ++ memcpy(global_nvmpg_set->set_uuid, sb->set_uuid, 16); ++ global_nvmpg_set->ns_tbl[sb->this_ns] = ns; ++ global_nvmpg_set->attached_ns++; ++ global_nvmpg_set->total_ns = sb->total_ns; ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++ return rc; ++} ++ ++static int read_nvdimm_meta_super(struct block_device *bdev, ++ struct bch_nvmpg_ns *ns) ++{ ++ struct page *page; ++ struct bch_nvmpg_sb *sb; ++ uint64_t expected_csum = 0; ++ int r; ++ ++ page = 
read_cache_page_gfp(bdev->bd_inode->i_mapping, ++ BCH_NVMPG_SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); ++ ++ if (IS_ERR(page)) ++ return -EIO; ++ ++ sb = (struct bch_nvmpg_sb *) ++ (page_address(page) + offset_in_page(BCH_NVMPG_SB_OFFSET)); ++ ++ r = -EINVAL; ++ expected_csum = csum_set(sb); ++ if (expected_csum != sb->csum) { ++ pr_info("csum is not match with expected one\n"); ++ goto put_page; ++ } ++ ++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) { ++ pr_info("invalid bch_nvmpg_magic\n"); ++ goto put_page; ++ } ++ ++ if (sb->sb_offset != ++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_SB_OFFSET)) { ++ pr_info("invalid superblock offset 0x%llx\n", sb->sb_offset); ++ goto put_page; ++ } ++ ++ r = -EOPNOTSUPP; ++ if (sb->total_ns != 1) { ++ pr_info("multiple name space not supported yet.\n"); ++ goto put_page; ++ } ++ ++ ++ r = 0; ++ /* Necessary for DAX mapping */ ++ ns->page_size = sb->page_size; ++ ns->pages_total = sb->pages_total; ++ ++put_page: ++ put_page(page); ++ return r; ++} ++ ++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) ++{ ++ struct bch_nvmpg_ns *ns = NULL; ++ struct bch_nvmpg_sb *sb = NULL; ++ char buf[BDEVNAME_SIZE]; ++ struct block_device *bdev; ++ pgoff_t pgoff; ++ int id, err; ++ char *path; ++ long dax_ret = 0; ++ ++ path = kstrndup(dev_path, 512, GFP_KERNEL); ++ if (!path) { ++ pr_err("kstrndup failed\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ bdev = blkdev_get_by_path(strim(path), ++ FMODE_READ|FMODE_WRITE|FMODE_EXCL, ++ global_nvmpg_set); ++ if (IS_ERR(bdev)) { ++ pr_err("get %s error: %ld\n", dev_path, PTR_ERR(bdev)); ++ kfree(path); ++ return ERR_PTR(PTR_ERR(bdev)); ++ } ++ ++ err = -ENOMEM; ++ ns = kzalloc(sizeof(struct bch_nvmpg_ns), GFP_KERNEL); ++ if (!ns) ++ goto bdput; ++ ++ err = -EIO; ++ if (read_nvdimm_meta_super(bdev, ns)) { ++ pr_err("%s read nvdimm meta super block failed.\n", ++ bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EOPNOTSUPP; ++ ns->dax_dev = fs_dax_get_by_bdev(bdev); ++ if (!ns->dax_dev) { ++ pr_err("can't get dax device by %s\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ if (!dax_supported(ns->dax_dev, bdev, ns->page_size, 0, ++ bdev_nr_sectors(bdev))) { ++ pr_err("%s don't support DAX\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EINVAL; ++ if (bdev_dax_pgoff(bdev, 0, ns->page_size, &pgoff)) { ++ pr_err("invalid offset of %s\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EINVAL; ++ id = dax_read_lock(); ++ dax_ret = dax_direct_access(ns->dax_dev, pgoff, ns->pages_total, ++ &ns->base_addr, &ns->start_pfn); ++ if (dax_ret <= 0) { ++ pr_err("dax_direct_access error\n"); ++ dax_read_unlock(id); ++ goto free_ns; ++ } ++ ++ if (dax_ret < ns->pages_total) { ++ pr_warn("currently first %ld pages (from %lu in total) are used\n", ++ dax_ret, ns->pages_total); ++ } ++ dax_read_unlock(id); ++ ++ sb = (struct bch_nvmpg_sb *)(ns->base_addr + BCH_NVMPG_SB_OFFSET); ++ ++ err = -EINVAL; ++ /* Check magic again to make sure DAX mapping is correct */ ++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) { ++ pr_err("invalid bch_nvmpg_magic after DAX mapping\n"); ++ goto free_ns; ++ } ++ ++ if ((global_nvmpg_set->attached_ns > 0) && ++ memcmp(sb->set_uuid, global_nvmpg_set->set_uuid, 16)) { ++ pr_err("set uuid does not match with ns_id %u\n", ns->ns_id); ++ goto free_ns; ++ } ++ ++ if (sb->set_header_offset != ++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_RECLIST_HEAD_OFFSET)) { ++ pr_err("Invalid header offset: this_ns %u, ns_id %llu, offset 0x%llx\n", ++ sb->this_ns, ++ 
BCH_NVMPG_GET_NS_ID(sb->set_header_offset), ++ BCH_NVMPG_GET_OFFSET(sb->set_header_offset)); ++ goto free_ns; ++ } ++ ++ ns->page_size = sb->page_size; ++ ns->pages_offset = sb->pages_offset; ++ ns->pages_total = sb->pages_total; ++ ns->sb = sb; ++ ns->free = 0; ++ ns->bdev = bdev; ++ ns->set = global_nvmpg_set; ++ ++ err = attach_nvmpg_set(ns); ++ if (err < 0) ++ goto free_ns; ++ ++ mutex_init(&ns->lock); ++ ++ err = init_nvmpg_set_header(ns); ++ if (err < 0) ++ goto free_ns; ++ ++ kfree(path); ++ return ns; ++ ++free_ns: ++ fs_put_dax(ns->dax_dev); ++ kfree(ns); ++bdput: ++ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); ++ kfree(path); ++ return ERR_PTR(err); ++} ++ ++int __init bch_nvmpg_init(void) ++{ ++ global_nvmpg_set = kzalloc(sizeof(*global_nvmpg_set), GFP_KERNEL); ++ if (!global_nvmpg_set) ++ return -ENOMEM; ++ ++ global_nvmpg_set->total_ns = 0; ++ mutex_init(&global_nvmpg_set->lock); ++ ++ pr_info("bcache nvm init\n"); ++ return 0; ++} ++ ++void bch_nvmpg_exit(void) ++{ ++ release_nvmpg_set(global_nvmpg_set); ++ pr_info("bcache nvm exit\n"); ++} +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +new file mode 100644 +index 000000000000..698c890b2d15 +--- /dev/null ++++ b/drivers/md/bcache/nvmpg.h +@@ -0,0 +1,97 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _BCACHE_NVM_PAGES_H ++#define _BCACHE_NVM_PAGES_H ++ ++#include <linux/libnvdimm.h> ++ ++#include "nvmpg_format.h" ++ ++/* ++ * Bcache NVDIMM in memory data structures ++ */ ++ ++/* ++ * The following three structures in memory records which page(s) allocated ++ * to which owner. After reboot from power failure, they will be initialized ++ * based on nvm pages superblock in NVDIMM device. ++ */ ++struct bch_nvmpg_ns { ++ struct bch_nvmpg_sb *sb; ++ void *base_addr; ++ ++ unsigned char uuid[16]; ++ int ns_id; ++ unsigned int page_size; ++ unsigned long free; ++ unsigned long pages_offset; ++ unsigned long pages_total; ++ pfn_t start_pfn; ++ ++ struct dax_device *dax_dev; ++ struct block_device *bdev; ++ struct bch_nvmpg_set *set; ++ ++ struct mutex lock; ++}; ++ ++/* ++ * A set of namespaces. Currently only one set can be supported. 
++ */ ++struct bch_nvmpg_set { ++ unsigned char set_uuid[16]; ++ ++ int heads_size; ++ int heads_used; ++ struct bch_nvmpg_set_header *set_header; ++ ++ struct bch_nvmpg_ns *ns_tbl[BCH_NVMPG_NS_MAX]; ++ int total_ns; ++ int attached_ns; ++ ++ struct mutex lock; ++}; ++ ++#define BCH_NVMPG_NS_ID_BITS 3 ++#define BCH_NVMPG_OFFSET_BITS 61 ++#define BCH_NVMPG_NS_ID_MASK ((1UL<<BCH_NVMPG_NS_ID_BITS) - 1) ++#define BCH_NVMPG_OFFSET_MASK ((1UL<<BCH_NVMPG_OFFSET_BITS) - 1) ++ ++#define BCH_NVMPG_GET_NS_ID(offset) \ ++ (((offset) >> BCH_NVMPG_OFFSET_BITS) & BCH_NVMPG_NS_ID_MASK) ++ ++#define BCH_NVMPG_GET_OFFSET(offset) ((offset) & BCH_NVMPG_OFFSET_MASK) ++ ++#define BCH_NVMPG_OFFSET(ns_id, offset) \ ++ ((((ns_id) & BCH_NVMPG_NS_ID_MASK) << BCH_NVMPG_OFFSET_BITS) | \ ++ ((offset) & BCH_NVMPG_OFFSET_MASK)) ++ ++/* Indicate which field in bch_nvmpg_sb to be updated */ ++#define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ ++ ++void *bch_nvmpg_offset_to_ptr(unsigned long offset); ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); ++int bch_nvmpg_init(void); ++void bch_nvmpg_exit(void); ++ ++#else ++ ++static inline struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) ++{ ++ return NULL; ++} ++ ++static inline int bch_nvmpg_init(void) ++{ ++ return 0; ++} ++ ++static inline void bch_nvmpg_exit(void) { } ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++#endif /* _BCACHE_NVM_PAGES_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 86b9e355c583..74d51a0b806f 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -14,6 +14,7 @@ + #include "request.h" + #include "writeback.h" + #include "features.h" ++#include "nvmpg.h" + + #include <linux/blkdev.h> + #include <linux/pagemap.h> +@@ -2818,6 +2819,7 @@ static void bcache_exit(void) + { + bch_debug_exit(); + bch_request_exit(); ++ bch_nvmpg_exit(); + if (bcache_kobj) + kobject_put(bcache_kobj); + if (bcache_wq) +@@ -2916,6 +2918,7 @@ static int __init bcache_init(void) + + bch_debug_init(); + closure_debug_init(); ++ bch_nvmpg_init(); + + bcache_is_reboot = false; + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0003-bcache-initialization-of-the-buddy.patch b/for-next/nvmpg-bcache-journaling-v13/old/0003-bcache-initialization-of-the-buddy.patch new file mode 100644 index 0000000..784b84b --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0003-bcache-initialization-of-the-buddy.patch @@ -0,0 +1,359 @@ +From eabc025702499684f588f362099f47998d0fde63 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 19:45:57 +0800 +Subject: [PATCH 03/12] bcache: initialization of the buddy + +This nvm pages allocator will implement the simple buddy allocator to +anage the nvm address space. This patch initializes this buddy allocator +for new namespace. + +the unit of alloc/free of the buddy allocator is page. DAX device has +their struct page(in dram or PMEM). + + struct { /* ZONE_DEVICE pages */ + /** @pgmap: Points to the hosting device page map. */ + struct dev_pagemap *pgmap; + void *zone_device_data; + /* + * ZONE_DEVICE private pages are counted as being + * mapped so the next 3 words hold the mapping, index, + * and private fields from the source anonymous or + * page cache page while the page is migrated to device + * private memory. 
+ * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also + * use the mapping, index, and private fields when + * pmem backed DAX files are mapped. + */ + }; + +ZONE_DEVICE pages only use pgmap. Other 4 words[16/32 bytes] don't use. +So the second/third word will be used as 'struct list_head ' which list +in buddy. The fourth word(that is normal struct page::index) store pgoff +which the page-offset in the dax device. And the fifth word (that is +normal struct page::private) store order of buddy. page_type will be used +to store buddy flags. + +Reported-by: kernel test robot <lkp@intel.com> +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 212 +++++++++++++++++++++++++++++++++++++- + drivers/md/bcache/nvmpg.h | 12 +++ + 2 files changed, 221 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 1dd321e4c280..80e12e06f6d3 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -50,6 +50,36 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) + return BCH_NVMPG_OFFSET(ns_id, offset); + } + ++static struct page *bch_nvmpg_va_to_pg(void *addr) ++{ ++ return virt_to_page(addr); ++} ++ ++static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff) ++{ ++ return ns->base_addr + (pgoff << PAGE_SHIFT); ++} ++ ++static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) ++{ ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id]; ++ pgoff_t pgoff = r->pgoff; ++ ++ return bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++} ++ ++static inline void reserve_nvmpg_pages(struct bch_nvmpg_ns *ns, ++ pgoff_t pgoff, u64 nr) ++{ ++ while (nr > 0) { ++ unsigned int num = nr > UINT_MAX ? 
UINT_MAX : nr; ++ ++ bitmap_set(ns->pages_bitmap, pgoff, num); ++ nr -= num; ++ pgoff += num; ++ } ++} ++ + static void release_ns_tbl(struct bch_nvmpg_set *set) + { + int i; +@@ -58,6 +88,10 @@ static void release_ns_tbl(struct bch_nvmpg_set *set) + for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { + ns = set->ns_tbl[i]; + if (ns) { ++ kvfree(ns->pages_bitmap); ++ if (ns->recs_bitmap) ++ bitmap_free(ns->recs_bitmap); ++ + fs_put_dax(ns->dax_dev); + blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + set->ns_tbl[i] = NULL; +@@ -76,10 +110,73 @@ static void release_nvmpg_set(struct bch_nvmpg_set *set) + kfree(set); + } + ++static int validate_recs(int ns_id, ++ struct bch_nvmpg_head *head, ++ struct bch_nvmpg_recs *recs) ++{ ++ if (memcmp(recs->magic, bch_nvmpg_recs_magic, 16)) { ++ pr_err("Invalid bch_nvmpg_recs magic\n"); ++ return -EINVAL; ++ } ++ ++ if (memcmp(recs->uuid, head->uuid, 16)) { ++ pr_err("Invalid bch_nvmpg_recs uuid\n"); ++ return -EINVAL; ++ } ++ ++ if (recs->head_offset != ++ bch_nvmpg_ptr_to_offset(global_nvmpg_set->ns_tbl[ns_id], head)) { ++ pr_err("Invalid recs head_offset\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int reserve_nvmpg_recs(struct bch_nvmpg_recs *recs) ++{ ++ int i, used = 0; ++ ++ for (i = 0; i < recs->size; i++) { ++ struct bch_nvmpg_rec *r = &recs->recs[i]; ++ struct bch_nvmpg_ns *ns; ++ struct page *page; ++ void *addr; ++ ++ if (r->pgoff == 0) ++ continue; ++ ++ ns = global_nvmpg_set->ns_tbl[r->ns_id]; ++ addr = bch_nvmpg_rec_to_ptr(r); ++ if (addr < ns->base_addr) { ++ pr_err("Invalid recorded address\n"); ++ return -EINVAL; ++ } ++ ++ /* init struct page: index/private */ ++ page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(page, r->order); ++ page->index = r->pgoff; ++ ++ reserve_nvmpg_pages(ns, r->pgoff, 1L << r->order); ++ used++; ++ } ++ ++ if (used != recs->used) { ++ pr_err("used %d doesn't match recs->used %d\n", ++ used, recs->used); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ + /* Namespace 0 contains all meta data of the nvmpg allocation set */ + static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_set_header *set_header; ++ struct bch_nvmpg_recs *sys_recs; ++ int i, j, used = 0, rc = 0; + + if (ns->ns_id != 0) { + pr_err("unexpected ns_id %u for first nvmpg namespace.\n", +@@ -93,9 +190,83 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + global_nvmpg_set->set_header = set_header; + global_nvmpg_set->heads_size = set_header->size; + global_nvmpg_set->heads_used = set_header->used; ++ ++ /* Reserve the used space from buddy allocator */ ++ reserve_nvmpg_pages(ns, 0, div_u64(ns->pages_offset, ns->page_size)); ++ ++ sys_recs = ns->base_addr + BCH_NVMPG_SYSRECS_OFFSET; ++ for (i = 0; i < set_header->size; i++) { ++ struct bch_nvmpg_head *head; ++ ++ head = &set_header->heads[i]; ++ if (head->state == BCH_NVMPG_HD_STAT_FREE) ++ continue; ++ ++ used++; ++ if (used > global_nvmpg_set->heads_size) { ++ pr_err("used heads %d > heads size %d.\n", ++ used, global_nvmpg_set->heads_size); ++ goto unlock; ++ } ++ ++ for (j = 0; j < BCH_NVMPG_NS_MAX; j++) { ++ struct bch_nvmpg_recs *recs; ++ ++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[j]); ++ ++ /* Iterate the recs list */ ++ while (recs) { ++ rc = validate_recs(j, head, recs); ++ if (rc < 0) ++ goto unlock; ++ ++ rc = reserve_nvmpg_recs(recs); ++ if (rc < 0) ++ goto unlock; ++ ++ bitmap_set(ns->recs_bitmap, recs - sys_recs, 1); ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++ } ++ } ++unlock: + 
mutex_unlock(&global_nvmpg_set->lock); ++ return rc; ++} + +- return 0; ++static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) ++{ ++ unsigned int start, end, pages; ++ int i; ++ struct page *page; ++ pgoff_t pgoff_start; ++ ++ bitmap_for_each_clear_region(ns->pages_bitmap, ++ start, end, 0, ns->pages_total) { ++ pgoff_start = start; ++ pages = end - start; ++ ++ while (pages) { ++ void *addr; ++ ++ for (i = BCH_MAX_ORDER - 1; i >= 0; i--) { ++ if ((pgoff_start % (1L << i) == 0) && ++ (pages >= (1L << i))) ++ break; ++ } ++ ++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start); ++ page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(page, i); ++ page->index = pgoff_start; ++ __SetPageBuddy(page); ++ list_add((struct list_head *)&page->zone_device_data, ++ &ns->free_area[i]); ++ ++ pgoff_start += 1L << i; ++ pages -= 1L << i; ++ } ++ } + } + + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) +@@ -200,7 +371,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + char buf[BDEVNAME_SIZE]; + struct block_device *bdev; + pgoff_t pgoff; +- int id, err; ++ int id, i, err; + char *path; + long dax_ret = 0; + +@@ -304,13 +475,48 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + + mutex_init(&ns->lock); + ++ /* ++ * parameters of bitmap_set/clear are unsigned int. ++ * Given currently size of nvm is far from exceeding this limit, ++ * so only add a WARN_ON message. ++ */ ++ WARN_ON(BITS_TO_LONGS(ns->pages_total) > UINT_MAX); ++ ns->pages_bitmap = kvcalloc(BITS_TO_LONGS(ns->pages_total), ++ sizeof(unsigned long), GFP_KERNEL); ++ if (!ns->pages_bitmap) { ++ err = -ENOMEM; ++ goto clear_ns_nr; ++ } ++ ++ if (ns->sb->this_ns == 0) { ++ ns->recs_bitmap = ++ bitmap_zalloc(BCH_MAX_PGALLOC_RECS, GFP_KERNEL); ++ if (ns->recs_bitmap == NULL) { ++ err = -ENOMEM; ++ goto free_pages_bitmap; ++ } ++ } ++ ++ for (i = 0; i < BCH_MAX_ORDER; i++) ++ INIT_LIST_HEAD(&ns->free_area[i]); ++ + err = init_nvmpg_set_header(ns); + if (err < 0) +- goto free_ns; ++ goto free_recs_bitmap; ++ ++ if (ns->sb->this_ns == 0) ++ /* init buddy allocator */ ++ bch_nvmpg_init_free_space(ns); + + kfree(path); + return ns; + ++free_recs_bitmap: ++ bitmap_free(ns->recs_bitmap); ++free_pages_bitmap: ++ kvfree(ns->pages_bitmap); ++clear_ns_nr: ++ global_nvmpg_set->ns_tbl[sb->this_ns] = NULL; + free_ns: + fs_put_dax(ns->dax_dev); + kfree(ns); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 698c890b2d15..55778d4db7da 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -11,6 +11,8 @@ + * Bcache NVDIMM in memory data structures + */ + ++#define BCH_MAX_ORDER 20 ++ + /* + * The following three structures in memory records which page(s) allocated + * to which owner. 
After reboot from power failure, they will be initialized +@@ -28,6 +30,11 @@ struct bch_nvmpg_ns { + unsigned long pages_total; + pfn_t start_pfn; + ++ unsigned long *pages_bitmap; ++ struct list_head free_area[BCH_MAX_ORDER]; ++ ++ unsigned long *recs_bitmap; ++ + struct dax_device *dax_dev; + struct block_device *bdev; + struct bch_nvmpg_set *set; +@@ -69,6 +76,11 @@ struct bch_nvmpg_set { + /* Indicate which field in bch_nvmpg_sb to be updated */ + #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ + ++#define BCH_MAX_PGALLOC_RECS \ ++ (min_t(unsigned int, 64, \ ++ (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ ++ sizeof(struct bch_nvmpg_recs))) ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset); + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch b/for-next/nvmpg-bcache-journaling-v13/old/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch new file mode 100644 index 0000000..94dc417 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch @@ -0,0 +1,308 @@ +From badd2b9151913efdc34e68b532ca0e6360d5ba1b Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Wed, 4 Aug 2021 22:41:20 +0800 +Subject: [PATCH 04/12] bcache: bch_nvmpg_alloc_pages() of the buddy + +This patch implements the bch_nvmpg_alloc_pages() of the nvm pages buddy +allocator. In terms of function, this func is like current +page-buddy-alloc. But the differences are: +a: it need owner_uuid as parameter which record owner info. And it +make those info persistence. +b: it don't need flags like GFP_*. All allocs are the equal. +c: it don't trigger other ops etc swap/recycle. 
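For illustration only (an editor's sketch, not part of this patch): given the interfaces added in this series, a requester identified by a 16-byte uuid uses the allocator roughly as below. owner_uuid is a hypothetical caller-owned uuid buffer; error handling is reduced to the minimum.

	/* Sketch: allocate 1 << 2 pages for owner_uuid, touch them, free them. */
	unsigned long nvmpg_offset;
	void *ptr;

	nvmpg_offset = bch_nvmpg_alloc_pages(2, owner_uuid);
	if (!nvmpg_offset)
		return -ENOMEM;				/* 0 means the allocation failed */

	ptr = bch_nvmpg_offset_to_ptr(nvmpg_offset);	/* nvmpg offset -> DAX-mapped address */
	memset(ptr, 0, 4 * PAGE_SIZE);			/* NVDIMM pages are directly addressable */

	bch_nvmpg_free_pages(nvmpg_offset, 2, owner_uuid);

The returned offset (namespace ID in the top bits, byte offset in the rest) is what gets recorded in the owner's allocation record list, so the same pages can be located again after a reboot.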
+ +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 221 ++++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 9 ++ + 2 files changed, 230 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 80e12e06f6d3..ca8ffcec9b2c 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -42,6 +42,11 @@ void *bch_nvmpg_offset_to_ptr(unsigned long offset) + return NULL; + } + ++static unsigned long bch_nvmpg_offset_to_pgoff(unsigned long nvmpg_offset) ++{ ++ return BCH_NVMPG_GET_OFFSET(nvmpg_offset) >> PAGE_SHIFT; ++} ++ + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) + { + int ns_id = ns->ns_id; +@@ -60,6 +65,15 @@ static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff) + return ns->base_addr + (pgoff << PAGE_SHIFT); + } + ++static unsigned long bch_nvmpg_pgoff_to_offset(struct bch_nvmpg_ns *ns, ++ pgoff_t pgoff) ++{ ++ int ns_id = ns->ns_id; ++ unsigned long offset = pgoff << PAGE_SHIFT; ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ + static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) + { + struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id]; +@@ -269,6 +283,213 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + } + } + ++ ++/* If not found, it will create if create == true */ ++static struct bch_nvmpg_head *find_nvmpg_head(const char *uuid, bool create) ++{ ++ struct bch_nvmpg_set_header *set_header = global_nvmpg_set->set_header; ++ struct bch_nvmpg_head *head = NULL; ++ int i; ++ ++ if (set_header == NULL) ++ goto out; ++ ++ for (i = 0; i < set_header->size; i++) { ++ struct bch_nvmpg_head *h = &set_header->heads[i]; ++ ++ if (h->state != BCH_NVMPG_HD_STAT_ALLOC) ++ continue; ++ ++ if (!memcmp(uuid, h->uuid, 16)) { ++ head = h; ++ break; ++ } ++ } ++ ++ if (!head && create) { ++ u32 used = set_header->used; ++ ++ if (set_header->size > used) { ++ head = &set_header->heads[used]; ++ memset(head, 0, sizeof(struct bch_nvmpg_head)); ++ head->state = BCH_NVMPG_HD_STAT_ALLOC; ++ memcpy(head->uuid, uuid, 16); ++ global_nvmpg_set->heads_used++; ++ set_header->used++; ++ } else ++ pr_info("No free bch_nvmpg_head\n"); ++ } ++ ++out: ++ return head; ++} ++ ++static struct bch_nvmpg_recs *find_empty_nvmpg_recs(void) ++{ ++ unsigned int start; ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[0]; ++ struct bch_nvmpg_recs *recs; ++ ++ start = bitmap_find_next_zero_area(ns->recs_bitmap, ++ BCH_MAX_PGALLOC_RECS, 0, 1, 0); ++ if (start > BCH_MAX_PGALLOC_RECS) { ++ pr_info("No free struct bch_nvmpg_recs\n"); ++ return NULL; ++ } ++ ++ bitmap_set(ns->recs_bitmap, start, 1); ++ recs = (struct bch_nvmpg_recs *) ++ bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET) ++ + start; ++ ++ memset(recs, 0, sizeof(struct bch_nvmpg_recs)); ++ return recs; ++} ++ ++ ++static struct bch_nvmpg_recs *find_nvmpg_recs(struct bch_nvmpg_ns *ns, ++ struct bch_nvmpg_head *head, ++ bool create) ++{ ++ int ns_id = ns->sb->this_ns; ++ struct bch_nvmpg_recs *prev_recs = NULL, *recs = NULL; ++ ++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[ns_id]); ++ ++ /* If create=false, we return recs[nr] */ ++ if (!create) ++ return recs; ++ ++ /* ++ * If create=true, it mean we need a empty struct 
bch_nvmpg_rec ++ * So we should find non-empty struct bch_nvmpg_recs or alloc ++ * new struct bch_nvmpg_recs. And return this bch_nvmpg_recs ++ */ ++ while (recs && (recs->used == recs->size)) { ++ prev_recs = recs; ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++ ++ /* Found empty struct bch_nvmpg_recs */ ++ if (recs) ++ return recs; ++ ++ /* Need alloc new struct bch_nvmpg_recs */ ++ recs = find_empty_nvmpg_recs(); ++ if (recs) { ++ unsigned long offset; ++ ++ recs->next_offset = 0; ++ recs->head_offset = bch_nvmpg_ptr_to_offset(ns, head); ++ memcpy(recs->magic, bch_nvmpg_recs_magic, 16); ++ memcpy(recs->uuid, head->uuid, 16); ++ recs->size = BCH_NVMPG_MAX_RECS; ++ recs->used = 0; ++ ++ offset = bch_nvmpg_ptr_to_offset(ns, recs); ++ if (prev_recs) ++ prev_recs->next_offset = offset; ++ else ++ head->recs_offset[ns_id] = offset; ++ } ++ ++ return recs; ++} ++ ++static void add_nvmpg_rec(struct bch_nvmpg_ns *ns, ++ struct bch_nvmpg_recs *recs, ++ unsigned long nvmpg_offset, ++ int order) ++{ ++ int i, ns_id; ++ unsigned long pgoff; ++ ++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset); ++ ns_id = ns->sb->this_ns; ++ ++ for (i = 0; i < recs->size; i++) { ++ if (recs->recs[i].pgoff == 0) { ++ recs->recs[i].pgoff = pgoff; ++ recs->recs[i].order = order; ++ recs->recs[i].ns_id = ns_id; ++ recs->used++; ++ break; ++ } ++ } ++ BUG_ON(i == recs->size); ++} ++ ++ ++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) ++{ ++ unsigned long nvmpg_offset = 0; ++ struct bch_nvmpg_head *head; ++ int n, o; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ head = find_nvmpg_head(uuid, true); ++ ++ if (!head) { ++ pr_err("Cannot find bch_nvmpg_recs by uuid.\n"); ++ goto unlock; ++ } ++ ++ for (n = 0; n < global_nvmpg_set->total_ns; n++) { ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[n]; ++ ++ if (!ns || (ns->free < (1L << order))) ++ continue; ++ ++ for (o = order; o < BCH_MAX_ORDER; o++) { ++ struct list_head *list; ++ struct page *page, *buddy_page; ++ ++ if (list_empty(&ns->free_area[o])) ++ continue; ++ ++ list = ns->free_area[o].next; ++ page = container_of((void *)list, struct page, ++ zone_device_data); ++ ++ list_del(list); ++ ++ while (o != order) { ++ void *addr; ++ pgoff_t pgoff; ++ ++ pgoff = page->index + (1L << (o - 1)); ++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++ buddy_page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(buddy_page, o - 1); ++ buddy_page->index = pgoff; ++ __SetPageBuddy(buddy_page); ++ list_add((struct list_head *)&buddy_page->zone_device_data, ++ &ns->free_area[o - 1]); ++ o--; ++ } ++ ++ set_page_private(page, order); ++ __ClearPageBuddy(page); ++ ns->free -= 1L << order; ++ nvmpg_offset = bch_nvmpg_pgoff_to_offset(ns, page->index); ++ break; ++ } ++ ++ if (o < BCH_MAX_ORDER) { ++ struct bch_nvmpg_recs *recs; ++ ++ recs = find_nvmpg_recs(ns, head, true); ++ /* ToDo: handle pgalloc_recs==NULL */ ++ add_nvmpg_rec(ns, recs, nvmpg_offset, order); ++ break; ++ } ++ } ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++ return nvmpg_offset; ++} ++ + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_sb *sb = ns->sb; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 55778d4db7da..d03f3241b45a 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -76,6 +76,9 @@ struct bch_nvmpg_set { + /* Indicate which field in bch_nvmpg_sb to be updated */ + #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ + ++#define BCH_PGOFF_TO_KVADDR(pgoff) \ ++ ((void *)((unsigned long)(pgoff) << 
PAGE_SHIFT)) ++ + #define BCH_MAX_PGALLOC_RECS \ + (min_t(unsigned int, 64, \ + (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ +@@ -89,6 +92,7 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); + int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); ++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + + #else + +@@ -104,6 +108,11 @@ static inline int bch_nvmpg_init(void) + + static inline void bch_nvmpg_exit(void) { } + ++static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) ++{ ++ return 0; ++} ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch b/for-next/nvmpg-bcache-journaling-v13/old/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch new file mode 100644 index 0000000..4ac1234 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch @@ -0,0 +1,251 @@ +From 7eac3b1797acdd2ff3c684c9fabd7fe12bd671c6 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 19:06:35 +0800 +Subject: [PATCH 05/12] bcache: bch_nvmpg_free_pages() of the buddy allocator + +This patch implements the bch_nvmpg_free_pages() of the buddy allocator. + +The difference between this and page-buddy-free: +it need owner_uuid to free owner allocated pages, and must +persistent after free. + +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 164 ++++++++++++++++++++++++++++++++++++-- + drivers/md/bcache/nvmpg.h | 3 + + 2 files changed, 160 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index ca8ffcec9b2c..9864436a45cc 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -248,6 +248,57 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + return rc; + } + ++static void __free_space(struct bch_nvmpg_ns *ns, unsigned long nvmpg_offset, ++ int order) ++{ ++ unsigned long add_pages = (1L << order); ++ pgoff_t pgoff; ++ struct page *page; ++ void *va; ++ ++ if (nvmpg_offset == 0) { ++ pr_err("free pages on offset 0\n"); ++ return; ++ } ++ ++ page = bch_nvmpg_va_to_pg(bch_nvmpg_offset_to_ptr(nvmpg_offset)); ++ WARN_ON((!page) || (page->private != order)); ++ pgoff = page->index; ++ ++ while (order < BCH_MAX_ORDER - 1) { ++ struct page *buddy_page; ++ ++ pgoff_t buddy_pgoff = pgoff ^ (1L << order); ++ pgoff_t parent_pgoff = pgoff & ~(1L << order); ++ ++ if ((parent_pgoff + (1L << (order + 1)) > ns->pages_total)) ++ break; ++ ++ va = bch_nvmpg_pgoff_to_ptr(ns, buddy_pgoff); ++ buddy_page = bch_nvmpg_va_to_pg(va); ++ WARN_ON(!buddy_page); ++ ++ if (PageBuddy(buddy_page) && (buddy_page->private == order)) { ++ list_del((struct list_head *)&buddy_page->zone_device_data); ++ __ClearPageBuddy(buddy_page); ++ pgoff = parent_pgoff; ++ order++; ++ continue; ++ } ++ break; ++ } ++ ++ va = bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++ page = bch_nvmpg_va_to_pg(va); ++ WARN_ON(!page); ++ list_add((struct list_head *)&page->zone_device_data, ++ &ns->free_area[order]); ++ 
page->index = pgoff; ++ set_page_private(page, order); ++ __SetPageBuddy(page); ++ ns->free += add_pages; ++} ++ + static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + { + unsigned int start, end, pages; +@@ -261,21 +312,19 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + pages = end - start; + + while (pages) { +- void *addr; +- + for (i = BCH_MAX_ORDER - 1; i >= 0; i--) { + if ((pgoff_start % (1L << i) == 0) && + (pages >= (1L << i))) + break; + } + +- addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start); +- page = bch_nvmpg_va_to_pg(addr); ++ page = bch_nvmpg_va_to_pg( ++ bch_nvmpg_pgoff_to_ptr(ns, pgoff_start)); + set_page_private(page, i); + page->index = pgoff_start; +- __SetPageBuddy(page); +- list_add((struct list_head *)&page->zone_device_data, +- &ns->free_area[i]); ++ ++ /* In order to update ns->free */ ++ __free_space(ns, pgoff_start, i); + + pgoff_start += 1L << i; + pages -= 1L << i; +@@ -490,6 +539,106 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return nvmpg_offset; + } + ++static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns) ++{ ++ return ns->base_addr + (ns->pages_total << PAGE_SHIFT); ++} ++ ++static inline bool in_nvmpg_ns_range(struct bch_nvmpg_ns *ns, ++ void *start_addr, void *end_addr) ++{ ++ return (start_addr >= ns->base_addr) && (end_addr < nvm_end_addr(ns)); ++} ++ ++static int remove_nvmpg_rec(struct bch_nvmpg_recs *recs, int ns_id, ++ unsigned long nvmpg_offset, int order) ++{ ++ struct bch_nvmpg_head *head; ++ struct bch_nvmpg_recs *prev_recs, *sys_recs; ++ struct bch_nvmpg_ns *ns; ++ unsigned long pgoff; ++ int i; ++ ++ ns = global_nvmpg_set->ns_tbl[0]; ++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset); ++ ++ head = bch_nvmpg_offset_to_ptr(recs->head_offset); ++ prev_recs = recs; ++ sys_recs = bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET); ++ while (recs) { ++ for (i = 0; i < recs->size; i++) { ++ struct bch_nvmpg_rec *rec = &(recs->recs[i]); ++ ++ if ((rec->pgoff == pgoff) && (rec->ns_id == ns_id)) { ++ WARN_ON(rec->order != order); ++ rec->_v = 0; ++ recs->used--; ++ ++ if (recs->used == 0) { ++ int recs_pos = recs - sys_recs; ++ ++ if (recs == prev_recs) ++ head->recs_offset[ns_id] = ++ recs->next_offset; ++ else ++ prev_recs->next_offset = ++ recs->next_offset; ++ ++ recs->next_offset = 0; ++ recs->head_offset = 0; ++ ++ bitmap_clear(ns->recs_bitmap, recs_pos, 1); ++ } ++ goto out; ++ } ++ } ++ prev_recs = recs; ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++out: ++ return (recs ? 
0 : -ENOENT); ++} ++ ++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, ++ const char *uuid) ++{ ++ struct bch_nvmpg_ns *ns; ++ struct bch_nvmpg_head *head; ++ struct bch_nvmpg_recs *recs; ++ int r; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ ++ ns = global_nvmpg_set->ns_tbl[BCH_NVMPG_GET_NS_ID(nvmpg_offset)]; ++ if (!ns) { ++ pr_err("can't find namespace by given kaddr from namespace\n"); ++ goto unlock; ++ } ++ ++ head = find_nvmpg_head(uuid, false); ++ if (!head) { ++ pr_err("can't found bch_nvmpg_head by uuid\n"); ++ goto unlock; ++ } ++ ++ recs = find_nvmpg_recs(ns, head, false); ++ if (!recs) { ++ pr_err("can't find bch_nvmpg_recs by uuid\n"); ++ goto unlock; ++ } ++ ++ r = remove_nvmpg_rec(recs, ns->sb->this_ns, nvmpg_offset, order); ++ if (r < 0) { ++ pr_err("can't find bch_nvmpg_rec\n"); ++ goto unlock; ++ } ++ ++ __free_space(ns, nvmpg_offset, order); ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++} ++ + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_sb *sb = ns->sb; +@@ -686,6 +835,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + ns->pages_offset = sb->pages_offset; + ns->pages_total = sb->pages_total; + ns->sb = sb; ++ /* increase by __free_space() */ + ns->free = 0; + ns->bdev = bdev; + ns->set = global_nvmpg_set; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index d03f3241b45a..e089936e7f13 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -93,6 +93,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); + int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); ++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + + #else + +@@ -113,6 +114,8 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return 0; + } + ++static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { } ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch b/for-next/nvmpg-bcache-journaling-v13/old/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch new file mode 100644 index 0000000..0a77f35 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0006-bcache-get-recs-list-head-for-allocated-pages-by-spe.patch @@ -0,0 +1,66 @@ +From 3440789a920beb6e63493eecde279b6902ac0a1a Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 21:06:03 +0800 +Subject: [PATCH 06/12] bcache: get recs list head for allocated pages by + specific uuid + +This patch implements bch_get_nvmpg_head() of the buddy allocator +to be used to get recs list head for allocated pages by specific +uuid. Then the requester (owner) can find all previous allocated +nvdimm pages by iterating the recs list. 
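
As a rough illustration (not from the patch itself), a requester holding the head could walk its record list like this; the helper name count_requester_pages() is made up, locking on the global set is omitted, and the walk mirrors what find_journal_nvmpg_base() does in the journal initialization patch later in this series.

static unsigned long count_requester_pages(const char *uuid)
{
	struct bch_nvmpg_head *head = bch_get_nvmpg_head(uuid);
	unsigned long pages = 0;
	int i, j;

	if (!head)
		return 0;

	for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
		struct bch_nvmpg_recs *recs =
			bch_nvmpg_offset_to_ptr(head->recs_offset[i]);

		/* Follow the per-namespace chain of record tables */
		while (recs) {
			for (j = 0; j < recs->size; j++) {
				struct bch_nvmpg_rec *rec = &recs->recs[j];

				if (rec->_v)	/* non-zero: slot is in use */
					pages += 1UL << rec->order;
			}
			recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
		}
	}

	return pages;
}
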
+ +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 5 +++++ + drivers/md/bcache/nvmpg.h | 6 ++++++ + 2 files changed, 11 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 9864436a45cc..3c50cb09bb7a 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -539,6 +539,11 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return nvmpg_offset; + } + ++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) ++{ ++ return find_nvmpg_head(uuid, false); ++} ++ + static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns) + { + return ns->base_addr + (ns->pages_total << PAGE_SHIFT); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index e089936e7f13..2361cabf18be 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -94,6 +94,7 @@ int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); ++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); + + #else + +@@ -116,6 +117,11 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + + static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { } + ++static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) ++{ ++ return NULL; ++} ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch b/for-next/nvmpg-bcache-journaling-v13/old/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch new file mode 100644 index 0000000..f2880af --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-.patch @@ -0,0 +1,48 @@ +From 80d34e8aba0591ad58f1c3336333b48c715e3a69 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 25 Jun 2021 00:17:02 +0800 +Subject: [PATCH 07/12] bcache: use bucket index to set GC_MARK_METADATA for + journal buckets in bch_btree_gc_finish() + +Currently the meta data bucket locations on cache device are reserved +after the meta data stored on NVDIMM pages, for the meta data layout +consistentcy temporarily. So these buckets are still marked as meta data +by SET_GC_MARK() in bch_btree_gc_finish(). + +When BCH_FEATURE_INCOMPAT_NVDIMM_META is set, the sb.d[] stores linear +address of NVDIMM pages and not bucket index anymore. Therefore we +should avoid to find bucket index from sb.d[], and directly use bucket +index from ca->sb.first_bucket to (ca->sb.first_bucket + +ca->sb.njournal_bucketsi) for setting the gc mark of journal bucket. 
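
For reference, the value kept in sb.d[] in that case is an nvmpg offset as described by nvmpg_format.h later in this series: the top 3 bits select a namespace and the low 61 bits are a byte offset inside that namespace's DAX mapping. The macro bodies below are a reconstruction for illustration only; the real BCH_NVMPG_OFFSET()/BCH_NVMPG_GET_*() definitions live in drivers/md/bcache/nvmpg.h and are not quoted on this page.

#define SKETCH_NVMPG_OFFSET_BITS	61

#define SKETCH_NVMPG_OFFSET(ns_id, off) \
	(((unsigned long)(ns_id) << SKETCH_NVMPG_OFFSET_BITS) | (off))
#define SKETCH_NVMPG_GET_NS_ID(o)	((o) >> SKETCH_NVMPG_OFFSET_BITS)
#define SKETCH_NVMPG_GET_OFFSET(o) \
	((o) & ((1UL << SKETCH_NVMPG_OFFSET_BITS) - 1))

/*
 * Decoding follows the steps documented in nvmpg_format.h: pick the
 * namespace from the high bits, then add the low bits to that
 * namespace's DAX mapping base address.
 */
static inline void *sketch_nvmpg_offset_to_ptr(struct bch_nvmpg_ns **ns_tbl,
					       unsigned long offset)
{
	struct bch_nvmpg_ns *ns = ns_tbl[SKETCH_NVMPG_GET_NS_ID(offset)];

	return ns ? ns->base_addr + SKETCH_NVMPG_GET_OFFSET(offset) : NULL;
}
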
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/btree.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 88c573eeb598..1a0ff117373f 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -1761,8 +1761,10 @@ static void bch_btree_gc_finish(struct cache_set *c) + ca = c->cache; + ca->invalidate_needs_gc = 0; + +- for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) +- SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); ++ /* Range [first_bucket, first_bucket + keys) is for journal buckets */ ++ for (i = ca->sb.first_bucket; ++ i < ca->sb.first_bucket + ca->sb.njournal_buckets; i++) ++ SET_GC_MARK(ca->buckets + i, GC_MARK_METADATA); + + for (k = ca->prio_buckets; + k < ca->prio_buckets + prio_buckets(ca) * 2; k++) +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch b/for-next/nvmpg-bcache-journaling-v13/old/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch new file mode 100644 index 0000000..30de10c --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-inc.patch @@ -0,0 +1,60 @@ +From c006ab9655e4834a858bb399e1bcd8a51668d79c Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 25 Jun 2021 00:18:31 +0800 +Subject: [PATCH 08/12] bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into + incompat feature set + +This patch adds BCH_FEATURE_INCOMPAT_NVDIMM_META (value 0x0004) into the +incompat feature set. When this bit is set by bcache-tools, it indicates +bcache meta data should be stored on specific NVDIMM meta device. + +The bcache meta data mainly includes journal and btree nodes, when this +bit is set in incompat feature set, bcache will ask the nvm-pages +allocator for NVDIMM space to store the meta data. 
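
The reason the new bit is added to BCH_FEATURE_INCOMPAT_SUPP only when CONFIG_BCACHE_NVM_PAGES is enabled: registration refuses any super block carrying an incompat bit outside the supported mask, roughly as the existing helper in features.h does (paraphrased below). A kernel built without NVDIMM support therefore rejects such a cache device instead of silently ignoring NVDIMM-resident meta data.

static inline bool sketch_has_unknown_incompat_features(struct cache_sb *sb)
{
	/* Any incompat bit outside the supported mask is unknown */
	return (sb->feature_incompat & ~BCH_FEATURE_INCOMPAT_SUPP) != 0;
}
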
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/features.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h +index 09161b89c63e..fab92678be76 100644 +--- a/drivers/md/bcache/features.h ++++ b/drivers/md/bcache/features.h +@@ -18,11 +18,19 @@ + #define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET 0x0001 + /* real bucket size is (1 << bucket_size) */ + #define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE 0x0002 ++/* store bcache meta data on nvdimm */ ++#define BCH_FEATURE_INCOMPAT_NVDIMM_META 0x0004 + + #define BCH_FEATURE_COMPAT_SUPP 0 + #define BCH_FEATURE_RO_COMPAT_SUPP 0 ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++#define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ ++ BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE| \ ++ BCH_FEATURE_INCOMPAT_NVDIMM_META) ++#else + #define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ + BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE) ++#endif + + #define BCH_HAS_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_compat & (mask)) +@@ -90,6 +98,7 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \ + + BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET); + BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE); ++BCH_FEATURE_INCOMPAT_FUNCS(nvdimm_meta, NVDIMM_META); + + static inline bool bch_has_unknown_compat_features(struct cache_sb *sb) + { +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch b/for-next/nvmpg-bcache-journaling-v13/old/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch new file mode 100644 index 0000000..a56c25c --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0009-bcache-initialize-bcache-journal-for-NVDIMM-meta-dev.patch @@ -0,0 +1,255 @@ +From 09fdf9edf79edd718035e6d9afa75f80f1d3a330 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 21 Oct 2021 21:39:18 +0800 +Subject: [PATCH 09/12] bcache: initialize bcache journal for NVDIMM meta + device + +The nvm-pages allocator may store and index the NVDIMM pages allocated +for bcache journal. This patch adds the initialization to store bcache +journal space on NVDIMM pages if BCH_FEATURE_INCOMPAT_NVDIMM_META bit is +set by bcache-tools. + +If BCH_FEATURE_INCOMPAT_NVDIMM_META is set, get_nvdimm_journal_space() +will return the nvmpg_offset of NVDIMM pages for bcache journal, +- If there is previously allocated space, find it from nvm-pages owner + list and return to bch_journal_init(). +- If there is no previously allocated space, require a new NVDIMM range + from the nvm-pages allocator, and return it to bch_journal_init(). + +And in bch_journal_init(), keys in sb.d[] store the corresponding nvmpg +offset from NVDIMM into sb.d[i].ptr[0] where 'i' is the bucket index to +iterate all journal buckets. + +Later when bcache journaling code stores the journaling jset, the target +NVDIMM nvmpg offset stored (and updated) in sb.d[i].ptr[0] can be used +to calculate the linear address in memory copy from DRAM pages into +NVDIMM pages. 
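
A worked example of the allocation size, with assumed (not mandated) geometry of 4 KiB pages (PAGE_SECTORS == 8), 1 MiB buckets (bucket_size == 2048 sectors) and njournal_buckets == 256; the expression is the one used by get_journal_nvmpg_space() in this patch.

/*
 * order = ilog2((bucket_size * njournal_buckets) / PAGE_SECTORS)
 *       = ilog2((2048 * 256) / 8)
 *       = ilog2(65536) = 16
 *
 * so bch_nvmpg_alloc_pages(16, ca->sb.set_uuid) asks the buddy allocator
 * for one contiguous 2^16-page (256 MiB) range, which matches the
 * 256MB/512MB journal ranges mentioned in the cover letter of this series.
 */
order = ilog2((ca->sb.bucket_size * ca->sb.njournal_buckets) / PAGE_SECTORS);
nvmpg_offset = bch_nvmpg_alloc_pages(order, ca->sb.set_uuid);
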
+ +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 113 ++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/journal.h | 2 +- + drivers/md/bcache/nvmpg.c | 9 +++ + drivers/md/bcache/nvmpg.h | 1 + + drivers/md/bcache/super.c | 18 +++--- + 5 files changed, 132 insertions(+), 11 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 61bd79babf7a..d887557c718e 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -9,6 +9,8 @@ + #include "btree.h" + #include "debug.h" + #include "extents.h" ++#include "nvmpg.h" ++#include "features.h" + + #include <trace/events/bcache.h> + +@@ -982,3 +984,114 @@ int bch_journal_alloc(struct cache_set *c) + + return 0; + } ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static unsigned long find_journal_nvmpg_base(struct bch_nvmpg_head *nvmpg_head, ++ struct cache *ca) ++{ ++ unsigned long jnl_offset, jnl_pgoff, jnl_ns_id; ++ unsigned long ret_offset = 0; ++ int i; ++ ++ jnl_offset = (unsigned long)ca->sb.d[0]; ++ jnl_ns_id = BCH_NVMPG_GET_NS_ID(jnl_offset); ++ jnl_pgoff = BCH_NVMPG_GET_OFFSET(jnl_offset) >> PAGE_SHIFT; ++ ++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { ++ struct bch_nvmpg_recs *recs; ++ struct bch_nvmpg_rec *rec; ++ unsigned long recs_offset = 0; ++ int j; ++ ++ recs_offset = nvmpg_head->recs_offset[i]; ++ recs = bch_nvmpg_offset_to_ptr(recs_offset); ++ while (recs) { ++ for (j = 0; j < recs->size; j++) { ++ rec = &recs->recs[j]; ++ if ((rec->pgoff != jnl_pgoff) || ++ (rec->ns_id != jnl_ns_id)) ++ continue; ++ ++ ret_offset = jnl_offset; ++ goto out; ++ } ++ recs_offset = recs->next_offset; ++ recs = bch_nvmpg_offset_to_ptr(recs_offset); ++ } ++ } ++ ++out: ++ return ret_offset; ++} ++ ++static unsigned long get_journal_nvmpg_space(struct cache *ca) ++{ ++ struct bch_nvmpg_head *head = NULL; ++ unsigned long nvmpg_offset; ++ int order; ++ ++ head = bch_get_nvmpg_head(ca->sb.set_uuid); ++ if (head) { ++ nvmpg_offset = find_journal_nvmpg_base(head, ca); ++ if (nvmpg_offset) ++ goto found; ++ } ++ ++ order = ilog2((ca->sb.bucket_size * ++ ca->sb.njournal_buckets) / PAGE_SECTORS); ++ nvmpg_offset = bch_nvmpg_alloc_pages(order, ca->sb.set_uuid); ++ if (nvmpg_offset) ++ memset(bch_nvmpg_offset_to_ptr(nvmpg_offset), ++ 0, (1 << order) * PAGE_SIZE); ++found: ++ return nvmpg_offset; ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++static int __bch_journal_nvdimm_init(struct cache *ca) ++{ ++ int ret = -1; ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ int i; ++ unsigned long jnl_base = 0; ++ ++ jnl_base = get_journal_nvmpg_space(ca); ++ if (!jnl_base) { ++ pr_err("Failed to get journal space from nvdimm\n"); ++ goto out; ++ } ++ ++ /* Iniialized and reloaded from on-disk super block already */ ++ if (ca->sb.d[0] != 0) ++ goto out; ++ ++ for (i = 0; i < ca->sb.keys; i++) ++ ca->sb.d[i] = jnl_base + (bucket_bytes(ca) * i); ++ ++ ret = 0; ++out: ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++ return ret; ++} ++ ++ ++int bch_journal_init(struct cache_set *c) ++{ ++ int i, ret = 0; ++ struct cache *ca = c->cache; ++ ++ ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, ++ 2, SB_JOURNAL_BUCKETS); ++ ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) { ++ for (i = 0; i < ca->sb.keys; i++) ++ ca->sb.d[i] = ca->sb.first_bucket + i; ++ } else ++ ret = 
__bch_journal_nvdimm_init(ca); ++ ++ return ret; ++} +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index f2ea34d5f431..e3a7fa5a8fda 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -179,7 +179,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list); + void bch_journal_meta(struct cache_set *c, struct closure *cl); + int bch_journal_read(struct cache_set *c, struct list_head *list); + int bch_journal_replay(struct cache_set *c, struct list_head *list); +- ++int bch_journal_init(struct cache_set *c); + void bch_journal_free(struct cache_set *c); + int bch_journal_alloc(struct cache_set *c); + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 3c50cb09bb7a..2d0808a83f86 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -24,6 +24,15 @@ + + struct bch_nvmpg_set *global_nvmpg_set; + ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id) ++{ ++ if ((ns_id >= 0) && (ns_id < BCH_NVMPG_NS_MAX)) ++ return global_nvmpg_set->ns_tbl[ns_id]; ++ ++ pr_emerg("Invalid ns_id: %d\n", ns_id); ++ return NULL; ++} ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset) + { + int ns_id = BCH_NVMPG_GET_NS_ID(offset); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 2361cabf18be..f7b7177cced3 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -95,6 +95,7 @@ void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id); + + #else + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 74d51a0b806f..a27fa65d8832 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -147,9 +147,11 @@ static const char *read_super_common(struct cache_sb *sb, struct block_device * + goto err; + + err = "Journal buckets not sequential"; +- for (i = 0; i < sb->keys; i++) +- if (sb->d[i] != sb->first_bucket + i) +- goto err; ++ if (!bch_has_feature_nvdimm_meta(sb)) { ++ for (i = 0; i < sb->keys; i++) ++ if (sb->d[i] != sb->first_bucket + i) ++ goto err; ++ } + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) +@@ -2068,14 +2070,10 @@ static int run_cache_set(struct cache_set *c) + if (bch_journal_replay(c, &journal)) + goto err; + } else { +- unsigned int j; +- + pr_notice("invalidating existing data\n"); +- ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, +- 2, SB_JOURNAL_BUCKETS); +- +- for (j = 0; j < ca->sb.keys; j++) +- ca->sb.d[j] = ca->sb.first_bucket + j; ++ err = "error initializing journal"; ++ if (bch_journal_init(c)) ++ goto err; + + bch_initial_gc_finish(c); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch b/for-next/nvmpg-bcache-journaling-v13/old/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch new file mode 100644 index 0000000..99e53f3 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0010-bcache-support-storing-bcache-journal-into-NVDIMM-me.patch @@ -0,0 +1,231 @@ +From ab08690b14942f881d545539e83762a6fa794131 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:45:23 +0800 +Subject: [PATCH 10/12] bcache: support storing bcache journal into NVDIMM meta + device + +This patch implements two methods to 
store bcache journal to, +1) __journal_write_unlocked() for block interface device + The latency method to compose bio and issue the jset bio to cache + device (e.g. SSD). c->journal.key.ptr[0] indicates the LBA on cache + device to store the journal jset. +2) __journal_nvdimm_write_unlocked() for memory interface NVDIMM + Use memory interface to access NVDIMM pages and store the jset by + memcpy_flushcache(). c->journal.key.ptr[0] indicates the linear + address from the NVDIMM pages to store the journal jset. + +For legacy configuration without NVDIMM meta device, journal I/O is +handled by __journal_write_unlocked() with existing code logic. If the +NVDIMM meta device is used (by bcache-tools), the journal I/O will +be handled by __journal_nvdimm_write_unlocked() and go into the NVDIMM +pages. + +And when NVDIMM meta device is used, sb.d[] stores the linear addresses +from NVDIMM pages (no more bucket index), in journal_reclaim() the +journaling location in c->journal.key.ptr[0] should also be updated by +linear address from NVDIMM pages (no more LBA combined by sectors offset +and bucket index). + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 120 +++++++++++++++++++++++++----------- + drivers/md/bcache/super.c | 3 +- + 2 files changed, 85 insertions(+), 38 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index d887557c718e..7d5c5ed18890 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -596,6 +596,8 @@ static void do_journal_discard(struct cache *ca) + return; + } + ++ BUG_ON(bch_has_feature_nvdimm_meta(&ca->sb)); ++ + switch (atomic_read(&ja->discard_in_flight)) { + case DISCARD_IN_FLIGHT: + return; +@@ -661,9 +663,16 @@ static void journal_reclaim(struct cache_set *c) + goto out; + + ja->cur_idx = next; +- k->ptr[0] = MAKE_PTR(0, +- bucket_to_sector(c, ca->sb.d[ja->cur_idx]), +- ca->sb.nr_this_dev); ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ k->ptr[0] = MAKE_PTR(0, ++ bucket_to_sector(c, ca->sb.d[ja->cur_idx]), ++ ca->sb.nr_this_dev); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ k->ptr[0] = (unsigned long)bch_nvmpg_offset_to_ptr( ++ ca->sb.d[ja->cur_idx]); ++#endif ++ + atomic_long_inc(&c->reclaimed_journal_buckets); + + bkey_init(k); +@@ -729,46 +738,21 @@ static void journal_write_unlock(struct closure *cl) + spin_unlock(&c->journal.lock); + } + +-static void journal_write_unlocked(struct closure *cl) ++ ++static void __journal_write_unlocked(struct cache_set *c) + __releases(c->journal.lock) + { +- struct cache_set *c = container_of(cl, struct cache_set, journal.io); +- struct cache *ca = c->cache; +- struct journal_write *w = c->journal.cur; + struct bkey *k = &c->journal.key; +- unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * +- ca->sb.block_size; +- ++ struct journal_write *w = c->journal.cur; ++ struct closure *cl = &c->journal.io; ++ struct cache *ca = c->cache; + struct bio *bio; + struct bio_list list; ++ unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * ++ ca->sb.block_size; + + bio_list_init(&list); + +- if (!w->need_write) { +- closure_return_with_destructor(cl, journal_write_unlock); +- return; +- } else if (journal_full(&c->journal)) { +- journal_reclaim(c); +- 
spin_unlock(&c->journal.lock); +- +- btree_flush_write(c); +- continue_at(cl, journal_write, bch_journal_wq); +- return; +- } +- +- c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); +- +- w->data->btree_level = c->root->level; +- +- bkey_copy(&w->data->btree_root, &c->root->key); +- bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); +- +- w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; +- w->data->magic = jset_magic(&ca->sb); +- w->data->version = BCACHE_JSET_VERSION; +- w->data->last_seq = last_seq(&c->journal); +- w->data->csum = csum_set(w->data); +- + for (i = 0; i < KEY_PTRS(k); i++) { + ca = c->cache; + bio = &ca->journal.bio; +@@ -793,7 +777,6 @@ static void journal_write_unlocked(struct closure *cl) + + ca->journal.seq[ca->journal.cur_idx] = w->data->seq; + } +- + /* If KEY_PTRS(k) == 0, this jset gets lost in air */ + BUG_ON(i == 0); + +@@ -805,6 +788,71 @@ static void journal_write_unlocked(struct closure *cl) + + while ((bio = bio_list_pop(&list))) + closure_bio_submit(c, bio, cl); ++} ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static void __journal_nvdimm_write_unlocked(struct cache_set *c) ++ __releases(c->journal.lock) ++{ ++ struct journal_write *w = c->journal.cur; ++ struct cache *ca = c->cache; ++ unsigned int sectors; ++ ++ sectors = set_blocks(w->data, block_bytes(ca)) * ca->sb.block_size; ++ atomic_long_add(sectors, &ca->meta_sectors_written); ++ ++ memcpy_flushcache((void *)c->journal.key.ptr[0], w->data, sectors << 9); ++ ++ c->journal.key.ptr[0] += sectors << 9; ++ ca->journal.seq[ca->journal.cur_idx] = w->data->seq; ++ ++ atomic_dec_bug(&fifo_back(&c->journal.pin)); ++ bch_journal_next(&c->journal); ++ journal_reclaim(c); ++ ++ spin_unlock(&c->journal.lock); ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++static void journal_write_unlocked(struct closure *cl) ++{ ++ struct cache_set *c = container_of(cl, struct cache_set, journal.io); ++ struct cache *ca = c->cache; ++ struct journal_write *w = c->journal.cur; ++ ++ if (!w->need_write) { ++ closure_return_with_destructor(cl, journal_write_unlock); ++ return; ++ } else if (journal_full(&c->journal)) { ++ journal_reclaim(c); ++ spin_unlock(&c->journal.lock); ++ ++ btree_flush_write(c); ++ continue_at(cl, journal_write, bch_journal_wq); ++ return; ++ } ++ ++ c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); ++ ++ w->data->btree_level = c->root->level; ++ ++ bkey_copy(&w->data->btree_root, &c->root->key); ++ bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); ++ ++ w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; ++ w->data->magic = jset_magic(&ca->sb); ++ w->data->version = BCACHE_JSET_VERSION; ++ w->data->last_seq = last_seq(&c->journal); ++ w->data->csum = csum_set(w->data); ++ ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ __journal_write_unlocked(c); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ __journal_nvdimm_write_unlocked(c); ++#endif + + continue_at(cl, journal_write_done, NULL); + } +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index a27fa65d8832..45b69ddc9cfa 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1679,7 +1679,7 @@ void bch_cache_set_release(struct kobject *kobj) + static void cache_set_free(struct closure *cl) + { + struct cache_set *c = container_of(cl, struct cache_set, cl); +- struct cache *ca; ++ struct cache *ca = c->cache; + + debugfs_remove(c->debug); + +@@ -1691,7 +1691,6 @@ static void cache_set_free(struct closure *cl) + bch_bset_sort_state_free(&c->sort); + 
free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb))); + +- ca = c->cache; + if (ca) { + ca->set = NULL; + c->cache = NULL; +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch b/for-next/nvmpg-bcache-journaling-v13/old/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch new file mode 100644 index 0000000..77a4ae4 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0011-bcache-read-jset-from-NVDIMM-pages-for-journal-repla.patch @@ -0,0 +1,181 @@ +From 5b9accf31b16f6cc138754d8e77982092094a4ee Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:54:12 +0800 +Subject: [PATCH 11/12] bcache: read jset from NVDIMM pages for journal replay + +This patch implements two methods to read jset from media for journal +replay, +- __jnl_rd_bkt() for block device + This is the legacy method to read jset via block device interface. +- __jnl_rd_nvm_bkt() for NVDIMM + This is the method to read jset from NVDIMM memory interface, a.k.a + memcopy() from NVDIMM pages to DRAM pages. + +If BCH_FEATURE_INCOMPAT_NVDIMM_META is set in incompat feature set, +during running cache set, journal_read_bucket() will read the journal +content from NVDIMM by __jnl_rd_nvm_bkt(). The linear addresses of +NVDIMM pages to read jset are stored in sb.d[SB_JOURNAL_BUCKETS], which +were initialized and maintained in previous runs of the cache set. + +A thing should be noticed is, when bch_journal_read() is called, the +linear address of NVDIMM pages is not loaded and initialized yet, it +is necessary to call __bch_journal_nvdimm_init() before reading the jset +from NVDIMM pages. + +The code comments added in journal_read_bucket() is noticed by kernel +test robot and Dan Carpenter, it explains why it is safe to only check +!bch_has_feature_nvdimm_meta() condition in the if() statement when +CONFIG_BCACHE_NVM_PAGES is not configured. To avoid confusion from the +bogus warning message from static checking tool. 
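
Taken together with the previous patch, the two directions reduce to a pair of memory copies over the DAX mapping; only the store side needs memcpy_flushcache() so the jset reaches the persistence domain, while replay is a plain read. A simplified contrast, condensed from the code in this and the previous patch:

/* Store path, __journal_nvdimm_write_unlocked() (previous patch):
 * c->journal.key.ptr[0] already holds the linear NVDIMM address. */
memcpy_flushcache((void *)c->journal.key.ptr[0], w->data, sectors << 9);

/* Load path, __jnl_rd_nvm_bkt() (this patch): read the jset back into
 * the DRAM buffer journal.w[0].data for replay. */
memcpy(data, bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9),
       len << 9);
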
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reported-by: kernel test robot <lkp@intel.com> +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 88 ++++++++++++++++++++++++++++++------- + 1 file changed, 71 insertions(+), 17 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 7d5c5ed18890..902992be9191 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -34,18 +34,60 @@ static void journal_read_endio(struct bio *bio) + closure_put(cl); + } + ++static struct jset *__jnl_rd_bkt(struct cache *ca, unsigned int bkt_idx, ++ unsigned int len, unsigned int offset, ++ struct closure *cl) ++{ ++ sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bkt_idx]); ++ struct bio *bio = &ca->journal.bio; ++ struct jset *data = ca->set->journal.w[0].data; ++ ++ bio_reset(bio); ++ bio->bi_iter.bi_sector = bucket + offset; ++ bio_set_dev(bio, ca->bdev); ++ bio->bi_iter.bi_size = len << 9; ++ ++ bio->bi_end_io = journal_read_endio; ++ bio->bi_private = cl; ++ bio_set_op_attrs(bio, REQ_OP_READ, 0); ++ bch_bio_map(bio, data); ++ ++ closure_bio_submit(ca->set, bio, cl); ++ closure_sync(cl); ++ ++ /* Indeed journal.w[0].data */ ++ return data; ++} ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static struct jset *__jnl_rd_nvm_bkt(struct cache *ca, unsigned int bkt_idx, ++ unsigned int len, unsigned int offset) ++{ ++ void *jset_addr; ++ struct jset *data; ++ ++ jset_addr = bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9); ++ data = ca->set->journal.w[0].data; ++ ++ memcpy(data, jset_addr, len << 9); ++ ++ /* Indeed journal.w[0].data */ ++ return data; ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ + static int journal_read_bucket(struct cache *ca, struct list_head *list, + unsigned int bucket_index) + { + struct journal_device *ja = &ca->journal; +- struct bio *bio = &ja->bio; + + struct journal_replay *i; +- struct jset *j, *data = ca->set->journal.w[0].data; ++ struct jset *j; + struct closure cl; + unsigned int len, left, offset = 0; + int ret = 0; +- sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); + + closure_init_stack(&cl); + +@@ -55,26 +97,27 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, + reread: left = ca->sb.bucket_size - offset; + len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); + +- bio_reset(bio); +- bio->bi_iter.bi_sector = bucket + offset; +- bio_set_dev(bio, ca->bdev); +- bio->bi_iter.bi_size = len << 9; +- +- bio->bi_end_io = journal_read_endio; +- bio->bi_private = &cl; +- bio_set_op_attrs(bio, REQ_OP_READ, 0); +- bch_bio_map(bio, data); +- +- closure_bio_submit(ca->set, bio, &cl); +- closure_sync(&cl); ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ j = __jnl_rd_bkt(ca, bucket_index, len, offset, &cl); ++ /* ++ * If CONFIG_BCACHE_NVM_PAGES is not defined, the feature bit ++ * BCH_FEATURE_INCOMPAT_NVDIMM_META won't in incompatible ++ * support feature set, a cache device format with feature bit ++ * BCH_FEATURE_INCOMPAT_NVDIMM_META will fail much earlier in ++ * read_super() by bch_has_unknown_incompat_features(). ++ * Therefore when CONFIG_BCACHE_NVM_PAGES is not define, it is ++ * safe to ignore the bch_has_feature_nvdimm_meta() condition. 
++ */ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ j = __jnl_rd_nvm_bkt(ca, bucket_index, len, offset); ++#endif + + /* This function could be simpler now since we no longer write + * journal entries that overlap bucket boundaries; this means + * the start of a bucket will always have a valid journal entry + * if it has any journal entries at all. + */ +- +- j = data; + while (len) { + struct list_head *where; + size_t blocks, bytes = set_bytes(j); +@@ -170,6 +213,8 @@ reread: left = ca->sb.bucket_size - offset; + return ret; + } + ++static int __bch_journal_nvdimm_init(struct cache *ca); ++ + int bch_journal_read(struct cache_set *c, struct list_head *list) + { + #define read_bucket(b) \ +@@ -188,6 +233,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) + unsigned int i, l, r, m; + uint64_t seq; + ++ /* ++ * Linear addresses of NVDIMM pages for journaling is not ++ * initialized yet, do it before read jset from NVDIMM pages. ++ */ ++ if (bch_has_feature_nvdimm_meta(&ca->sb)) { ++ if (__bch_journal_nvdimm_init(ca) < 0) ++ return -ENXIO; ++ } ++ + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/old/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch b/for-next/nvmpg-bcache-journaling-v13/old/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch new file mode 100644 index 0000000..0ffc9a7 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/old/0012-bcache-add-sysfs-interface-register_nvdimm_meta-to-r.patch @@ -0,0 +1,84 @@ +From 55b8876f5fc3a3f097bca7f2b518e0dccd112905 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:55:25 +0800 +Subject: [PATCH 12/12] bcache: add sysfs interface register_nvdimm_meta to + register NVDIMM meta device + +This patch adds a sysfs interface register_nvdimm_meta to register +NVDIMM meta device. The sysfs interface file only shows up when +CONFIG_BCACHE_NVM_PAGES=y. 
Then a NVDIMM name space formatted by +bcache-tools can be registered into bcache by e.g., + echo /dev/pmem0 > /sys/fs/bcache/register_nvdimm_meta + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/super.c | 29 +++++++++++++++++++++++++++++ + 1 file changed, 29 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 45b69ddc9cfa..2b9cde44879b 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2405,10 +2405,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, size_t size); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++static ssize_t register_nvdimm_meta(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, size_t size); ++#endif + + kobj_attribute_write(register, register_bcache); + kobj_attribute_write(register_quiet, register_bcache); + kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++kobj_attribute_write(register_nvdimm_meta, register_nvdimm_meta); ++#endif + + static bool bch_is_open_backing(dev_t dev) + { +@@ -2522,6 +2530,24 @@ static void register_device_async(struct async_reg_args *args) + queue_delayed_work(system_wq, &args->reg_work, 10); + } + ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++static ssize_t register_nvdimm_meta(struct kobject *k, struct kobj_attribute *attr, ++ const char *buffer, size_t size) ++{ ++ ssize_t ret = size; ++ ++ struct bch_nvmpg_ns *ns = bch_register_namespace(buffer); ++ ++ if (IS_ERR(ns)) { ++ pr_err("register nvdimm namespace %s for meta device failed.\n", ++ buffer); ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++#endif ++ + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size) + { +@@ -2864,6 +2890,9 @@ static int __init bcache_init(void) + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ &ksysfs_register_nvdimm_meta.attr, ++#endif + &ksysfs_pendings_cleanup.attr, + NULL + }; +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0000-cover-letter.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0000-cover-letter.patch new file mode 100644 index 0000000..fa696e6 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0000-cover-letter.patch @@ -0,0 +1,125 @@ +From e1f37c78f682ca8d7d0dee51ee8a0ee884f92df5 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 12 Dec 2021 23:13:09 +0800 +Subject: [PATCH v13 00/12] bcache for 5.17: enable NVDIMM for bcache journal + +Hi Jens, + +This is the v12 effort the enabling NVDIMM for bcache journal, the code +is under testing for months and quite stable now. Please consider to +take them for Linux v5.17 merge window. + +All current code logic and on-media format are consistent with previous +v12 series. The major difference from v12 series include, +- more typos in code comments and commit logs are fixed. +- add kernel message to indicate only first range is used currently if + the NVDIMM namespace has multiple mapping ranges. 
+- do not export the nvm-pages allocator APIs; it is unnecessary since
+  currently only bcache uses them.
+
+Now all previously bcache related UAPI headers are moved into the bcache
+private code directory; no global headers are exported to either kernel
+or user space source code.
+
+Bcache uses the nvm-pages allocator to allocate pages from a NVDIMM
+namespace for its journaling space. The nvm-pages allocator is a
+buddy-like allocator, which allocates power-of-2 sized page ranges from
+the NVDIMM namespace. The user space tool 'bcache' has a newly added
+'-M' option to format a NVDIMM namespace and register it via the sysfs
+interface as a bcache meta device. The nvm-pages allocator code does a
+DAX mapping to map the whole namespace into the system's memory address
+range, and allocates the pages to requesters like a typical buddy
+allocator does. The major difference is that the nvm-pages allocator
+maintains the pages allocated to each requester in an allocation list
+which is stored on NVDIMM too. The allocation list of each requester is
+tracked by a pre-defined UUID; all the pages tracked in all allocation
+lists are treated as allocated busy pages and won't be released back
+into the buddy system after the system reboots.
+
+The bcache journal code may request a block of power-of-2 size pages
+from the nvm-pages allocator, normally a contiguous range of 256MB or
+512MB. During meta data journaling, the in-memory jsets are copied into
+the calculated NVDIMM pages location by the kernel memcpy routine. So
+the journaling I/Os won't go to the block device (e.g. SSD) anymore;
+the writes and reads of journal jsets happen on NVDIMM.
+
+Intel developers Jianpeng Ma and Qiaowei Ren composed the initial code
+of the nvm-pages allocator, the related patches are,
+- bcache: initialize the nvm-pages allocator
+- bcache: initialization of the buddy
+- bcache: bch_nvmpg_alloc_pages() of the buddy
+- bcache: bch_nvmpg_free_pages() of the buddy allocator
+- bcache: get recs list head for allocated pages by specific uuid
+All the code depends on the Linux libnvdimm and dax drivers; the bcache
+nvm-pages allocator can be treated as a user of these two drivers.
+
+I modify the bcache code to recognize the nvm meta device feature,
+initialize the journal on NVDIMM, and do journal I/Os on NVDIMM in the
+following patches,
+- bcache: add initial data structures for nvm pages
+- bcache: use bucket index to set GC_MARK_METADATA for journal buckets
+  in bch_btree_gc_finish()
+- bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into incompat feature set
+- bcache: initialize bcache journal for NVDIMM meta device
+- bcache: support storing bcache journal into NVDIMM meta device
+- bcache: read jset from NVDIMM pages for journal replay
+- bcache: add sysfs interface register_nvdimm_meta to register NVDIMM
+  meta device
+
+All the code is EXPERIMENTAL; it won't be enabled by default until we
+feel the NVDIMM support is complete and stable. The current code has
+been tested internally for months, and we don't observe any issue during
+all tests with or without the configuration enabled.
+
+Please consider picking this series for the Linux v5.17 merge window.
+If any issue is detected, we will respond in time and fix it ASAP.
+
+Thank you in advance.
+ +Coly Li + +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Ying Huang <ying.huang@intel.com> +--- + +Coly Li (7): + bcache: add initial data structures for nvm pages + bcache: use bucket index to set GC_MARK_METADATA for journal buckets + in bch_btree_gc_finish() + bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into incompat feature set + bcache: initialize bcache journal for NVDIMM meta device + bcache: support storing bcache journal into NVDIMM meta device + bcache: read jset from NVDIMM pages for journal replay + bcache: add sysfs interface register_nvdimm_meta to register NVDIMM + meta device + +Jianpeng Ma (5): + bcache: initialize the nvm pages allocator + bcache: initialization of the buddy + bcache: bch_nvmpg_alloc_pages() of the buddy + bcache: bch_nvmpg_free_pages() of the buddy allocator + bcache: get recs list head for allocated pages by specific uuid + + drivers/md/bcache/Kconfig | 10 + + drivers/md/bcache/Makefile | 1 + + drivers/md/bcache/btree.c | 6 +- + drivers/md/bcache/features.h | 9 + + drivers/md/bcache/journal.c | 321 +++++++++-- + drivers/md/bcache/journal.h | 2 +- + drivers/md/bcache/nvmpg.c | 931 +++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 128 +++++ + drivers/md/bcache/nvmpg_format.h | 253 +++++++++ + drivers/md/bcache/super.c | 53 +- + 10 files changed, 1646 insertions(+), 68 deletions(-) + create mode 100644 drivers/md/bcache/nvmpg.c + create mode 100644 drivers/md/bcache/nvmpg.h + create mode 100644 drivers/md/bcache/nvmpg_format.h + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0001-bcache-add-initial-data-structures-for-nvm-pages.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0001-bcache-add-initial-data-structures-for-nvm-pages.patch new file mode 100644 index 0000000..14b3695 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0001-bcache-add-initial-data-structures-for-nvm-pages.patch @@ -0,0 +1,343 @@ +From 0ecd02239e1e7fc12115fda644810ee88bf26dff Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 26 Jul 2021 00:26:28 +0800 +Subject: [PATCH v13 01/12] bcache: add initial data structures for nvm pages + +This patch initializes the prototype data structures for nvm pages +allocator, + +- struct bch_nvmpg_sb + This is the super block allocated on each nvdimm namespace for the nvm +pages allocator. A nvdimm pages allocator set may have multiple name- +spaces, bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this +namespace belongs to. + +- struct bch_nvmpg_header + This is a table for all heads of all allocation record lists. An allo- +cation record list traces all page(s) allocated from nvdimm namespace(s) +to a specific requester (identified by uuid). After system reboot, a +requester can retrieve all previously allocated nvdimm pages from its +record list by a pre-defined uuid. + +- struct bch_nvmpg_head + This is a head of an allocation record list. Each nvdimm pages +requester (typically it's a driver) has and only has one allocation +record list, and an allocated nvdimm page only belongs to a specific +allocation record list. Member uuid[] will be set as the requester's +uuid, e.g. for bcache it is the cache set uuid. Member label is not +mandatory, it is a human-readable string for debug purpose. 
The nvm +offset format pointers recs_offset[] point to the location of actual +allocator record lists on each namespace of the nvdimm pages allocator +set. Each per namespace record list is represented by the following +struct bch_nvmpg_recs. + +- struct bch_nvmpg_recs + This structure represents a requester's allocation record list. Member +uuid is same value as the uuid of its corresponding struct +bch_nvmpg_head. Member recs[] is a table of struct bch_pgalloc_rec +objects to trace all allocated nvmdimm pages. If the table recs[] is +full, the nvmpg format offset is a pointer points to the next struct +bch_nvmpg_recs object, nvm pages allocator will look for available free +allocation record there. All the linked struct bch_nvmpg_recs objects +compose a requester's allocation record list which is headed by the +above struct bch_nvmpg_head. + +- struct bch_nvmpg_rec + This structure records a range of allocated nvdimm pages. Member pgoff +is offset in unit of page size of this allocation range. Member order +indicates size of the allocation range by (1 << order) in unit of page +size. Because the nvdimm pages allocator set may have multiple nvdimm +namespaces, member ns_id is used to identify which namespace the pgoff +belongs to. + - Bits 0 - 51: pgoff - is pages offset of the allocated pages. + - Bits 52 - 57: order - allocated size in page_size * order-of-2 + - Bits 58 - 60: ns_id - identify which namespace the pages stays on + - Bits 61 - 63: reserved. +Since each of the allocated nvm pages are power of 2, using 6 bits to +represent allocated size can have (1<<(1<<64) - 1) * PAGE_SIZE maximum +value. It can be a 76 bits width range size in byte for 4KB page size, +which is large enough currently. + +All the structure members having _offset suffix are in a special format. +E.g. bch_nvmpg_sb.{sb_offset, pages_offset, set_header_offset}, +bch_nvmpg_head.recs_offset, bch_nvmpg_recs.{head_offset, next_offset}, +the offset value is 64bit, the most significant 3 bits are used to +identify which namespace this offset belongs to, and the rested 61 bits +are actual offset inside the namespace. Following patches will have +helper routines to do the conversion between memory pointer and offset. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Ying Huang <ying.huang@intel.com> +--- + drivers/md/bcache/nvmpg_format.h | 253 +++++++++++++++++++++++++++++++ + 1 file changed, 253 insertions(+) + create mode 100644 drivers/md/bcache/nvmpg_format.h + +diff --git a/drivers/md/bcache/nvmpg_format.h b/drivers/md/bcache/nvmpg_format.h +new file mode 100644 +index 000000000000..414bcafa31ee +--- /dev/null ++++ b/drivers/md/bcache/nvmpg_format.h +@@ -0,0 +1,253 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++ ++#ifndef _NVMPG_FORMAT_H ++#define _NVMPG_FORMAT_H ++ ++/* ++ * Bcache on NVDIMM data structures ++ */ ++ ++/* ++ * - struct bch_nvmpg_sb ++ * This is the super block allocated on each nvdimm namespace for the nvm ++ * pages allocator. A nvdimm pages allocator set may have multiple namespaces, ++ * bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this name space ++ * belongs to. ++ * ++ * - struct bch_nvmpg_header ++ * This is a table for all heads of all allocation record lists. 
An allo- ++ * cation record list traces all page(s) allocated from nvdimm namespace(s) to ++ * a specific requester (identified by uuid). After system reboot, a requester ++ * can retrieve all previously allocated nvdimm pages from its record list by a ++ * pre-defined uuid. ++ * ++ * - struct bch_nvmpg_head ++ * This is a head of an allocation record list. Each nvdimm pages requester ++ * (typically it's a driver) has and only has one allocation record list, and ++ * an allocated nvdimm page only bedlones to a specific allocation record list. ++ * Member uuid[] will be set as the requester's uuid, e.g. for bcache it is the ++ * cache set uuid. Member label is not mandatory, it is a human-readable string ++ * for debug purpose. The nvm offset format pointers recs_offset[] point to the ++ * location of actual allocator record lists on each name space of the nvdimm ++ * pages allocator set. Each per name space record list is represented by the ++ * following struct bch_nvmpg_recs. ++ * ++ * - struct bch_nvmpg_recs ++ * This structure represents a requester's allocation record list. Member uuid ++ * is same value as the uuid of its corresponding struct bch_nvmpg_head. Member ++ * recs[] is a table of struct bch_pgalloc_rec objects to trace all allocated ++ * nvmdimm pages. If the table recs[] is full, the nvmpg format offset is a ++ * pointer points to the next struct bch_nvmpg_recs object, nvm pages allocator ++ * will look for available free allocation record there. All the linked ++ * struct bch_nvmpg_recs objects compose a requester's allocation record list ++ * which is headed by the above struct bch_nvmpg_head. ++ * ++ * - struct bch_nvmpg_rec ++ * This structure records a range of allocated nvdimm pages. Member pgoff is ++ * offset in unit of page size of this allocation range. Member order indicates ++ * size of the allocation range by (1 << order) in unit of page size. Because ++ * the nvdimm pages allocator set may have multiple nvdimm name spaces, member ++ * ns_id is used to identify which name space the pgoff belongs to. ++ * ++ * All allocation record lists are stored on the first initialized nvdimm name- ++ * space (ns_id 0). The meta data default layout of nvm pages allocator on ++ * namespace 0 is, ++ * ++ * 0 +---------------------------------+ ++ * | | ++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET ++ * | bch_nvmpg_sb | ++ * 8KB +---------------------------------+ <-- BCH_NVMPG_RECLIST_HEAD_OFFSET ++ * | bch_nvmpg_header | ++ * | | ++ * 16KB +---------------------------------+ <-- BCH_NVMPG_SYSRECS_OFFSET ++ * | bch_nvmpg_recs | ++ * | (nvm pages internal usage) | ++ * 24KB +---------------------------------+ ++ * | | ++ * | | ++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START ++ * | allocable nvm pages | ++ * | for buddy allocator | ++ * end +---------------------------------+ ++ * ++ * ++ * ++ * Meta data default layout on rested nvdimm namespaces, ++ * ++ * 0 +---------------------------------+ ++ * | | ++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET ++ * | bch_nvmpg_sb | ++ * 8KB +---------------------------------+ ++ * | | ++ * | | ++ * | | ++ * | | ++ * | | ++ * | | ++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START ++ * | allocable nvm pages | ++ * | for buddy allocator | ++ * end +---------------------------------+ ++ * ++ * ++ * - The nvmpg offset format pointer ++ * All member names ending with _offset in this header are nvmpg offset ++ * format pointer. 
The offset format is, ++ * [highest 3 bits: ns_id] ++ * [rested 61 bits: offset in No. ns_id namespace] ++ * ++ * The above offset is byte unit, the procedure to reference a nvmpg offset ++ * format pointer is, ++ * 1) Identify the namespace related in-memory structure by ns_id from the ++ * highest 3 bits of offset value. ++ * 2) Get the DAX mapping base address from the in-memory structure. ++ * 3) Calculate the actual memory address on nvdimm by plusing the DAX base ++ * address with offset value in rested low 61 bits. ++ * All related in-memory structure and conversion routines don't belong to ++ * user space api, they are defined by nvm-pages allocator code in ++ * drivers/md/bcache/nvm-pages.{c,h} ++ * ++ */ ++ ++#include <linux/types.h> ++ ++/* In sectors */ ++#define BCH_NVMPG_SB_OFFSET 4096 ++#define BCH_NVMPG_START (16 << 20) ++ ++#define BCH_NVMPG_LBL_SIZE 32 ++#define BCH_NVMPG_NS_MAX 8 ++ ++#define BCH_NVMPG_RECLIST_HEAD_OFFSET (8<<10) ++#define BCH_NVMPG_SYSRECS_OFFSET (16<<10) ++ ++#define BCH_NVMPG_SB_VERSION 0 ++#define BCH_NVMPG_SB_VERSION_MAX 0 ++ ++static const __u8 bch_nvmpg_magic[] = { ++ 0x17, 0xbd, 0x53, 0x7f, 0x1b, 0x23, 0xd6, 0x83, ++ 0x46, 0xa4, 0xf8, 0x28, 0x17, 0xda, 0xec, 0xa9 }; ++static const __u8 bch_nvmpg_recs_magic[] = { ++ 0x39, 0x25, 0x3f, 0xf7, 0x27, 0x17, 0xd0, 0xb9, ++ 0x10, 0xe6, 0xd2, 0xda, 0x38, 0x68, 0x26, 0xae }; ++ ++/* takes 64bit width */ ++struct bch_nvmpg_rec { ++ union { ++ struct { ++ __u64 pgoff:52; ++ __u64 order:6; ++ __u64 ns_id:3; ++ __u64 reserved:3; ++ }; ++ __u64 _v; ++ }; ++}; ++ ++struct bch_nvmpg_recs { ++ union { ++ struct { ++ /* ++ * A nvmpg offset format pointer to ++ * struct bch_nvmpg_head ++ */ ++ __u64 head_offset; ++ /* ++ * A nvmpg offset format pointer to ++ * struct bch_nvm_pgalloc_recs which contains ++ * the next recs[] array. ++ */ ++ __u64 next_offset; ++ __u8 magic[16]; ++ __u8 uuid[16]; ++ __u32 size; ++ __u32 used; ++ __u64 _pad[4]; ++ struct bch_nvmpg_rec recs[]; ++ }; ++ __u8 pad[8192]; ++ }; ++}; ++ ++#define BCH_NVMPG_MAX_RECS \ ++ ((sizeof(struct bch_nvmpg_recs) - \ ++ offsetof(struct bch_nvmpg_recs, recs)) / \ ++ sizeof(struct bch_nvmpg_rec)) ++ ++#define BCH_NVMPG_HD_STAT_FREE 0x0 ++#define BCH_NVMPG_HD_STAT_ALLOC 0x1 ++struct bch_nvmpg_head { ++ __u8 uuid[16]; ++ __u8 label[BCH_NVMPG_LBL_SIZE]; ++ __u32 state; ++ __u32 flags; ++ /* ++ * Array of offset values from the nvmpg offset format ++ * pointers, each of the pointer points to a per-namespace ++ * struct bch_nvmpg_recs. 
++ */ ++ __u64 recs_offset[BCH_NVMPG_NS_MAX]; ++}; ++ ++/* heads[0] is always for nvm_pages internal usage */ ++struct bch_nvmpg_set_header { ++ union { ++ struct { ++ __u32 size; ++ __u32 used; ++ __u64 _pad[4]; ++ struct bch_nvmpg_head heads[]; ++ }; ++ __u8 pad[8192]; ++ }; ++}; ++ ++#define BCH_NVMPG_MAX_HEADS \ ++ ((sizeof(struct bch_nvmpg_set_header) - \ ++ offsetof(struct bch_nvmpg_set_header, heads)) / \ ++ sizeof(struct bch_nvmpg_head)) ++ ++/* The on-media bit order is local CPU order */ ++struct bch_nvmpg_sb { ++ __u64 csum; ++ __u64 sb_offset; ++ __u64 ns_start; ++ __u64 version; ++ __u8 magic[16]; ++ __u8 uuid[16]; ++ __u32 page_size; ++ __u32 total_ns; ++ __u32 this_ns; ++ union { ++ __u8 set_uuid[16]; ++ __u64 set_magic; ++ }; ++ ++ __u64 flags; ++ __u64 seq; ++ ++ __u64 feature_compat; ++ __u64 feature_incompat; ++ __u64 feature_ro_compat; ++ ++ /* For allocable nvm pages from buddy systems */ ++ __u64 pages_offset; ++ __u64 pages_total; ++ ++ __u64 pad[8]; ++ ++ /* ++ * A nvmpg offset format pointer, it points ++ * to struct bch_nvmpg_set_header which is ++ * stored only on the first name space. ++ */ ++ __u64 set_header_offset; ++ ++ /* Just for csum_set() */ ++ __u32 keys; ++ __u64 d[0]; ++}; ++ ++#endif /* _NVMPG_FORMAT_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0002-bcache-initialize-the-nvm-pages-allocator.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0002-bcache-initialize-the-nvm-pages-allocator.patch new file mode 100644 index 0000000..54243a6 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0002-bcache-initialize-the-nvm-pages-allocator.patch @@ -0,0 +1,542 @@ +From e75f8de4ca87db06507e173d795f42d1c98468d4 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Mon, 26 Jul 2021 10:33:30 +0800 +Subject: [PATCH v13 02/12] bcache: initialize the nvm pages allocator + +This patch define the prototype data structures in memory and +initializes the nvm pages allocator. + +The nvm address space which is managed by this allocator can consist of +many nvm namespaces, and some namespaces can compose into one nvm set, +like cache set. For this initial implementation, only one set can be +supported. + +The users of this nvm pages allocator need to call register_namespace() +to register the nvdimm device (like /dev/pmemX) into this allocator as +the instance of struct nvm_namespace. + +Reported-by: Randy Dunlap <rdunlap@infradead.org> +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/Kconfig | 10 ++ + drivers/md/bcache/Makefile | 1 + + drivers/md/bcache/nvmpg.c | 340 +++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 97 +++++++++++ + drivers/md/bcache/super.c | 3 + + 5 files changed, 451 insertions(+) + create mode 100644 drivers/md/bcache/nvmpg.c + create mode 100644 drivers/md/bcache/nvmpg.h + +diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig +index cf3e8096942a..4a7c13e882bb 100644 +--- a/drivers/md/bcache/Kconfig ++++ b/drivers/md/bcache/Kconfig +@@ -36,3 +36,13 @@ config BCACHE_ASYNC_REGISTRATION + device path into this file will returns immediately and the real + registration work is handled in kernel work queue in asynchronous + way. 
++ ++config BCACHE_NVM_PAGES ++ bool "NVDIMM support for bcache (EXPERIMENTAL)" ++ depends on BCACHE ++ depends on 64BIT ++ depends on LIBNVDIMM ++ depends on DAX ++ help ++ Allocate/release NV-memory pages for bcache and provide allocated pages ++ for each requestor after system reboot. +diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile +index 5b87e59676b8..276b33be5ad5 100644 +--- a/drivers/md/bcache/Makefile ++++ b/drivers/md/bcache/Makefile +@@ -5,3 +5,4 @@ obj-$(CONFIG_BCACHE) += bcache.o + bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ + io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o features.o ++bcache-$(CONFIG_BCACHE_NVM_PAGES) += nvmpg.o +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +new file mode 100644 +index 000000000000..b654bbbda03e +--- /dev/null ++++ b/drivers/md/bcache/nvmpg.c +@@ -0,0 +1,340 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Nvdimm page-buddy allocator ++ * ++ * Copyright (c) 2021, Intel Corporation. ++ * Copyright (c) 2021, Qiaowei Ren <qiaowei.ren@intel.com>. ++ * Copyright (c) 2021, Jianpeng Ma <jianpeng.ma@intel.com>. ++ */ ++ ++#include "bcache.h" ++#include "nvmpg.h" ++ ++#include <linux/slab.h> ++#include <linux/list.h> ++#include <linux/mutex.h> ++#include <linux/dax.h> ++#include <linux/pfn_t.h> ++#include <linux/libnvdimm.h> ++#include <linux/mm_types.h> ++#include <linux/err.h> ++#include <linux/pagemap.h> ++#include <linux/bitmap.h> ++#include <linux/blkdev.h> ++ ++struct bch_nvmpg_set *global_nvmpg_set; ++ ++void *bch_nvmpg_offset_to_ptr(unsigned long offset) ++{ ++ int ns_id = BCH_NVMPG_GET_NS_ID(offset); ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[ns_id]; ++ ++ if (offset == 0) ++ return NULL; ++ ++ ns_id = BCH_NVMPG_GET_NS_ID(offset); ++ ns = global_nvmpg_set->ns_tbl[ns_id]; ++ ++ if (ns) ++ return (void *)(ns->base_addr + BCH_NVMPG_GET_OFFSET(offset)); ++ ++ pr_err("Invalid ns_id %u\n", ns_id); ++ return NULL; ++} ++ ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) ++{ ++ int ns_id = ns->ns_id; ++ unsigned long offset = (unsigned long)(ptr - ns->base_addr); ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ ++static void release_ns_tbl(struct bch_nvmpg_set *set) ++{ ++ int i; ++ struct bch_nvmpg_ns *ns; ++ ++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { ++ ns = set->ns_tbl[i]; ++ if (ns) { ++ fs_put_dax(ns->dax_dev); ++ blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); ++ set->ns_tbl[i] = NULL; ++ set->attached_ns--; ++ kfree(ns); ++ } ++ } ++ ++ if (set->attached_ns) ++ pr_err("unexpected attached_ns: %u\n", set->attached_ns); ++} ++ ++static void release_nvmpg_set(struct bch_nvmpg_set *set) ++{ ++ release_ns_tbl(set); ++ kfree(set); ++} ++ ++/* Namespace 0 contains all meta data of the nvmpg allocation set */ ++static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) ++{ ++ struct bch_nvmpg_set_header *set_header; ++ ++ if (ns->ns_id != 0) { ++ pr_err("unexpected ns_id %u for first nvmpg namespace.\n", ++ ns->ns_id); ++ return -EINVAL; ++ } ++ ++ set_header = bch_nvmpg_offset_to_ptr(ns->sb->set_header_offset); ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ global_nvmpg_set->set_header = set_header; ++ global_nvmpg_set->heads_size = set_header->size; ++ global_nvmpg_set->heads_used = set_header->used; ++ mutex_unlock(&global_nvmpg_set->lock); ++ ++ return 0; ++} ++ ++static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) ++{ ++ struct bch_nvmpg_sb *sb = ns->sb; ++ int rc = 0; ++ ++ 
mutex_lock(&global_nvmpg_set->lock); ++ ++ if (global_nvmpg_set->ns_tbl[sb->this_ns]) { ++ pr_err("ns_id %u already attached.\n", ns->ns_id); ++ rc = -EEXIST; ++ goto unlock; ++ } ++ ++ if (ns->ns_id != 0) { ++ pr_err("unexpected ns_id %u for first namespace.\n", ns->ns_id); ++ rc = -EINVAL; ++ goto unlock; ++ } ++ ++ if (global_nvmpg_set->attached_ns > 0) { ++ pr_err("multiple namespace attaching not supported yet\n"); ++ rc = -EOPNOTSUPP; ++ goto unlock; ++ } ++ ++ if ((global_nvmpg_set->attached_ns + 1) > sb->total_ns) { ++ pr_err("namespace counters error: attached %u > total %u\n", ++ global_nvmpg_set->attached_ns, ++ global_nvmpg_set->total_ns); ++ rc = -EINVAL; ++ goto unlock; ++ } ++ ++ memcpy(global_nvmpg_set->set_uuid, sb->set_uuid, 16); ++ global_nvmpg_set->ns_tbl[sb->this_ns] = ns; ++ global_nvmpg_set->attached_ns++; ++ global_nvmpg_set->total_ns = sb->total_ns; ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++ return rc; ++} ++ ++static int read_nvdimm_meta_super(struct block_device *bdev, ++ struct bch_nvmpg_ns *ns) ++{ ++ struct page *page; ++ struct bch_nvmpg_sb *sb; ++ uint64_t expected_csum = 0; ++ int r; ++ ++ page = read_cache_page_gfp(bdev->bd_inode->i_mapping, ++ BCH_NVMPG_SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); ++ ++ if (IS_ERR(page)) ++ return -EIO; ++ ++ sb = (struct bch_nvmpg_sb *) ++ (page_address(page) + offset_in_page(BCH_NVMPG_SB_OFFSET)); ++ ++ r = -EINVAL; ++ expected_csum = csum_set(sb); ++ if (expected_csum != sb->csum) { ++ pr_info("csum is not match with expected one\n"); ++ goto put_page; ++ } ++ ++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) { ++ pr_info("invalid bch_nvmpg_magic\n"); ++ goto put_page; ++ } ++ ++ if (sb->sb_offset != ++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_SB_OFFSET)) { ++ pr_info("invalid superblock offset 0x%llx\n", sb->sb_offset); ++ goto put_page; ++ } ++ ++ r = -EOPNOTSUPP; ++ if (sb->total_ns != 1) { ++ pr_info("multiple name space not supported yet.\n"); ++ goto put_page; ++ } ++ ++ ++ r = 0; ++ /* Necessary for DAX mapping */ ++ ns->page_size = sb->page_size; ++ ns->pages_total = sb->pages_total; ++ ++put_page: ++ put_page(page); ++ return r; ++} ++ ++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) ++{ ++ struct bch_nvmpg_ns *ns = NULL; ++ struct bch_nvmpg_sb *sb = NULL; ++ char buf[BDEVNAME_SIZE]; ++ struct block_device *bdev; ++ pgoff_t pgoff; ++ int id, err; ++ char *path; ++ long dax_ret = 0; ++ ++ path = kstrndup(dev_path, 512, GFP_KERNEL); ++ if (!path) { ++ pr_err("kstrndup failed\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ bdev = blkdev_get_by_path(strim(path), ++ FMODE_READ|FMODE_WRITE|FMODE_EXCL, ++ global_nvmpg_set); ++ if (IS_ERR(bdev)) { ++ pr_err("get %s error: %ld\n", dev_path, PTR_ERR(bdev)); ++ kfree(path); ++ return ERR_PTR(PTR_ERR(bdev)); ++ } ++ ++ err = -ENOMEM; ++ ns = kzalloc(sizeof(struct bch_nvmpg_ns), GFP_KERNEL); ++ if (!ns) ++ goto bdput; ++ ++ err = -EIO; ++ if (read_nvdimm_meta_super(bdev, ns)) { ++ pr_err("%s read nvdimm meta super block failed.\n", ++ bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EOPNOTSUPP; ++ ns->dax_dev = fs_dax_get_by_bdev(bdev); ++ if (!ns->dax_dev) { ++ pr_err("can't get dax device by %s\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ if (!dax_supported(ns->dax_dev, bdev, ns->page_size, 0, ++ bdev_nr_sectors(bdev))) { ++ pr_err("%s don't support DAX\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EINVAL; ++ if (bdev_dax_pgoff(bdev, 0, ns->page_size, &pgoff)) { ++ pr_err("invalid offset of %s\n", bdevname(bdev, buf)); 
++ goto free_ns; ++ } ++ ++ err = -EINVAL; ++ id = dax_read_lock(); ++ dax_ret = dax_direct_access(ns->dax_dev, pgoff, ns->pages_total, ++ &ns->base_addr, &ns->start_pfn); ++ if (dax_ret <= 0) { ++ pr_err("dax_direct_access error\n"); ++ dax_read_unlock(id); ++ goto free_ns; ++ } ++ ++ if (dax_ret < ns->pages_total) { ++ pr_warn("currently first %ld pages (from %lu in total) are used\n", ++ dax_ret, ns->pages_total); ++ } ++ dax_read_unlock(id); ++ ++ sb = (struct bch_nvmpg_sb *)(ns->base_addr + BCH_NVMPG_SB_OFFSET); ++ ++ err = -EINVAL; ++ /* Check magic again to make sure DAX mapping is correct */ ++ if (memcmp(sb->magic, bch_nvmpg_magic, 16)) { ++ pr_err("invalid bch_nvmpg_magic after DAX mapping\n"); ++ goto free_ns; ++ } ++ ++ if ((global_nvmpg_set->attached_ns > 0) && ++ memcmp(sb->set_uuid, global_nvmpg_set->set_uuid, 16)) { ++ pr_err("set uuid does not match with ns_id %u\n", ns->ns_id); ++ goto free_ns; ++ } ++ ++ if (sb->set_header_offset != ++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_RECLIST_HEAD_OFFSET)) { ++ pr_err("Invalid header offset: this_ns %u, ns_id %llu, offset 0x%llx\n", ++ sb->this_ns, ++ BCH_NVMPG_GET_NS_ID(sb->set_header_offset), ++ BCH_NVMPG_GET_OFFSET(sb->set_header_offset)); ++ goto free_ns; ++ } ++ ++ ns->page_size = sb->page_size; ++ ns->pages_offset = sb->pages_offset; ++ ns->pages_total = sb->pages_total; ++ ns->sb = sb; ++ ns->free = 0; ++ ns->bdev = bdev; ++ ns->set = global_nvmpg_set; ++ ++ err = attach_nvmpg_set(ns); ++ if (err < 0) ++ goto free_ns; ++ ++ mutex_init(&ns->lock); ++ ++ err = init_nvmpg_set_header(ns); ++ if (err < 0) ++ goto free_ns; ++ ++ kfree(path); ++ return ns; ++ ++free_ns: ++ fs_put_dax(ns->dax_dev); ++ kfree(ns); ++bdput: ++ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); ++ kfree(path); ++ return ERR_PTR(err); ++} ++ ++int __init bch_nvmpg_init(void) ++{ ++ global_nvmpg_set = kzalloc(sizeof(*global_nvmpg_set), GFP_KERNEL); ++ if (!global_nvmpg_set) ++ return -ENOMEM; ++ ++ global_nvmpg_set->total_ns = 0; ++ mutex_init(&global_nvmpg_set->lock); ++ ++ pr_info("bcache nvm init\n"); ++ return 0; ++} ++ ++void bch_nvmpg_exit(void) ++{ ++ release_nvmpg_set(global_nvmpg_set); ++ pr_info("bcache nvm exit\n"); ++} +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +new file mode 100644 +index 000000000000..698c890b2d15 +--- /dev/null ++++ b/drivers/md/bcache/nvmpg.h +@@ -0,0 +1,97 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _BCACHE_NVM_PAGES_H ++#define _BCACHE_NVM_PAGES_H ++ ++#include <linux/libnvdimm.h> ++ ++#include "nvmpg_format.h" ++ ++/* ++ * Bcache NVDIMM in memory data structures ++ */ ++ ++/* ++ * The following three structures in memory records which page(s) allocated ++ * to which owner. After reboot from power failure, they will be initialized ++ * based on nvm pages superblock in NVDIMM device. ++ */ ++struct bch_nvmpg_ns { ++ struct bch_nvmpg_sb *sb; ++ void *base_addr; ++ ++ unsigned char uuid[16]; ++ int ns_id; ++ unsigned int page_size; ++ unsigned long free; ++ unsigned long pages_offset; ++ unsigned long pages_total; ++ pfn_t start_pfn; ++ ++ struct dax_device *dax_dev; ++ struct block_device *bdev; ++ struct bch_nvmpg_set *set; ++ ++ struct mutex lock; ++}; ++ ++/* ++ * A set of namespaces. Currently only one set can be supported. 
++ */ ++struct bch_nvmpg_set { ++ unsigned char set_uuid[16]; ++ ++ int heads_size; ++ int heads_used; ++ struct bch_nvmpg_set_header *set_header; ++ ++ struct bch_nvmpg_ns *ns_tbl[BCH_NVMPG_NS_MAX]; ++ int total_ns; ++ int attached_ns; ++ ++ struct mutex lock; ++}; ++ ++#define BCH_NVMPG_NS_ID_BITS 3 ++#define BCH_NVMPG_OFFSET_BITS 61 ++#define BCH_NVMPG_NS_ID_MASK ((1UL<<BCH_NVMPG_NS_ID_BITS) - 1) ++#define BCH_NVMPG_OFFSET_MASK ((1UL<<BCH_NVMPG_OFFSET_BITS) - 1) ++ ++#define BCH_NVMPG_GET_NS_ID(offset) \ ++ (((offset) >> BCH_NVMPG_OFFSET_BITS) & BCH_NVMPG_NS_ID_MASK) ++ ++#define BCH_NVMPG_GET_OFFSET(offset) ((offset) & BCH_NVMPG_OFFSET_MASK) ++ ++#define BCH_NVMPG_OFFSET(ns_id, offset) \ ++ ((((ns_id) & BCH_NVMPG_NS_ID_MASK) << BCH_NVMPG_OFFSET_BITS) | \ ++ ((offset) & BCH_NVMPG_OFFSET_MASK)) ++ ++/* Indicate which field in bch_nvmpg_sb to be updated */ ++#define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ ++ ++void *bch_nvmpg_offset_to_ptr(unsigned long offset); ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); ++int bch_nvmpg_init(void); ++void bch_nvmpg_exit(void); ++ ++#else ++ ++static inline struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) ++{ ++ return NULL; ++} ++ ++static inline int bch_nvmpg_init(void) ++{ ++ return 0; ++} ++ ++static inline void bch_nvmpg_exit(void) { } ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++#endif /* _BCACHE_NVM_PAGES_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 86b9e355c583..74d51a0b806f 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -14,6 +14,7 @@ + #include "request.h" + #include "writeback.h" + #include "features.h" ++#include "nvmpg.h" + + #include <linux/blkdev.h> + #include <linux/pagemap.h> +@@ -2818,6 +2819,7 @@ static void bcache_exit(void) + { + bch_debug_exit(); + bch_request_exit(); ++ bch_nvmpg_exit(); + if (bcache_kobj) + kobject_put(bcache_kobj); + if (bcache_wq) +@@ -2916,6 +2918,7 @@ static int __init bcache_init(void) + + bch_debug_init(); + closure_debug_init(); ++ bch_nvmpg_init(); + + bcache_is_reboot = false; + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0003-bcache-initialization-of-the-buddy.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0003-bcache-initialization-of-the-buddy.patch new file mode 100644 index 0000000..9adcb46 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0003-bcache-initialization-of-the-buddy.patch @@ -0,0 +1,359 @@ +From ef9ee14f2d7b1dd38f8aebf190e9ed1527f688c2 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 19:45:57 +0800 +Subject: [PATCH v13 03/12] bcache: initialization of the buddy + +This nvm pages allocator will implement the simple buddy allocator to +anage the nvm address space. This patch initializes this buddy allocator +for new namespace. + +the unit of alloc/free of the buddy allocator is page. DAX device has +their struct page(in dram or PMEM). + + struct { /* ZONE_DEVICE pages */ + /** @pgmap: Points to the hosting device page map. */ + struct dev_pagemap *pgmap; + void *zone_device_data; + /* + * ZONE_DEVICE private pages are counted as being + * mapped so the next 3 words hold the mapping, index, + * and private fields from the source anonymous or + * page cache page while the page is migrated to device + * private memory. 
+ * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also + * use the mapping, index, and private fields when + * pmem backed DAX files are mapped. + */ + }; + +ZONE_DEVICE pages only use pgmap. Other 4 words[16/32 bytes] don't use. +So the second/third word will be used as 'struct list_head ' which list +in buddy. The fourth word(that is normal struct page::index) store pgoff +which the page-offset in the dax device. And the fifth word (that is +normal struct page::private) store order of buddy. page_type will be used +to store buddy flags. + +Reported-by: kernel test robot <lkp@intel.com> +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 212 +++++++++++++++++++++++++++++++++++++- + drivers/md/bcache/nvmpg.h | 12 +++ + 2 files changed, 221 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index b654bbbda03e..2b70ee4a6028 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -50,6 +50,36 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) + return BCH_NVMPG_OFFSET(ns_id, offset); + } + ++static struct page *bch_nvmpg_va_to_pg(void *addr) ++{ ++ return virt_to_page(addr); ++} ++ ++static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff) ++{ ++ return ns->base_addr + (pgoff << PAGE_SHIFT); ++} ++ ++static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) ++{ ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id]; ++ pgoff_t pgoff = r->pgoff; ++ ++ return bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++} ++ ++static inline void reserve_nvmpg_pages(struct bch_nvmpg_ns *ns, ++ pgoff_t pgoff, u64 nr) ++{ ++ while (nr > 0) { ++ unsigned int num = nr > UINT_MAX ? 
UINT_MAX : nr; ++ ++ bitmap_set(ns->pages_bitmap, pgoff, num); ++ nr -= num; ++ pgoff += num; ++ } ++} ++ + static void release_ns_tbl(struct bch_nvmpg_set *set) + { + int i; +@@ -58,6 +88,10 @@ static void release_ns_tbl(struct bch_nvmpg_set *set) + for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { + ns = set->ns_tbl[i]; + if (ns) { ++ kvfree(ns->pages_bitmap); ++ if (ns->recs_bitmap) ++ bitmap_free(ns->recs_bitmap); ++ + fs_put_dax(ns->dax_dev); + blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + set->ns_tbl[i] = NULL; +@@ -76,10 +110,73 @@ static void release_nvmpg_set(struct bch_nvmpg_set *set) + kfree(set); + } + ++static int validate_recs(int ns_id, ++ struct bch_nvmpg_head *head, ++ struct bch_nvmpg_recs *recs) ++{ ++ if (memcmp(recs->magic, bch_nvmpg_recs_magic, 16)) { ++ pr_err("Invalid bch_nvmpg_recs magic\n"); ++ return -EINVAL; ++ } ++ ++ if (memcmp(recs->uuid, head->uuid, 16)) { ++ pr_err("Invalid bch_nvmpg_recs uuid\n"); ++ return -EINVAL; ++ } ++ ++ if (recs->head_offset != ++ bch_nvmpg_ptr_to_offset(global_nvmpg_set->ns_tbl[ns_id], head)) { ++ pr_err("Invalid recs head_offset\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int reserve_nvmpg_recs(struct bch_nvmpg_recs *recs) ++{ ++ int i, used = 0; ++ ++ for (i = 0; i < recs->size; i++) { ++ struct bch_nvmpg_rec *r = &recs->recs[i]; ++ struct bch_nvmpg_ns *ns; ++ struct page *page; ++ void *addr; ++ ++ if (r->pgoff == 0) ++ continue; ++ ++ ns = global_nvmpg_set->ns_tbl[r->ns_id]; ++ addr = bch_nvmpg_rec_to_ptr(r); ++ if (addr < ns->base_addr) { ++ pr_err("Invalid recorded address\n"); ++ return -EINVAL; ++ } ++ ++ /* init struct page: index/private */ ++ page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(page, r->order); ++ page->index = r->pgoff; ++ ++ reserve_nvmpg_pages(ns, r->pgoff, 1L << r->order); ++ used++; ++ } ++ ++ if (used != recs->used) { ++ pr_err("used %d doesn't match recs->used %d\n", ++ used, recs->used); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ + /* Namespace 0 contains all meta data of the nvmpg allocation set */ + static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_set_header *set_header; ++ struct bch_nvmpg_recs *sys_recs; ++ int i, j, used = 0, rc = 0; + + if (ns->ns_id != 0) { + pr_err("unexpected ns_id %u for first nvmpg namespace.\n", +@@ -93,9 +190,83 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + global_nvmpg_set->set_header = set_header; + global_nvmpg_set->heads_size = set_header->size; + global_nvmpg_set->heads_used = set_header->used; ++ ++ /* Reserve the used space from buddy allocator */ ++ reserve_nvmpg_pages(ns, 0, div_u64(ns->pages_offset, ns->page_size)); ++ ++ sys_recs = ns->base_addr + BCH_NVMPG_SYSRECS_OFFSET; ++ for (i = 0; i < set_header->size; i++) { ++ struct bch_nvmpg_head *head; ++ ++ head = &set_header->heads[i]; ++ if (head->state == BCH_NVMPG_HD_STAT_FREE) ++ continue; ++ ++ used++; ++ if (used > global_nvmpg_set->heads_size) { ++ pr_err("used heads %d > heads size %d.\n", ++ used, global_nvmpg_set->heads_size); ++ goto unlock; ++ } ++ ++ for (j = 0; j < BCH_NVMPG_NS_MAX; j++) { ++ struct bch_nvmpg_recs *recs; ++ ++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[j]); ++ ++ /* Iterate the recs list */ ++ while (recs) { ++ rc = validate_recs(j, head, recs); ++ if (rc < 0) ++ goto unlock; ++ ++ rc = reserve_nvmpg_recs(recs); ++ if (rc < 0) ++ goto unlock; ++ ++ bitmap_set(ns->recs_bitmap, recs - sys_recs, 1); ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++ } ++ } ++unlock: + 
mutex_unlock(&global_nvmpg_set->lock); ++ return rc; ++} + +- return 0; ++static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) ++{ ++ unsigned int start, end, pages; ++ int i; ++ struct page *page; ++ pgoff_t pgoff_start; ++ ++ bitmap_for_each_clear_region(ns->pages_bitmap, ++ start, end, 0, ns->pages_total) { ++ pgoff_start = start; ++ pages = end - start; ++ ++ while (pages) { ++ void *addr; ++ ++ for (i = BCH_MAX_ORDER - 1; i >= 0; i--) { ++ if ((pgoff_start % (1L << i) == 0) && ++ (pages >= (1L << i))) ++ break; ++ } ++ ++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start); ++ page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(page, i); ++ page->index = pgoff_start; ++ __SetPageBuddy(page); ++ list_add((struct list_head *)&page->zone_device_data, ++ &ns->free_area[i]); ++ ++ pgoff_start += 1L << i; ++ pages -= 1L << i; ++ } ++ } + } + + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) +@@ -200,7 +371,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + char buf[BDEVNAME_SIZE]; + struct block_device *bdev; + pgoff_t pgoff; +- int id, err; ++ int id, i, err; + char *path; + long dax_ret = 0; + +@@ -304,13 +475,48 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + + mutex_init(&ns->lock); + ++ /* ++ * parameters of bitmap_set/clear are unsigned int. ++ * Given currently size of nvm is far from exceeding this limit, ++ * so only add a WARN_ON message. ++ */ ++ WARN_ON(BITS_TO_LONGS(ns->pages_total) > UINT_MAX); ++ ns->pages_bitmap = kvcalloc(BITS_TO_LONGS(ns->pages_total), ++ sizeof(unsigned long), GFP_KERNEL); ++ if (!ns->pages_bitmap) { ++ err = -ENOMEM; ++ goto clear_ns_nr; ++ } ++ ++ if (ns->sb->this_ns == 0) { ++ ns->recs_bitmap = ++ bitmap_zalloc(BCH_MAX_PGALLOC_RECS, GFP_KERNEL); ++ if (ns->recs_bitmap == NULL) { ++ err = -ENOMEM; ++ goto free_pages_bitmap; ++ } ++ } ++ ++ for (i = 0; i < BCH_MAX_ORDER; i++) ++ INIT_LIST_HEAD(&ns->free_area[i]); ++ + err = init_nvmpg_set_header(ns); + if (err < 0) +- goto free_ns; ++ goto free_recs_bitmap; ++ ++ if (ns->sb->this_ns == 0) ++ /* init buddy allocator */ ++ bch_nvmpg_init_free_space(ns); + + kfree(path); + return ns; + ++free_recs_bitmap: ++ bitmap_free(ns->recs_bitmap); ++free_pages_bitmap: ++ kvfree(ns->pages_bitmap); ++clear_ns_nr: ++ global_nvmpg_set->ns_tbl[sb->this_ns] = NULL; + free_ns: + fs_put_dax(ns->dax_dev); + kfree(ns); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 698c890b2d15..55778d4db7da 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -11,6 +11,8 @@ + * Bcache NVDIMM in memory data structures + */ + ++#define BCH_MAX_ORDER 20 ++ + /* + * The following three structures in memory records which page(s) allocated + * to which owner. 
After reboot from power failure, they will be initialized +@@ -28,6 +30,11 @@ struct bch_nvmpg_ns { + unsigned long pages_total; + pfn_t start_pfn; + ++ unsigned long *pages_bitmap; ++ struct list_head free_area[BCH_MAX_ORDER]; ++ ++ unsigned long *recs_bitmap; ++ + struct dax_device *dax_dev; + struct block_device *bdev; + struct bch_nvmpg_set *set; +@@ -69,6 +76,11 @@ struct bch_nvmpg_set { + /* Indicate which field in bch_nvmpg_sb to be updated */ + #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ + ++#define BCH_MAX_PGALLOC_RECS \ ++ (min_t(unsigned int, 64, \ ++ (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ ++ sizeof(struct bch_nvmpg_recs))) ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset); + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch new file mode 100644 index 0000000..ef13f6e --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch @@ -0,0 +1,308 @@ +From b09e24d84a7ae11be4bd7255648ebd5006678029 Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Wed, 4 Aug 2021 22:41:20 +0800 +Subject: [PATCH v13 04/12] bcache: bch_nvmpg_alloc_pages() of the buddy + +This patch implements the bch_nvmpg_alloc_pages() of the nvm pages buddy +allocator. In terms of function, this func is like current +page-buddy-alloc. But the differences are: +a: it need owner_uuid as parameter which record owner info. And it +make those info persistence. +b: it don't need flags like GFP_*. All allocs are the equal. +c: it don't trigger other ops etc swap/recycle. 
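For illustration only (not part of this patch), a minimal usage sketch of the allocator described above, using the interfaces declared in nvmpg.h: example_uuid and example_nvmpg_alloc() are hypothetical names, and bch_nvmpg_free_pages() is only introduced by the next patch in this series.

static const char example_uuid[16] = "nvmpg-example-0";

static int example_nvmpg_alloc(void)
{
        unsigned long nvmpg_offset;
        void *addr;

        /* ask for 1 << 2 = 4 contiguous NVDIMM pages; no GFP_* flags */
        nvmpg_offset = bch_nvmpg_alloc_pages(2, example_uuid);
        if (!nvmpg_offset)
                return -ENOMEM;

        /* map the persistent nvmpg offset to a kernel virtual address */
        addr = bch_nvmpg_offset_to_ptr(nvmpg_offset);
        memset(addr, 0, 4 * PAGE_SIZE);

        /* the allocation stays recorded under example_uuid until freed */
        bch_nvmpg_free_pages(nvmpg_offset, 2, example_uuid);
        return 0;
}

The returned value is an nvmpg offset (namespace id plus byte offset), not a kernel pointer, which is why it can be stored persistently and re-resolved after a reboot.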
+ +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 221 ++++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 9 ++ + 2 files changed, 230 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 2b70ee4a6028..a920779eb548 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -42,6 +42,11 @@ void *bch_nvmpg_offset_to_ptr(unsigned long offset) + return NULL; + } + ++static unsigned long bch_nvmpg_offset_to_pgoff(unsigned long nvmpg_offset) ++{ ++ return BCH_NVMPG_GET_OFFSET(nvmpg_offset) >> PAGE_SHIFT; ++} ++ + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) + { + int ns_id = ns->ns_id; +@@ -60,6 +65,15 @@ static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff) + return ns->base_addr + (pgoff << PAGE_SHIFT); + } + ++static unsigned long bch_nvmpg_pgoff_to_offset(struct bch_nvmpg_ns *ns, ++ pgoff_t pgoff) ++{ ++ int ns_id = ns->ns_id; ++ unsigned long offset = pgoff << PAGE_SHIFT; ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ + static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) + { + struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id]; +@@ -269,6 +283,213 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + } + } + ++ ++/* If not found, it will create if create == true */ ++static struct bch_nvmpg_head *find_nvmpg_head(const char *uuid, bool create) ++{ ++ struct bch_nvmpg_set_header *set_header = global_nvmpg_set->set_header; ++ struct bch_nvmpg_head *head = NULL; ++ int i; ++ ++ if (set_header == NULL) ++ goto out; ++ ++ for (i = 0; i < set_header->size; i++) { ++ struct bch_nvmpg_head *h = &set_header->heads[i]; ++ ++ if (h->state != BCH_NVMPG_HD_STAT_ALLOC) ++ continue; ++ ++ if (!memcmp(uuid, h->uuid, 16)) { ++ head = h; ++ break; ++ } ++ } ++ ++ if (!head && create) { ++ u32 used = set_header->used; ++ ++ if (set_header->size > used) { ++ head = &set_header->heads[used]; ++ memset(head, 0, sizeof(struct bch_nvmpg_head)); ++ head->state = BCH_NVMPG_HD_STAT_ALLOC; ++ memcpy(head->uuid, uuid, 16); ++ global_nvmpg_set->heads_used++; ++ set_header->used++; ++ } else ++ pr_info("No free bch_nvmpg_head\n"); ++ } ++ ++out: ++ return head; ++} ++ ++static struct bch_nvmpg_recs *find_empty_nvmpg_recs(void) ++{ ++ unsigned int start; ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[0]; ++ struct bch_nvmpg_recs *recs; ++ ++ start = bitmap_find_next_zero_area(ns->recs_bitmap, ++ BCH_MAX_PGALLOC_RECS, 0, 1, 0); ++ if (start > BCH_MAX_PGALLOC_RECS) { ++ pr_info("No free struct bch_nvmpg_recs\n"); ++ return NULL; ++ } ++ ++ bitmap_set(ns->recs_bitmap, start, 1); ++ recs = (struct bch_nvmpg_recs *) ++ bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET) ++ + start; ++ ++ memset(recs, 0, sizeof(struct bch_nvmpg_recs)); ++ return recs; ++} ++ ++ ++static struct bch_nvmpg_recs *find_nvmpg_recs(struct bch_nvmpg_ns *ns, ++ struct bch_nvmpg_head *head, ++ bool create) ++{ ++ int ns_id = ns->sb->this_ns; ++ struct bch_nvmpg_recs *prev_recs = NULL, *recs = NULL; ++ ++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[ns_id]); ++ ++ /* If create=false, we return recs[nr] */ ++ if (!create) ++ return recs; ++ ++ /* ++ * If create=true, it mean we need a empty struct 
bch_nvmpg_rec ++ * So we should find non-empty struct bch_nvmpg_recs or alloc ++ * new struct bch_nvmpg_recs. And return this bch_nvmpg_recs ++ */ ++ while (recs && (recs->used == recs->size)) { ++ prev_recs = recs; ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++ ++ /* Found empty struct bch_nvmpg_recs */ ++ if (recs) ++ return recs; ++ ++ /* Need alloc new struct bch_nvmpg_recs */ ++ recs = find_empty_nvmpg_recs(); ++ if (recs) { ++ unsigned long offset; ++ ++ recs->next_offset = 0; ++ recs->head_offset = bch_nvmpg_ptr_to_offset(ns, head); ++ memcpy(recs->magic, bch_nvmpg_recs_magic, 16); ++ memcpy(recs->uuid, head->uuid, 16); ++ recs->size = BCH_NVMPG_MAX_RECS; ++ recs->used = 0; ++ ++ offset = bch_nvmpg_ptr_to_offset(ns, recs); ++ if (prev_recs) ++ prev_recs->next_offset = offset; ++ else ++ head->recs_offset[ns_id] = offset; ++ } ++ ++ return recs; ++} ++ ++static void add_nvmpg_rec(struct bch_nvmpg_ns *ns, ++ struct bch_nvmpg_recs *recs, ++ unsigned long nvmpg_offset, ++ int order) ++{ ++ int i, ns_id; ++ unsigned long pgoff; ++ ++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset); ++ ns_id = ns->sb->this_ns; ++ ++ for (i = 0; i < recs->size; i++) { ++ if (recs->recs[i].pgoff == 0) { ++ recs->recs[i].pgoff = pgoff; ++ recs->recs[i].order = order; ++ recs->recs[i].ns_id = ns_id; ++ recs->used++; ++ break; ++ } ++ } ++ BUG_ON(i == recs->size); ++} ++ ++ ++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) ++{ ++ unsigned long nvmpg_offset = 0; ++ struct bch_nvmpg_head *head; ++ int n, o; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ head = find_nvmpg_head(uuid, true); ++ ++ if (!head) { ++ pr_err("Cannot find bch_nvmpg_recs by uuid.\n"); ++ goto unlock; ++ } ++ ++ for (n = 0; n < global_nvmpg_set->total_ns; n++) { ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[n]; ++ ++ if (!ns || (ns->free < (1L << order))) ++ continue; ++ ++ for (o = order; o < BCH_MAX_ORDER; o++) { ++ struct list_head *list; ++ struct page *page, *buddy_page; ++ ++ if (list_empty(&ns->free_area[o])) ++ continue; ++ ++ list = ns->free_area[o].next; ++ page = container_of((void *)list, struct page, ++ zone_device_data); ++ ++ list_del(list); ++ ++ while (o != order) { ++ void *addr; ++ pgoff_t pgoff; ++ ++ pgoff = page->index + (1L << (o - 1)); ++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++ buddy_page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(buddy_page, o - 1); ++ buddy_page->index = pgoff; ++ __SetPageBuddy(buddy_page); ++ list_add((struct list_head *)&buddy_page->zone_device_data, ++ &ns->free_area[o - 1]); ++ o--; ++ } ++ ++ set_page_private(page, order); ++ __ClearPageBuddy(page); ++ ns->free -= 1L << order; ++ nvmpg_offset = bch_nvmpg_pgoff_to_offset(ns, page->index); ++ break; ++ } ++ ++ if (o < BCH_MAX_ORDER) { ++ struct bch_nvmpg_recs *recs; ++ ++ recs = find_nvmpg_recs(ns, head, true); ++ /* ToDo: handle pgalloc_recs==NULL */ ++ add_nvmpg_rec(ns, recs, nvmpg_offset, order); ++ break; ++ } ++ } ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++ return nvmpg_offset; ++} ++ + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_sb *sb = ns->sb; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 55778d4db7da..d03f3241b45a 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -76,6 +76,9 @@ struct bch_nvmpg_set { + /* Indicate which field in bch_nvmpg_sb to be updated */ + #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ + ++#define BCH_PGOFF_TO_KVADDR(pgoff) \ ++ ((void *)((unsigned long)(pgoff) << 
PAGE_SHIFT)) ++ + #define BCH_MAX_PGALLOC_RECS \ + (min_t(unsigned int, 64, \ + (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ +@@ -89,6 +92,7 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); + int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); ++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + + #else + +@@ -104,6 +108,11 @@ static inline int bch_nvmpg_init(void) + + static inline void bch_nvmpg_exit(void) { } + ++static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) ++{ ++ return 0; ++} ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocat.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocat.patch new file mode 100644 index 0000000..fd631ae --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocat.patch @@ -0,0 +1,252 @@ +From 1f1fd2517b0a3520ab3a78cabe737cfb1f628d2e Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 19:06:35 +0800 +Subject: [PATCH v13 05/12] bcache: bch_nvmpg_free_pages() of the buddy + allocator + +This patch implements the bch_nvmpg_free_pages() of the buddy allocator. + +The difference between this and page-buddy-free: +it need owner_uuid to free owner allocated pages, and must +persistent after free. + +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 164 ++++++++++++++++++++++++++++++++++++-- + drivers/md/bcache/nvmpg.h | 3 + + 2 files changed, 160 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index a920779eb548..8ce0c4389b42 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -248,6 +248,57 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + return rc; + } + ++static void __free_space(struct bch_nvmpg_ns *ns, unsigned long nvmpg_offset, ++ int order) ++{ ++ unsigned long add_pages = (1L << order); ++ pgoff_t pgoff; ++ struct page *page; ++ void *va; ++ ++ if (nvmpg_offset == 0) { ++ pr_err("free pages on offset 0\n"); ++ return; ++ } ++ ++ page = bch_nvmpg_va_to_pg(bch_nvmpg_offset_to_ptr(nvmpg_offset)); ++ WARN_ON((!page) || (page->private != order)); ++ pgoff = page->index; ++ ++ while (order < BCH_MAX_ORDER - 1) { ++ struct page *buddy_page; ++ ++ pgoff_t buddy_pgoff = pgoff ^ (1L << order); ++ pgoff_t parent_pgoff = pgoff & ~(1L << order); ++ ++ if ((parent_pgoff + (1L << (order + 1)) > ns->pages_total)) ++ break; ++ ++ va = bch_nvmpg_pgoff_to_ptr(ns, buddy_pgoff); ++ buddy_page = bch_nvmpg_va_to_pg(va); ++ WARN_ON(!buddy_page); ++ ++ if (PageBuddy(buddy_page) && (buddy_page->private == order)) { ++ list_del((struct list_head *)&buddy_page->zone_device_data); ++ __ClearPageBuddy(buddy_page); ++ pgoff = parent_pgoff; ++ order++; ++ continue; ++ } ++ break; ++ } ++ ++ va = bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++ page = bch_nvmpg_va_to_pg(va); ++ WARN_ON(!page); ++ list_add((struct list_head *)&page->zone_device_data, ++ &ns->free_area[order]); ++ 
page->index = pgoff; ++ set_page_private(page, order); ++ __SetPageBuddy(page); ++ ns->free += add_pages; ++} ++ + static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + { + unsigned int start, end, pages; +@@ -261,21 +312,19 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + pages = end - start; + + while (pages) { +- void *addr; +- + for (i = BCH_MAX_ORDER - 1; i >= 0; i--) { + if ((pgoff_start % (1L << i) == 0) && + (pages >= (1L << i))) + break; + } + +- addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start); +- page = bch_nvmpg_va_to_pg(addr); ++ page = bch_nvmpg_va_to_pg( ++ bch_nvmpg_pgoff_to_ptr(ns, pgoff_start)); + set_page_private(page, i); + page->index = pgoff_start; +- __SetPageBuddy(page); +- list_add((struct list_head *)&page->zone_device_data, +- &ns->free_area[i]); ++ ++ /* In order to update ns->free */ ++ __free_space(ns, pgoff_start, i); + + pgoff_start += 1L << i; + pages -= 1L << i; +@@ -490,6 +539,106 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return nvmpg_offset; + } + ++static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns) ++{ ++ return ns->base_addr + (ns->pages_total << PAGE_SHIFT); ++} ++ ++static inline bool in_nvmpg_ns_range(struct bch_nvmpg_ns *ns, ++ void *start_addr, void *end_addr) ++{ ++ return (start_addr >= ns->base_addr) && (end_addr < nvm_end_addr(ns)); ++} ++ ++static int remove_nvmpg_rec(struct bch_nvmpg_recs *recs, int ns_id, ++ unsigned long nvmpg_offset, int order) ++{ ++ struct bch_nvmpg_head *head; ++ struct bch_nvmpg_recs *prev_recs, *sys_recs; ++ struct bch_nvmpg_ns *ns; ++ unsigned long pgoff; ++ int i; ++ ++ ns = global_nvmpg_set->ns_tbl[0]; ++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset); ++ ++ head = bch_nvmpg_offset_to_ptr(recs->head_offset); ++ prev_recs = recs; ++ sys_recs = bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET); ++ while (recs) { ++ for (i = 0; i < recs->size; i++) { ++ struct bch_nvmpg_rec *rec = &(recs->recs[i]); ++ ++ if ((rec->pgoff == pgoff) && (rec->ns_id == ns_id)) { ++ WARN_ON(rec->order != order); ++ rec->_v = 0; ++ recs->used--; ++ ++ if (recs->used == 0) { ++ int recs_pos = recs - sys_recs; ++ ++ if (recs == prev_recs) ++ head->recs_offset[ns_id] = ++ recs->next_offset; ++ else ++ prev_recs->next_offset = ++ recs->next_offset; ++ ++ recs->next_offset = 0; ++ recs->head_offset = 0; ++ ++ bitmap_clear(ns->recs_bitmap, recs_pos, 1); ++ } ++ goto out; ++ } ++ } ++ prev_recs = recs; ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++out: ++ return (recs ? 
0 : -ENOENT); ++} ++ ++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, ++ const char *uuid) ++{ ++ struct bch_nvmpg_ns *ns; ++ struct bch_nvmpg_head *head; ++ struct bch_nvmpg_recs *recs; ++ int r; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ ++ ns = global_nvmpg_set->ns_tbl[BCH_NVMPG_GET_NS_ID(nvmpg_offset)]; ++ if (!ns) { ++ pr_err("can't find namespace by given kaddr from namespace\n"); ++ goto unlock; ++ } ++ ++ head = find_nvmpg_head(uuid, false); ++ if (!head) { ++ pr_err("can't found bch_nvmpg_head by uuid\n"); ++ goto unlock; ++ } ++ ++ recs = find_nvmpg_recs(ns, head, false); ++ if (!recs) { ++ pr_err("can't find bch_nvmpg_recs by uuid\n"); ++ goto unlock; ++ } ++ ++ r = remove_nvmpg_rec(recs, ns->sb->this_ns, nvmpg_offset, order); ++ if (r < 0) { ++ pr_err("can't find bch_nvmpg_rec\n"); ++ goto unlock; ++ } ++ ++ __free_space(ns, nvmpg_offset, order); ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++} ++ + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_sb *sb = ns->sb; +@@ -686,6 +835,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path) + ns->pages_offset = sb->pages_offset; + ns->pages_total = sb->pages_total; + ns->sb = sb; ++ /* increase by __free_space() */ + ns->free = 0; + ns->bdev = bdev; + ns->set = global_nvmpg_set; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index d03f3241b45a..e089936e7f13 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -93,6 +93,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path); + int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); ++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + + #else + +@@ -113,6 +114,8 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return 0; + } + ++static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { } ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0006-bcache-get-recs-list-head-for-allocated-pages-by.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0006-bcache-get-recs-list-head-for-allocated-pages-by.patch new file mode 100644 index 0000000..f055b17 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0006-bcache-get-recs-list-head-for-allocated-pages-by.patch @@ -0,0 +1,66 @@ +From 953f817e496a1a74b9a8403800bf1d7f0f5b4aeb Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 21:06:03 +0800 +Subject: [PATCH v13 06/12] bcache: get recs list head for allocated pages by + specific uuid + +This patch implements bch_get_nvmpg_head() of the buddy allocator +to be used to get recs list head for allocated pages by specific +uuid. Then the requester (owner) can find all previous allocated +nvdimm pages by iterating the recs list. 
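For illustration only (not part of this patch), a sketch of how an owner could walk its recs list after obtaining the head: example_walk_recs() is a hypothetical helper, only namespace 0 is visited because a single namespace is supported so far, and locking and error handling are omitted.

static void example_walk_recs(const char *uuid)
{
        struct bch_nvmpg_head *head = bch_get_nvmpg_head(uuid);
        struct bch_nvmpg_recs *recs;
        int i;

        if (!head)
                return;

        /* recs_offset[] is indexed by namespace id; 0 is the only one yet */
        recs = bch_nvmpg_offset_to_ptr(head->recs_offset[0]);
        while (recs) {
                for (i = 0; i < recs->size; i++) {
                        struct bch_nvmpg_rec *rec = &recs->recs[i];

                        if (rec->pgoff == 0)    /* unused slot */
                                continue;
                        pr_info("pgoff %lu, order %d, ns_id %d\n",
                                (unsigned long)rec->pgoff,
                                (int)rec->order, (int)rec->ns_id);
                }
                recs = bch_nvmpg_offset_to_ptr(recs->next_offset);
        }
}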
+ +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 5 +++++ + drivers/md/bcache/nvmpg.h | 6 ++++++ + 2 files changed, 11 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 8ce0c4389b42..e26c7b578a62 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -539,6 +539,11 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return nvmpg_offset; + } + ++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) ++{ ++ return find_nvmpg_head(uuid, false); ++} ++ + static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns) + { + return ns->base_addr + (ns->pages_total << PAGE_SHIFT); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index e089936e7f13..2361cabf18be 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -94,6 +94,7 @@ int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); ++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); + + #else + +@@ -116,6 +117,11 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + + static inline void bch_nvmpg_free_pages(void *addr, int order, const char *uuid) { } + ++static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) ++{ ++ return NULL; ++} ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-.patch new file mode 100644 index 0000000..4ae5f06 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-.patch @@ -0,0 +1,48 @@ +From 566cc2016c7e817b8306db96d97c3e4cdbc254df Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 25 Jun 2021 00:17:02 +0800 +Subject: [PATCH v13 07/12] bcache: use bucket index to set GC_MARK_METADATA + for journal buckets in bch_btree_gc_finish() + +Currently the meta data bucket locations on cache device are reserved +after the meta data stored on NVDIMM pages, for the meta data layout +consistentcy temporarily. So these buckets are still marked as meta data +by SET_GC_MARK() in bch_btree_gc_finish(). + +When BCH_FEATURE_INCOMPAT_NVDIMM_META is set, the sb.d[] stores linear +address of NVDIMM pages and not bucket index anymore. Therefore we +should avoid to find bucket index from sb.d[], and directly use bucket +index from ca->sb.first_bucket to (ca->sb.first_bucket + +ca->sb.njournal_bucketsi) for setting the gc mark of journal bucket. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/btree.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 88c573eeb598..1a0ff117373f 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -1761,8 +1761,10 @@ static void bch_btree_gc_finish(struct cache_set *c) + ca = c->cache; + ca->invalidate_needs_gc = 0; + +- for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) +- SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); ++ /* Range [first_bucket, first_bucket + keys) is for journal buckets */ ++ for (i = ca->sb.first_bucket; ++ i < ca->sb.first_bucket + ca->sb.njournal_buckets; i++) ++ SET_GC_MARK(ca->buckets + i, GC_MARK_METADATA); + + for (k = ca->prio_buckets; + k < ca->prio_buckets + prio_buckets(ca) * 2; k++) +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into.patch new file mode 100644 index 0000000..1e0fb3b --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0008-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into.patch @@ -0,0 +1,60 @@ +From 5da7b9cfe8c6344a6a4271bf3878d22ba87f4398 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 25 Jun 2021 00:18:31 +0800 +Subject: [PATCH v13 08/12] bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into + incompat feature set + +This patch adds BCH_FEATURE_INCOMPAT_NVDIMM_META (value 0x0004) into the +incompat feature set. When this bit is set by bcache-tools, it indicates +bcache meta data should be stored on specific NVDIMM meta device. + +The bcache meta data mainly includes journal and btree nodes, when this +bit is set in incompat feature set, bcache will ask the nvm-pages +allocator for NVDIMM space to store the meta data. 
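For illustration only (not part of this patch), later patches in this series test the new bit through the bch_has_feature_nvdimm_meta() accessor generated from the existing BCH_FEATURE_INCOMPAT_FUNCS() macro (see the diff below); journal_space_on_nvdimm() is a hypothetical helper name.

static bool journal_space_on_nvdimm(struct cache *ca)
{
        /*
         * The bit is set by bcache-tools at format time. When present,
         * sb.d[] holds nvmpg offsets into NVDIMM pages; otherwise it
         * holds journal bucket indexes on the cache device.
         */
        return bch_has_feature_nvdimm_meta(&ca->sb);
}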
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/features.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h +index 09161b89c63e..fab92678be76 100644 +--- a/drivers/md/bcache/features.h ++++ b/drivers/md/bcache/features.h +@@ -18,11 +18,19 @@ + #define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET 0x0001 + /* real bucket size is (1 << bucket_size) */ + #define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE 0x0002 ++/* store bcache meta data on nvdimm */ ++#define BCH_FEATURE_INCOMPAT_NVDIMM_META 0x0004 + + #define BCH_FEATURE_COMPAT_SUPP 0 + #define BCH_FEATURE_RO_COMPAT_SUPP 0 ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++#define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ ++ BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE| \ ++ BCH_FEATURE_INCOMPAT_NVDIMM_META) ++#else + #define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ + BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE) ++#endif + + #define BCH_HAS_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_compat & (mask)) +@@ -90,6 +98,7 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \ + + BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET); + BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE); ++BCH_FEATURE_INCOMPAT_FUNCS(nvdimm_meta, NVDIMM_META); + + static inline bool bch_has_unknown_compat_features(struct cache_sb *sb) + { +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0009-bcache-initialize-bcache-journal-for-NVDIMM-meta.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0009-bcache-initialize-bcache-journal-for-NVDIMM-meta.patch new file mode 100644 index 0000000..3e63f08 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0009-bcache-initialize-bcache-journal-for-NVDIMM-meta.patch @@ -0,0 +1,255 @@ +From 6795c385696ab16a78e7b9cce7310a50a2522af5 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 21 Oct 2021 21:39:18 +0800 +Subject: [PATCH v13 09/12] bcache: initialize bcache journal for NVDIMM meta + device + +The nvm-pages allocator may store and index the NVDIMM pages allocated +for bcache journal. This patch adds the initialization to store bcache +journal space on NVDIMM pages if BCH_FEATURE_INCOMPAT_NVDIMM_META bit is +set by bcache-tools. + +If BCH_FEATURE_INCOMPAT_NVDIMM_META is set, get_nvdimm_journal_space() +will return the nvmpg_offset of NVDIMM pages for bcache journal, +- If there is previously allocated space, find it from nvm-pages owner + list and return to bch_journal_init(). +- If there is no previously allocated space, require a new NVDIMM range + from the nvm-pages allocator, and return it to bch_journal_init(). + +And in bch_journal_init(), keys in sb.d[] store the corresponding nvmpg +offset from NVDIMM into sb.d[i].ptr[0] where 'i' is the bucket index to +iterate all journal buckets. + +Later when bcache journaling code stores the journaling jset, the target +NVDIMM nvmpg offset stored (and updated) in sb.d[i].ptr[0] can be used +to calculate the linear address in memory copy from DRAM pages into +NVDIMM pages. 
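For illustration only (not part of this patch), how a journal slot decodes once sb.d[i] holds an nvmpg offset; example_decode_journal_slot() is a hypothetical helper built on the BCH_NVMPG_GET_NS_ID()/BCH_NVMPG_GET_OFFSET() macros from nvmpg.h.

static void example_decode_journal_slot(struct cache *ca, int i)
{
        /* sb.d[] entries are 64-bit; BCACHE_NVM_PAGES depends on 64BIT */
        unsigned long off = (unsigned long)ca->sb.d[i];

        /* high bits select the namespace, low bits are a byte offset */
        pr_info("journal slot %d: ns_id %lu, byte offset 0x%lx\n",
                i, BCH_NVMPG_GET_NS_ID(off), BCH_NVMPG_GET_OFFSET(off));
}

With the initialization in the diff below, slot i simply lives bucket_bytes(ca) * i bytes past the base returned by get_journal_nvmpg_space().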
+ +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 113 ++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/journal.h | 2 +- + drivers/md/bcache/nvmpg.c | 9 +++ + drivers/md/bcache/nvmpg.h | 1 + + drivers/md/bcache/super.c | 18 +++--- + 5 files changed, 132 insertions(+), 11 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 61bd79babf7a..d887557c718e 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -9,6 +9,8 @@ + #include "btree.h" + #include "debug.h" + #include "extents.h" ++#include "nvmpg.h" ++#include "features.h" + + #include <trace/events/bcache.h> + +@@ -982,3 +984,114 @@ int bch_journal_alloc(struct cache_set *c) + + return 0; + } ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static unsigned long find_journal_nvmpg_base(struct bch_nvmpg_head *nvmpg_head, ++ struct cache *ca) ++{ ++ unsigned long jnl_offset, jnl_pgoff, jnl_ns_id; ++ unsigned long ret_offset = 0; ++ int i; ++ ++ jnl_offset = (unsigned long)ca->sb.d[0]; ++ jnl_ns_id = BCH_NVMPG_GET_NS_ID(jnl_offset); ++ jnl_pgoff = BCH_NVMPG_GET_OFFSET(jnl_offset) >> PAGE_SHIFT; ++ ++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { ++ struct bch_nvmpg_recs *recs; ++ struct bch_nvmpg_rec *rec; ++ unsigned long recs_offset = 0; ++ int j; ++ ++ recs_offset = nvmpg_head->recs_offset[i]; ++ recs = bch_nvmpg_offset_to_ptr(recs_offset); ++ while (recs) { ++ for (j = 0; j < recs->size; j++) { ++ rec = &recs->recs[j]; ++ if ((rec->pgoff != jnl_pgoff) || ++ (rec->ns_id != jnl_ns_id)) ++ continue; ++ ++ ret_offset = jnl_offset; ++ goto out; ++ } ++ recs_offset = recs->next_offset; ++ recs = bch_nvmpg_offset_to_ptr(recs_offset); ++ } ++ } ++ ++out: ++ return ret_offset; ++} ++ ++static unsigned long get_journal_nvmpg_space(struct cache *ca) ++{ ++ struct bch_nvmpg_head *head = NULL; ++ unsigned long nvmpg_offset; ++ int order; ++ ++ head = bch_get_nvmpg_head(ca->sb.set_uuid); ++ if (head) { ++ nvmpg_offset = find_journal_nvmpg_base(head, ca); ++ if (nvmpg_offset) ++ goto found; ++ } ++ ++ order = ilog2((ca->sb.bucket_size * ++ ca->sb.njournal_buckets) / PAGE_SECTORS); ++ nvmpg_offset = bch_nvmpg_alloc_pages(order, ca->sb.set_uuid); ++ if (nvmpg_offset) ++ memset(bch_nvmpg_offset_to_ptr(nvmpg_offset), ++ 0, (1 << order) * PAGE_SIZE); ++found: ++ return nvmpg_offset; ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++static int __bch_journal_nvdimm_init(struct cache *ca) ++{ ++ int ret = -1; ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ int i; ++ unsigned long jnl_base = 0; ++ ++ jnl_base = get_journal_nvmpg_space(ca); ++ if (!jnl_base) { ++ pr_err("Failed to get journal space from nvdimm\n"); ++ goto out; ++ } ++ ++ /* Iniialized and reloaded from on-disk super block already */ ++ if (ca->sb.d[0] != 0) ++ goto out; ++ ++ for (i = 0; i < ca->sb.keys; i++) ++ ca->sb.d[i] = jnl_base + (bucket_bytes(ca) * i); ++ ++ ret = 0; ++out: ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++ return ret; ++} ++ ++ ++int bch_journal_init(struct cache_set *c) ++{ ++ int i, ret = 0; ++ struct cache *ca = c->cache; ++ ++ ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, ++ 2, SB_JOURNAL_BUCKETS); ++ ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) { ++ for (i = 0; i < ca->sb.keys; i++) ++ ca->sb.d[i] = ca->sb.first_bucket + i; ++ } else ++ ret = 
__bch_journal_nvdimm_init(ca); ++ ++ return ret; ++} +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index f2ea34d5f431..e3a7fa5a8fda 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -179,7 +179,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list); + void bch_journal_meta(struct cache_set *c, struct closure *cl); + int bch_journal_read(struct cache_set *c, struct list_head *list); + int bch_journal_replay(struct cache_set *c, struct list_head *list); +- ++int bch_journal_init(struct cache_set *c); + void bch_journal_free(struct cache_set *c); + int bch_journal_alloc(struct cache_set *c); + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index e26c7b578a62..1a3c6327b091 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -24,6 +24,15 @@ + + struct bch_nvmpg_set *global_nvmpg_set; + ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id) ++{ ++ if ((ns_id >= 0) && (ns_id < BCH_NVMPG_NS_MAX)) ++ return global_nvmpg_set->ns_tbl[ns_id]; ++ ++ pr_emerg("Invalid ns_id: %d\n", ns_id); ++ return NULL; ++} ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset) + { + int ns_id = BCH_NVMPG_GET_NS_ID(offset); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 2361cabf18be..f7b7177cced3 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -95,6 +95,7 @@ void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id); + + #else + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 74d51a0b806f..a27fa65d8832 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -147,9 +147,11 @@ static const char *read_super_common(struct cache_sb *sb, struct block_device * + goto err; + + err = "Journal buckets not sequential"; +- for (i = 0; i < sb->keys; i++) +- if (sb->d[i] != sb->first_bucket + i) +- goto err; ++ if (!bch_has_feature_nvdimm_meta(sb)) { ++ for (i = 0; i < sb->keys; i++) ++ if (sb->d[i] != sb->first_bucket + i) ++ goto err; ++ } + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) +@@ -2068,14 +2070,10 @@ static int run_cache_set(struct cache_set *c) + if (bch_journal_replay(c, &journal)) + goto err; + } else { +- unsigned int j; +- + pr_notice("invalidating existing data\n"); +- ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, +- 2, SB_JOURNAL_BUCKETS); +- +- for (j = 0; j < ca->sb.keys; j++) +- ca->sb.d[j] = ca->sb.first_bucket + j; ++ err = "error initializing journal"; ++ if (bch_journal_init(c)) ++ goto err; + + bch_initial_gc_finish(c); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0010-bcache-support-storing-bcache-journal-into-NVDIM.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0010-bcache-support-storing-bcache-journal-into-NVDIM.patch new file mode 100644 index 0000000..977fff6 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0010-bcache-support-storing-bcache-journal-into-NVDIM.patch @@ -0,0 +1,231 @@ +From 04919917230c65aa07f65a57a136f7994b017faf Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:45:23 +0800 +Subject: [PATCH v13 10/12] bcache: support storing bcache journal into NVDIMM + meta device + +This patch implements two methods to store 
bcache journal to, +1) __journal_write_unlocked() for block interface device + The latency method to compose bio and issue the jset bio to cache + device (e.g. SSD). c->journal.key.ptr[0] indicates the LBA on cache + device to store the journal jset. +2) __journal_nvdimm_write_unlocked() for memory interface NVDIMM + Use memory interface to access NVDIMM pages and store the jset by + memcpy_flushcache(). c->journal.key.ptr[0] indicates the linear + address from the NVDIMM pages to store the journal jset. + +For legacy configuration without NVDIMM meta device, journal I/O is +handled by __journal_write_unlocked() with existing code logic. If the +NVDIMM meta device is used (by bcache-tools), the journal I/O will +be handled by __journal_nvdimm_write_unlocked() and go into the NVDIMM +pages. + +And when NVDIMM meta device is used, sb.d[] stores the linear addresses +from NVDIMM pages (no more bucket index), in journal_reclaim() the +journaling location in c->journal.key.ptr[0] should also be updated by +linear address from NVDIMM pages (no more LBA combined by sectors offset +and bucket index). + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 120 +++++++++++++++++++++++++----------- + drivers/md/bcache/super.c | 3 +- + 2 files changed, 85 insertions(+), 38 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index d887557c718e..7d5c5ed18890 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -596,6 +596,8 @@ static void do_journal_discard(struct cache *ca) + return; + } + ++ BUG_ON(bch_has_feature_nvdimm_meta(&ca->sb)); ++ + switch (atomic_read(&ja->discard_in_flight)) { + case DISCARD_IN_FLIGHT: + return; +@@ -661,9 +663,16 @@ static void journal_reclaim(struct cache_set *c) + goto out; + + ja->cur_idx = next; +- k->ptr[0] = MAKE_PTR(0, +- bucket_to_sector(c, ca->sb.d[ja->cur_idx]), +- ca->sb.nr_this_dev); ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ k->ptr[0] = MAKE_PTR(0, ++ bucket_to_sector(c, ca->sb.d[ja->cur_idx]), ++ ca->sb.nr_this_dev); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ k->ptr[0] = (unsigned long)bch_nvmpg_offset_to_ptr( ++ ca->sb.d[ja->cur_idx]); ++#endif ++ + atomic_long_inc(&c->reclaimed_journal_buckets); + + bkey_init(k); +@@ -729,46 +738,21 @@ static void journal_write_unlock(struct closure *cl) + spin_unlock(&c->journal.lock); + } + +-static void journal_write_unlocked(struct closure *cl) ++ ++static void __journal_write_unlocked(struct cache_set *c) + __releases(c->journal.lock) + { +- struct cache_set *c = container_of(cl, struct cache_set, journal.io); +- struct cache *ca = c->cache; +- struct journal_write *w = c->journal.cur; + struct bkey *k = &c->journal.key; +- unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * +- ca->sb.block_size; +- ++ struct journal_write *w = c->journal.cur; ++ struct closure *cl = &c->journal.io; ++ struct cache *ca = c->cache; + struct bio *bio; + struct bio_list list; ++ unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * ++ ca->sb.block_size; + + bio_list_init(&list); + +- if (!w->need_write) { +- closure_return_with_destructor(cl, journal_write_unlock); +- return; +- } else if (journal_full(&c->journal)) { +- journal_reclaim(c); +- spin_unlock(&c->journal.lock); +- 
+- btree_flush_write(c); +- continue_at(cl, journal_write, bch_journal_wq); +- return; +- } +- +- c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); +- +- w->data->btree_level = c->root->level; +- +- bkey_copy(&w->data->btree_root, &c->root->key); +- bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); +- +- w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; +- w->data->magic = jset_magic(&ca->sb); +- w->data->version = BCACHE_JSET_VERSION; +- w->data->last_seq = last_seq(&c->journal); +- w->data->csum = csum_set(w->data); +- + for (i = 0; i < KEY_PTRS(k); i++) { + ca = c->cache; + bio = &ca->journal.bio; +@@ -793,7 +777,6 @@ static void journal_write_unlocked(struct closure *cl) + + ca->journal.seq[ca->journal.cur_idx] = w->data->seq; + } +- + /* If KEY_PTRS(k) == 0, this jset gets lost in air */ + BUG_ON(i == 0); + +@@ -805,6 +788,71 @@ static void journal_write_unlocked(struct closure *cl) + + while ((bio = bio_list_pop(&list))) + closure_bio_submit(c, bio, cl); ++} ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static void __journal_nvdimm_write_unlocked(struct cache_set *c) ++ __releases(c->journal.lock) ++{ ++ struct journal_write *w = c->journal.cur; ++ struct cache *ca = c->cache; ++ unsigned int sectors; ++ ++ sectors = set_blocks(w->data, block_bytes(ca)) * ca->sb.block_size; ++ atomic_long_add(sectors, &ca->meta_sectors_written); ++ ++ memcpy_flushcache((void *)c->journal.key.ptr[0], w->data, sectors << 9); ++ ++ c->journal.key.ptr[0] += sectors << 9; ++ ca->journal.seq[ca->journal.cur_idx] = w->data->seq; ++ ++ atomic_dec_bug(&fifo_back(&c->journal.pin)); ++ bch_journal_next(&c->journal); ++ journal_reclaim(c); ++ ++ spin_unlock(&c->journal.lock); ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++static void journal_write_unlocked(struct closure *cl) ++{ ++ struct cache_set *c = container_of(cl, struct cache_set, journal.io); ++ struct cache *ca = c->cache; ++ struct journal_write *w = c->journal.cur; ++ ++ if (!w->need_write) { ++ closure_return_with_destructor(cl, journal_write_unlock); ++ return; ++ } else if (journal_full(&c->journal)) { ++ journal_reclaim(c); ++ spin_unlock(&c->journal.lock); ++ ++ btree_flush_write(c); ++ continue_at(cl, journal_write, bch_journal_wq); ++ return; ++ } ++ ++ c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); ++ ++ w->data->btree_level = c->root->level; ++ ++ bkey_copy(&w->data->btree_root, &c->root->key); ++ bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); ++ ++ w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; ++ w->data->magic = jset_magic(&ca->sb); ++ w->data->version = BCACHE_JSET_VERSION; ++ w->data->last_seq = last_seq(&c->journal); ++ w->data->csum = csum_set(w->data); ++ ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ __journal_write_unlocked(c); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ __journal_nvdimm_write_unlocked(c); ++#endif + + continue_at(cl, journal_write_done, NULL); + } +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index a27fa65d8832..45b69ddc9cfa 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1679,7 +1679,7 @@ void bch_cache_set_release(struct kobject *kobj) + static void cache_set_free(struct closure *cl) + { + struct cache_set *c = container_of(cl, struct cache_set, cl); +- struct cache *ca; ++ struct cache *ca = c->cache; + + debugfs_remove(c->debug); + +@@ -1691,7 +1691,6 @@ static void cache_set_free(struct closure *cl) + bch_bset_sort_state_free(&c->sort); + free_pages((unsigned long) c->uuids, 
ilog2(meta_bucket_pages(&c->cache->sb))); + +- ca = c->cache; + if (ca) { + ca->set = NULL; + c->cache = NULL; +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0011-bcache-read-jset-from-NVDIMM-pages-for-journal-r.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0011-bcache-read-jset-from-NVDIMM-pages-for-journal-r.patch new file mode 100644 index 0000000..77ca2b5 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0011-bcache-read-jset-from-NVDIMM-pages-for-journal-r.patch @@ -0,0 +1,182 @@ +From 2e1f37377d63412b139e8aa55a8731bf95c91767 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:54:12 +0800 +Subject: [PATCH v13 11/12] bcache: read jset from NVDIMM pages for journal + replay + +This patch implements two methods to read jset from media for journal +replay, +- __jnl_rd_bkt() for block device + This is the legacy method to read jset via block device interface. +- __jnl_rd_nvm_bkt() for NVDIMM + This is the method to read jset from NVDIMM memory interface, a.k.a + memcopy() from NVDIMM pages to DRAM pages. + +If BCH_FEATURE_INCOMPAT_NVDIMM_META is set in incompat feature set, +during running cache set, journal_read_bucket() will read the journal +content from NVDIMM by __jnl_rd_nvm_bkt(). The linear addresses of +NVDIMM pages to read jset are stored in sb.d[SB_JOURNAL_BUCKETS], which +were initialized and maintained in previous runs of the cache set. + +A thing should be noticed is, when bch_journal_read() is called, the +linear address of NVDIMM pages is not loaded and initialized yet, it +is necessary to call __bch_journal_nvdimm_init() before reading the jset +from NVDIMM pages. + +The code comments added in journal_read_bucket() is noticed by kernel +test robot and Dan Carpenter, it explains why it is safe to only check +!bch_has_feature_nvdimm_meta() condition in the if() statement when +CONFIG_BCACHE_NVM_PAGES is not configured. To avoid confusion from the +bogus warning message from static checking tool. 
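As a stand-alone illustration of the unit handling described above, the
following user-space sketch mimics the arithmetic __jnl_rd_nvm_bkt() does:
sb.d[bkt_idx] holds an nvm-pages offset, while the offset and len arguments
are counted in 512-byte sectors, so both are shifted left by 9 to become
byte quantities. Here nvmpg_offset_to_ptr() is only a stub standing in for
bch_nvmpg_offset_to_ptr(), and all buffer sizes and example values are made
up for the sketch.

  #include <stdio.h>
  #include <string.h>
  #include <stdint.h>

  static unsigned char pmem[1 << 20];        /* pretend NVDIMM mapping */
  static unsigned char jset_buf[8 << 9];     /* stand-in for journal.w[0].data */

  /* stub: the real helper maps an nvm-pages offset to a kernel linear address */
  static void *nvmpg_offset_to_ptr(uint64_t nvmpg_offset)
  {
          return pmem + nvmpg_offset;
  }

  int main(void)
  {
          uint64_t sb_d_entry = 64 << 9;  /* what sb.d[bkt_idx] might hold   */
          unsigned int offset = 16;       /* sectors already read in bucket  */
          unsigned int len = 8;           /* sectors of jset to copy         */
          void *jset_addr;

          /* same shape as: bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9) */
          jset_addr = (unsigned char *)nvmpg_offset_to_ptr(sb_d_entry) +
                      ((size_t)offset << 9);

          /* same shape as: memcpy(data, jset_addr, len << 9) */
          memcpy(jset_buf, jset_addr, (size_t)len << 9);

          printf("copied %u sectors (%u bytes), %u sectors into the bucket\n",
                 len, len << 9, offset);
          return 0;
  }

Running the sketch simply reports copying 8 sectors (4096 bytes); the point
is that every sector count in the NVDIMM read path turns into a byte count
with the same << 9 shift that the bio path applies to bi_size.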
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reported-by: kernel test robot <lkp@intel.com> +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 88 ++++++++++++++++++++++++++++++------- + 1 file changed, 71 insertions(+), 17 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 7d5c5ed18890..902992be9191 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -34,18 +34,60 @@ static void journal_read_endio(struct bio *bio) + closure_put(cl); + } + ++static struct jset *__jnl_rd_bkt(struct cache *ca, unsigned int bkt_idx, ++ unsigned int len, unsigned int offset, ++ struct closure *cl) ++{ ++ sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bkt_idx]); ++ struct bio *bio = &ca->journal.bio; ++ struct jset *data = ca->set->journal.w[0].data; ++ ++ bio_reset(bio); ++ bio->bi_iter.bi_sector = bucket + offset; ++ bio_set_dev(bio, ca->bdev); ++ bio->bi_iter.bi_size = len << 9; ++ ++ bio->bi_end_io = journal_read_endio; ++ bio->bi_private = cl; ++ bio_set_op_attrs(bio, REQ_OP_READ, 0); ++ bch_bio_map(bio, data); ++ ++ closure_bio_submit(ca->set, bio, cl); ++ closure_sync(cl); ++ ++ /* Indeed journal.w[0].data */ ++ return data; ++} ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static struct jset *__jnl_rd_nvm_bkt(struct cache *ca, unsigned int bkt_idx, ++ unsigned int len, unsigned int offset) ++{ ++ void *jset_addr; ++ struct jset *data; ++ ++ jset_addr = bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9); ++ data = ca->set->journal.w[0].data; ++ ++ memcpy(data, jset_addr, len << 9); ++ ++ /* Indeed journal.w[0].data */ ++ return data; ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ + static int journal_read_bucket(struct cache *ca, struct list_head *list, + unsigned int bucket_index) + { + struct journal_device *ja = &ca->journal; +- struct bio *bio = &ja->bio; + + struct journal_replay *i; +- struct jset *j, *data = ca->set->journal.w[0].data; ++ struct jset *j; + struct closure cl; + unsigned int len, left, offset = 0; + int ret = 0; +- sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); + + closure_init_stack(&cl); + +@@ -55,26 +97,27 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, + reread: left = ca->sb.bucket_size - offset; + len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); + +- bio_reset(bio); +- bio->bi_iter.bi_sector = bucket + offset; +- bio_set_dev(bio, ca->bdev); +- bio->bi_iter.bi_size = len << 9; +- +- bio->bi_end_io = journal_read_endio; +- bio->bi_private = &cl; +- bio_set_op_attrs(bio, REQ_OP_READ, 0); +- bch_bio_map(bio, data); +- +- closure_bio_submit(ca->set, bio, &cl); +- closure_sync(&cl); ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ j = __jnl_rd_bkt(ca, bucket_index, len, offset, &cl); ++ /* ++ * If CONFIG_BCACHE_NVM_PAGES is not defined, the feature bit ++ * BCH_FEATURE_INCOMPAT_NVDIMM_META won't in incompatible ++ * support feature set, a cache device format with feature bit ++ * BCH_FEATURE_INCOMPAT_NVDIMM_META will fail much earlier in ++ * read_super() by bch_has_unknown_incompat_features(). ++ * Therefore when CONFIG_BCACHE_NVM_PAGES is not define, it is ++ * safe to ignore the bch_has_feature_nvdimm_meta() condition. 
++ */ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ j = __jnl_rd_nvm_bkt(ca, bucket_index, len, offset); ++#endif + + /* This function could be simpler now since we no longer write + * journal entries that overlap bucket boundaries; this means + * the start of a bucket will always have a valid journal entry + * if it has any journal entries at all. + */ +- +- j = data; + while (len) { + struct list_head *where; + size_t blocks, bytes = set_bytes(j); +@@ -170,6 +213,8 @@ reread: left = ca->sb.bucket_size - offset; + return ret; + } + ++static int __bch_journal_nvdimm_init(struct cache *ca); ++ + int bch_journal_read(struct cache_set *c, struct list_head *list) + { + #define read_bucket(b) \ +@@ -188,6 +233,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) + unsigned int i, l, r, m; + uint64_t seq; + ++ /* ++ * Linear addresses of NVDIMM pages for journaling is not ++ * initialized yet, do it before read jset from NVDIMM pages. ++ */ ++ if (bch_has_feature_nvdimm_meta(&ca->sb)) { ++ if (__bch_journal_nvdimm_init(ca) < 0) ++ return -ENXIO; ++ } ++ + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); + +-- +2.31.1 + diff --git a/for-next/nvmpg-bcache-journaling-v13/v13-0012-bcache-add-sysfs-interface-register_nvdimm_meta-.patch b/for-next/nvmpg-bcache-journaling-v13/v13-0012-bcache-add-sysfs-interface-register_nvdimm_meta-.patch new file mode 100644 index 0000000..b2f0330 --- /dev/null +++ b/for-next/nvmpg-bcache-journaling-v13/v13-0012-bcache-add-sysfs-interface-register_nvdimm_meta-.patch @@ -0,0 +1,84 @@ +From e1f37c78f682ca8d7d0dee51ee8a0ee884f92df5 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:55:25 +0800 +Subject: [PATCH v13 12/12] bcache: add sysfs interface register_nvdimm_meta to + register NVDIMM meta device + +This patch adds a sysfs interface register_nvdimm_meta to register +NVDIMM meta device. The sysfs interface file only shows up when +CONFIG_BCACHE_NVM_PAGES=y. 
Then a NVDIMM name space formatted by +bcache-tools can be registered into bcache by e.g., + echo /dev/pmem0 > /sys/fs/bcache/register_nvdimm_meta + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/super.c | 29 +++++++++++++++++++++++++++++ + 1 file changed, 29 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 45b69ddc9cfa..2b9cde44879b 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2405,10 +2405,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, size_t size); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++static ssize_t register_nvdimm_meta(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, size_t size); ++#endif + + kobj_attribute_write(register, register_bcache); + kobj_attribute_write(register_quiet, register_bcache); + kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++kobj_attribute_write(register_nvdimm_meta, register_nvdimm_meta); ++#endif + + static bool bch_is_open_backing(dev_t dev) + { +@@ -2522,6 +2530,24 @@ static void register_device_async(struct async_reg_args *args) + queue_delayed_work(system_wq, &args->reg_work, 10); + } + ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++static ssize_t register_nvdimm_meta(struct kobject *k, struct kobj_attribute *attr, ++ const char *buffer, size_t size) ++{ ++ ssize_t ret = size; ++ ++ struct bch_nvmpg_ns *ns = bch_register_namespace(buffer); ++ ++ if (IS_ERR(ns)) { ++ pr_err("register nvdimm namespace %s for meta device failed.\n", ++ buffer); ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++#endif ++ + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size) + { +@@ -2864,6 +2890,9 @@ static int __init bcache_init(void) + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ &ksysfs_register_nvdimm_meta.attr, ++#endif + &ksysfs_pendings_cleanup.attr, + NULL + }; +-- +2.31.1 + diff --git a/for-test/0001-bcache-avoid-unnecessary-soft-lockup-in-kworker-upda.patch b/for-test/0001-bcache-avoid-unnecessary-soft-lockup-in-kworker-upda.patch new file mode 100644 index 0000000..d2727ea --- /dev/null +++ b/for-test/0001-bcache-avoid-unnecessary-soft-lockup-in-kworker-upda.patch @@ -0,0 +1,166 @@ +From 8ddc4c14ecef71ebc56d86ad0fd6721d348898d0 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 29 Mar 2022 00:08:49 +0800 +Subject: [PATCH] bcache: avoid unnecessary soft lockup in kworker + update_writeback_rate() + +The kworker routine update_writeback_rate() is schedued to update the +writeback rate in every 5 seconds by default. Before calling +__update_writeback_rate() to do real job, semaphore dc->writeback_lock +should be held by the kworker routine. + +At the same time, bcache writeback thread routine bch_writeback_thread() +also needs to hold dc->writeback_lock before flushing dirty data back +into the backing device. 
If the dirty data set is large, it might be +very long time for bch_writeback_thread() to scan all dirty buckets and +releases dc->writeback_lock. In such case update_writeback_rate() can be +starved for long enough time so that kernel reports a soft lockup warn- +ing started like: + watchdog: BUG: soft lockup - CPU#246 stuck for 23s! [kworker/246:31:179713] + +Such soft lockup condition is unnecessary, because after the writeback +thread finishes its job and releases dc->writeback_lock, the kworker +update_writeback_rate() may continue to work and everything is fine +indeed. + +This patch avoids the unnecessary soft lockup by the following method, +- Add new members to struct cached_dev + - dc->retry_nr (0 by default) + - dc->retry_max (6 by default) +- In update_writeback_rate() call down_read_trylock(&dc->writeback_lock) + firstly, if it fails then lock contention happens. If dc->retry_nr is + smaller than dc->retry_max, increase 1 to dc->retry_nr, and reschedule + the kworker to retry after a bit long time. +- If lock contention happens and dc->retry_nr is equal to dc->retry_max, + no retry anymore and call down_read(&dc->writeback_lock) to wait for the + lock. + +By the above method, at worst case update_writeback_rate() may retry for +2+ minutes before blocking on dc->writeback_lock by calling down_read(). +For a 4TB cache device with 1TB dirty data, 90%+ of the unnecessary soft +lockup warning message can be avoided. + +When retrying to acquire dc->writeback_lock in update_writeback_rate(), +of course the writeback rate cannot be updated. It is fair, because when +the kworker is blocked on the lock contention of dc->writeback_lock, the +writeback rate cannot be updated neither. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 7 +++++ + drivers/md/bcache/writeback.c | 49 +++++++++++++++++++++++++++++++---- + 2 files changed, 51 insertions(+), 5 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 9ed9c955add7..82b86b874294 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -395,6 +395,13 @@ struct cached_dev { + atomic_t io_errors; + unsigned int error_limit; + unsigned int offline_seconds; ++ ++ /* ++ * Retry to update writeback_rate if contention happens for ++ * down_read(dc->writeback_lock) in update_writeback_rate() ++ */ ++ unsigned int retry_nr; ++ unsigned int retry_max; + }; + + enum alloc_reserve { +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 9ee0005874cd..dbe90b9b2940 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -214,6 +214,7 @@ static void update_writeback_rate(struct work_struct *work) + struct cached_dev, + writeback_rate_update); + struct cache_set *c = dc->disk.c; ++ bool contention = false; + + /* + * should check BCACHE_DEV_RATE_DW_RUNNING before calling +@@ -235,6 +236,7 @@ static void update_writeback_rate(struct work_struct *work) + return; + } + ++ + if (atomic_read(&dc->has_dirty) && dc->writeback_percent) { + /* + * If the whole cache set is idle, set_at_max_writeback_rate() +@@ -243,13 +245,44 @@ static void update_writeback_rate(struct work_struct *work) + * in maximum writeback rate number(s). 
+ */ + if (!set_at_max_writeback_rate(c, dc)) { +- down_read(&dc->writeback_lock); +- __update_writeback_rate(dc); +- update_gc_after_writeback(c); +- up_read(&dc->writeback_lock); ++ /* ++ * When contention happens on dc->writeback_lock with ++ * the writeback thread, this kwork may be blocked for ++ * very long time if there are too many dirty data to ++ * writeback, and kerne message will complain a (bogus) ++ * software lockup kernel message. To avoid potential ++ * starving, if down_read_trylock() fails, writeback ++ * rate updating will be skipped for dc->retry_max times ++ * at most while delay this worker a bit longer time. ++ * If dc->retry_max times are tried and the trylock ++ * still fails, then call down_read() to wait for ++ * dc->writeback_lock. ++ */ ++ if (!down_read_trylock((&dc->writeback_lock))) { ++ contention = true; ++ ++ if (dc->retry_nr < dc->retry_max) { ++ dc->retry_nr++; ++ } else { ++ down_read(&dc->writeback_lock); ++ dc->retry_nr = 0; ++ } ++ } ++ ++ if (!dc->retry_nr) { ++ __update_writeback_rate(dc); ++ update_gc_after_writeback(c); ++ up_read(&dc->writeback_lock); ++ } + } + } + ++ /* ++ * In case no lock contention on dc->writeback_lock happens since ++ * last retry, e.g. cache is clean or I/O idle for a while. ++ */ ++ if (!contention && dc->retry_nr) ++ dc->retry_nr = 0; + + /* + * CACHE_SET_IO_DISABLE might be set via sysfs interface, +@@ -257,8 +290,10 @@ static void update_writeback_rate(struct work_struct *work) + */ + if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && + !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { ++ unsigned int scale = 1 + dc->retry_nr; ++ + schedule_delayed_work(&dc->writeback_rate_update, +- dc->writeback_rate_update_seconds * HZ); ++ dc->writeback_rate_update_seconds * scale * HZ); + } + + /* +@@ -1032,6 +1067,10 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) + dc->writeback_rate_fp_term_high = 1000; + dc->writeback_rate_i_term_inverse = 10000; + ++ /* For dc->writeback_lock contention in update_writeback_rate() */ ++ dc->retry_nr = 0; ++ dc->retry_max = 6; ++ + WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); + } +-- +2.34.1 + diff --git a/for-test/badblocks/v4/backup/0001-badblocks-add-more-helper-structure-and-routines-in-.patch b/for-test/badblocks/v4/backup/0001-badblocks-add-more-helper-structure-and-routines-in-.patch new file mode 100644 index 0000000..b1b4bae --- /dev/null +++ b/for-test/badblocks/v4/backup/0001-badblocks-add-more-helper-structure-and-routines-in-.patch @@ -0,0 +1,92 @@ +From db29a2e95f4ad4ec1ba58a71203a60ebd867d8c9 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 10:57:26 +0800 +Subject: [PATCH 1/6] badblocks: add more helper structure and routines in + badblocks.h + +This patch adds the following helper structure and routines into +badblocks.h, +- struct badblocks_context + This structure is used in improved badblocks code for bad table + iteration. +- BB_END() + The macro to culculate end LBA of a bad range record from bad + table. +- badblocks_full() and badblocks_empty() + The inline routines to check whether bad table is full or empty. +- set_changed() and clear_changed() + The inline routines to set and clear 'changed' tag from struct + badblocks. + +These new helper structure and routines can help to make the code more +clear, they will be used in the improved badblocks code in following +patches. 
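To make the encoding these helpers operate on concrete: each bad table entry
is a single u64 that packs the start sector, the length minus one and the
acknowledged bit (the top bit), exactly as BB_MAKE() composes it. The small
stand-alone sketch below re-expresses BB_OFFSET()/BB_LEN()/BB_ACK() with
explicit shifts so it compiles without the kernel mask constants; the bit
layout (9-bit length, ack in bit 63) is assumed from BB_MAKE() and the
BB_MAX_LEN limit, and the sample values are arbitrary.

  #include <stdio.h>
  #include <stdint.h>

  typedef uint64_t u64;

  /* same composition as the kernel's BB_MAKE(a, l, ack) */
  #define BB_MAKE(a, l, ack)  (((u64)(a) << 9) | ((l) - 1) | ((u64)(!!(ack)) << 63))

  /* assumed-equivalent decoders: bits 0-8 length-1, bits 9-62 offset, bit 63 ack */
  #define BB_OFFSET(x)        (((x) & 0x7fffffffffffffffULL) >> 9)
  #define BB_LEN(x)           (((x) & 0x1ffULL) + 1)
  #define BB_ACK(x)           (!!((x) >> 63))
  #define BB_END(x)           (BB_OFFSET(x) + BB_LEN(x))

  int main(void)
  {
          /* an acknowledged bad range starting at sector 32, 12 sectors long */
          u64 e = BB_MAKE(32, 12, 1);

          printf("offset=%llu len=%llu ack=%d end=%llu\n",
                 (unsigned long long)BB_OFFSET(e),
                 (unsigned long long)BB_LEN(e),
                 (int)BB_ACK(e),
                 (unsigned long long)BB_END(e));  /* offset=32 len=12 ack=1 end=44 */
          return 0;
  }

BB_END() is simply the first sector after the range, which is why the later
merge and overlap helpers can compare it directly against the start sector
of a neighbouring entry in the bad table.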
+ +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + include/linux/badblocks.h | 32 ++++++++++++++++++++++++++++++++ + 1 file changed, 32 insertions(+) + +diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h +index 2426276b9bd3..e1a06bacb2a2 100644 +--- a/include/linux/badblocks.h ++++ b/include/linux/badblocks.h +@@ -15,6 +15,7 @@ + #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) + #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) + #define BB_ACK(x) (!!((x) & BB_ACK_MASK)) ++#define BB_END(x) (BB_OFFSET(x) + BB_LEN(x)) + #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) + + /* Bad block numbers are stored sorted in a single page. +@@ -41,6 +42,14 @@ struct badblocks { + sector_t size; /* in sectors */ + }; + ++struct badblocks_context { ++ sector_t start; ++ sector_t len; ++ int ack; ++ sector_t orig_start; ++ sector_t orig_len; ++}; ++ + int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors); + int badblocks_set(struct badblocks *bb, sector_t s, int sectors, +@@ -63,4 +72,27 @@ static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb) + } + badblocks_exit(bb); + } ++ ++static inline int badblocks_full(struct badblocks *bb) ++{ ++ return (bb->count >= MAX_BADBLOCKS); ++} ++ ++static inline int badblocks_empty(struct badblocks *bb) ++{ ++ return (bb->count == 0); ++} ++ ++static inline void set_changed(struct badblocks *bb) ++{ ++ if (bb->changed != 1) ++ bb->changed = 1; ++} ++ ++static inline void clear_changed(struct badblocks *bb) ++{ ++ if (bb->changed != 0) ++ bb->changed = 0; ++} ++ + #endif +-- +2.31.1 + diff --git a/for-test/badblocks/v4/backup/0002-badblocks-add-helper-routines-for-badblock-ranges-ha.patch b/for-test/badblocks/v4/backup/0002-badblocks-add-helper-routines-for-badblock-ranges-ha.patch new file mode 100644 index 0000000..62198ee --- /dev/null +++ b/for-test/badblocks/v4/backup/0002-badblocks-add-helper-routines-for-badblock-ranges-ha.patch @@ -0,0 +1,456 @@ +From d24ea1527077d06b0b579bbf7d1128d94af15d70 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 17:16:57 +0800 +Subject: [PATCH 2/6] badblocks: add helper routines for badblock ranges + handling + +This patch adds several helper routines to improve badblock ranges +handling. These helper routines will be used later in the improved +version of badblocks_set()/badblocks_clear()/badblocks_check(). + +- Helpers prev_by_hint() and prev_badblocks() are used to find the bad + range from bad table which the searching range starts at or after. + +- The following helpers are to decide the relative layout between the + manipulating range and existing bad block range from bad table. + - can_merge_behind() + Return 'true' if the manipulating range can backward merge with the + bad block range. + - can_merge_front() + Return 'true' if the manipulating range can forward merge with the + bad block range. + - can_combine_front() + Return 'true' if two adjacent bad block ranges before the + manipulating range can be merged. + - overlap_front() + Return 'true' if the manipulating range exactly overlaps with the + bad block range in front of its range. + - overlap_behind() + Return 'true' if the manipulating range exactly overlaps with the + bad block range behind its range. 
+ - can_front_overwrite() + Return 'true' if the manipulating range can forward overwrite the + bad block range in front of its range. + +- The following helpers are to add the manipulating range into the bad + block table. Different routine is called with the specific relative + layout between the maniplating range and other bad block range in the + bad block table. + - behind_merge() + Merge the maniplating range with the bad block range behind its + range, and return the number of merged length in unit of sector. + - front_merge() + Merge the maniplating range with the bad block range in front of + its range, and return the number of merged length in unit of sector. + - front_combine() + Combine the two adjacent bad block ranges before the manipulating + range into a larger one. + - front_overwrite() + Overwrite partial of whole bad block range which is in front of the + manipulating range. The overwrite may split existing bad block range + and generate more bad block ranges into the bad block table. + - insert_at() + Insert the manipulating range at a specific location in the bad + block table. + +All the above helpers are used in later patches to improve the bad block +ranges handling for badblocks_set()/badblocks_clear()/badblocks_check(). + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 374 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 374 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index d39056630d9c..e85a7cd23aad 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -16,6 +16,380 @@ + #include <linux/types.h> + #include <linux/slab.h> + ++/* ++ * Find the range starts at-or-before 's' from bad table. The search ++ * starts from index 'hint' and stops at index 'hint_end' from the bad ++ * table. ++ */ ++static int prev_by_hint(struct badblocks *bb, sector_t s, int hint) ++{ ++ int hint_end = hint + 2; ++ u64 *p = bb->page; ++ int ret = -1; ++ ++ while ((hint < hint_end) && ((hint + 1) <= bb->count) && ++ (BB_OFFSET(p[hint]) <= s)) { ++ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) { ++ ret = hint; ++ break; ++ } ++ hint++; ++ } ++ ++ return ret; ++} ++ ++/* ++ * Find the range starts at-or-before bad->start. If 'hint' is provided ++ * (hint >= 0) then search in the bad table from hint firstly. It is ++ * very probably the wanted bad range can be found from the hint index, ++ * then the unnecessary while-loop iteration can be avoided. ++ */ ++static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad, ++ int hint) ++{ ++ sector_t s = bad->start; ++ int ret = -1; ++ int lo, hi; ++ u64 *p; ++ ++ if (!bb->count) ++ goto out; ++ ++ if (hint >= 0) { ++ ret = prev_by_hint(bb, s, hint); ++ if (ret >= 0) ++ goto out; ++ } ++ ++ lo = 0; ++ hi = bb->count; ++ p = bb->page; ++ ++ while (hi - lo > 1) { ++ int mid = (lo + hi)/2; ++ sector_t a = BB_OFFSET(p[mid]); ++ ++ if (a <= s) ++ lo = mid; ++ else ++ hi = mid; ++ } ++ ++ if (BB_OFFSET(p[lo]) <= s) ++ ret = lo; ++out: ++ return ret; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be backward merged ++ * with the bad range (from the bad table) index by 'behind'. 
++ */ ++static bool can_merge_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ ++ if ((s <= BB_OFFSET(p[behind])) && ++ ((s + sectors) >= BB_OFFSET(p[behind])) && ++ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) && ++ BB_ACK(p[behind]) == ack) ++ return true; ++ return false; ++} ++ ++/* ++ * Do backward merge for range indicated by 'bad' and the bad range ++ * (from the bad table) indexed by 'behind'. The return value is merged ++ * sectors from bad->len. ++ */ ++static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_OFFSET(p[behind])); ++ WARN_ON((s + sectors) < BB_OFFSET(p[behind])); ++ ++ if (s < BB_OFFSET(p[behind])) { ++ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); ++ ++ merged = min_t(sector_t, sectors, BB_OFFSET(p[behind]) - s); ++ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, ack); ++ } else { ++ merged = min_t(sector_t, sectors, BB_LEN(p[behind])); ++ } ++ ++ WARN_ON(merged == 0); ++ ++ return merged; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be forward ++ * merged with the bad range (from the bad table) indexed by 'prev'. ++ */ ++static bool can_merge_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ ++ if (BB_ACK(p[prev]) == ack && ++ (s < BB_END(p[prev]) || ++ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN)))) ++ return true; ++ return false; ++} ++ ++/* ++ * Do forward merge for range indicated by 'bad' and the bad range ++ * (from bad table) indexed by 'prev'. The return value is sectors ++ * merged from bad->len. ++ */ ++static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_END(p[prev])); ++ ++ if (s < BB_END(p[prev])) { ++ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s); ++ } else { ++ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev])); ++ if ((prev + 1) < bb->count && ++ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) { ++ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]); ++ } ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + merged, ack); ++ } ++ ++ return merged; ++} ++ ++/* ++ * 'Combine' is a special case which can_merge_front() is not able to ++ * handle: If a bad range (indexed by 'prev' from bad table) exactly ++ * starts as bad->start, and the bad range ahead of 'prev' (indexed by ++ * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and ++ * the sum of their lengths does not exceed BB_MAX_LEN limitation, then ++ * these two bad range (from bad table) can be combined. ++ * ++ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad ++ * table can be combined. 
++ */ ++static bool can_combine_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if ((prev > 0) && ++ (BB_OFFSET(p[prev]) == bad->start) && ++ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) && ++ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) && ++ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev]))) ++ return true; ++ return false; ++} ++ ++/* ++ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad ++ * table) into one larger bad range, and the new range is indexed by ++ * 'prev - 1'. ++ */ ++static void front_combine(struct badblocks *bb, int prev) ++{ ++ u64 *p = bb->page; ++ ++ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]), ++ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]), ++ BB_ACK(p[prev])); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8); ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly forward ++ * overlapped with the bad range (from bad table) indexed by 'front'. ++ * Exactly forward overlap means the bad range (from bad table) indexed ++ * by 'prev' does not cover the whole range indicated by 'bad'. ++ */ ++static bool overlap_front(struct badblocks *bb, int front, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start >= BB_OFFSET(p[front]) && ++ bad->start < BB_END(p[front])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly backward ++ * overlapped with the bad range (from bad table) indexed by 'behind'. ++ */ ++static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start < BB_OFFSET(p[behind]) && ++ (bad->start + bad->len) > BB_OFFSET(p[behind])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can overwrite the bad ++ * range (from bad table) indexed by 'prev'. ++ * ++ * The range indicated by 'bad' can overwrite the bad range indexed by ++ * 'prev' when, ++ * 1) The whole range indicated by 'bad' can cover partial or whole bad ++ * range (from bad table) indexed by 'prev'. ++ * 2) The ack value of 'bad' is larger or equal to the ack value of bad ++ * range 'prev'. ++ * ++ * If the overwriting doesn't cover the whole bad range (from bad table) ++ * indexed by 'prev', new range might be split from existing bad range, ++ * 1) The overwrite covers head or tail part of existing bad range, 1 ++ * extra bad range will be split and added into the bad table. ++ * 2) The overwrite covers middle of existing bad range, 2 extra bad ++ * ranges will be split (ahead and after the overwritten range) and ++ * added into the bad table. ++ * The number of extra split ranges of the overwriting is stored in ++ * 'extra' and returned for the caller. ++ */ ++static bool can_front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *extra) ++{ ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(!overlap_front(bb, prev, bad)); ++ ++ if (BB_ACK(p[prev]) >= bad->ack) ++ return false; ++ ++ if (BB_END(p[prev]) <= (bad->start + bad->len)) { ++ len = BB_END(p[prev]) - bad->start; ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 0; ++ else ++ *extra = 1; ++ ++ bad->len = len; ++ } else { ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 1; ++ else ++ /* ++ * prev range will be split into two, beside the overwritten ++ * one, an extra slot needed from bad table. 
++ */ ++ *extra = 2; ++ } ++ ++ if ((bb->count + (*extra)) >= MAX_BADBLOCKS) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Do the overwrite from the range indicated by 'bad' to the bad range ++ * (from bad table) indexed by 'prev'. ++ * The previously called can_front_overwrite() will provide how many ++ * extra bad range(s) might be split and added into the bad table. All ++ * the splitting cases in the bad table will be handled here. ++ */ ++static int front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int extra) ++{ ++ u64 *p = bb->page; ++ sector_t orig_end = BB_END(p[prev]); ++ int orig_ack = BB_ACK(p[prev]); ++ int n = extra; ++ ++ switch (extra) { ++ case 0: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]), ++ bad->ack); ++ break; ++ case 1: ++ if (BB_OFFSET(p[prev]) == bad->start) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->len, bad->ack); ++ memmove(p + prev + 2, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start + bad->len, ++ orig_end - BB_END(p[prev]), ++ orig_ack); ++ } else { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ memmove(p + prev + 1 + n, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ } ++ break; ++ case 2: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ memmove(p + prev + 1 + n, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]), ++ orig_end - BB_END(p[prev + 1]), ++ BB_ACK(p[prev])); ++ break; ++ default: ++ break; ++ } ++ ++ return bad->len; ++} ++ ++/* ++ * Explicitly insert a range indicated by 'bad' to the bad table, where ++ * the location is indexed by 'at'. ++ */ ++static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(badblocks_full(bb)); ++ ++ len = min_t(sector_t, sectors, BB_MAX_LEN); ++ if (at < bb->count) ++ memmove(p + at + 1, p + at, (bb->count - at) * 8); ++ p[at] = BB_MAKE(s, len, ack); ++ ++ return len; ++} ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v4/backup/0003-badblocks-improvement-badblocks_set-for-multiple-ran.patch b/for-test/badblocks/v4/backup/0003-badblocks-improvement-badblocks_set-for-multiple-ran.patch new file mode 100644 index 0000000..31a7639 --- /dev/null +++ b/for-test/badblocks/v4/backup/0003-badblocks-improvement-badblocks_set-for-multiple-ran.patch @@ -0,0 +1,662 @@ +From b3bbd59d07b131df82410b615ed13a7c439bbd32 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 18:36:09 +0800 +Subject: [PATCH 3/6] badblocks: improvement badblocks_set() for multiple + ranges handling + +Recently I received a bug report that current badblocks code does not +properly handle multiple ranges. For example, + badblocks_set(bb, 32, 1, true); + badblocks_set(bb, 34, 1, true); + badblocks_set(bb, 36, 1, true); + badblocks_set(bb, 32, 12, true); +Then indeed badblocks_show() reports, + 32 3 + 36 1 +But the expected bad blocks table should be, + 32 12 +Obviously only the first 2 ranges are merged and badblocks_set() returns +and ignores the rest setting range. 
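The expected behaviour can be modelled in a few lines of user-space C. The
model_set() helper below is purely illustrative (it ignores acknowledgement
state, BB_MAX_LEN and the table-full case); it only demonstrates that
overlapping and adjacent set requests should collapse into one range:

  #include <stdio.h>

  static unsigned long long bb_start[16], bb_len[16];
  static int bb_count;

  /* naive model: absorb every stored range that overlaps or touches [s, s+l) */
  static void model_set(unsigned long long s, unsigned long long l)
  {
          unsigned long long e = s + l;
          int i, j;

          for (i = 0; i < bb_count; ) {
                  if (bb_start[i] <= e && bb_start[i] + bb_len[i] >= s) {
                          if (bb_start[i] < s)
                                  s = bb_start[i];
                          if (bb_start[i] + bb_len[i] > e)
                                  e = bb_start[i] + bb_len[i];
                          for (j = i; j < bb_count - 1; j++) {
                                  bb_start[j] = bb_start[j + 1];
                                  bb_len[j] = bb_len[j + 1];
                          }
                          bb_count--;
                  } else {
                          i++;
                  }
          }
          bb_start[bb_count] = s;
          bb_len[bb_count] = e - s;
          bb_count++;
  }

  int main(void)
  {
          int i;

          model_set(32, 1);
          model_set(34, 1);
          model_set(36, 1);
          model_set(32, 12);

          for (i = 0; i < bb_count; i++)
                  printf("%llu %llu\n", bb_start[i], bb_len[i]);
          return 0;
  }

The model prints the single line "32 12", which is what badblocks_show()
should report after the four calls above, while the current kernel code
stops after merging only the first two ranges.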
+ +This behavior is improper, if the caller of badblocks_set() wants to set +a range of blocks into bad blocks table, all of the blocks in the range +should be handled even the previous part encountering failure. + +The desired way to set bad blocks range by badblocks_set() is, +- Set as many as blocks in the setting range into bad blocks table. +- Merge the bad blocks ranges and occupy as less as slots in the bad + blocks table. +- Fast. + +Indeed the above proposal is complicated, especially with the following +restrictions, +- The setting bad blocks range can be ackknowledged or not acknowledged. +- The bad blocks table size is limited. +- Memory allocation should be avoided. + +The basic idea of the patch is to categorize all possible bad blocks +range setting combinationsinto to much less simplified and more less +special conditions. Inside badblocks_set() there is an implicit loop +composed by jumping between labels 're_insert' and 'update_sectors'. No +matter how large the setting bad blocks range is, in every loop just a +minimized range from the head is handled by a pre-defined behavior from +one of the categorized conditions. The logic is simple and code flow is +manageable. + +The different relative layout between the setting range and existing bad +block range are checked and handled (merge, combine, overwrite, insert) +by the helpers in previous patch. This patch is to make all the helpers +work together with the above idea. + +This patch only has the algorithm improvement for badblocks_set(). There +are following patches contain improvement for badblocks_clear() and +badblocks_check(). But the algorithm in badblocks_set() is fundamental +and typical, other improvement in clear and check routines are based on +all the helpers and ideas in this patch. + +In order to make the change to be more clear for code review, this patch +does not directly modify existing badblocks_set(), and just add a new +one named _badblocks_set(). Later patch will remove current existing +badblocks_set() code and make it as a wrapper of _badblocks_set(). So +the new added change won't be mixed with deleted code, the code review +can be easier. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 561 ++++++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 541 insertions(+), 20 deletions(-) + +diff --git a/block/badblocks.c b/block/badblocks.c +index e85a7cd23aad..95dceed0da3c 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -16,6 +16,322 @@ + #include <linux/types.h> + #include <linux/slab.h> + ++/* ++ * The purpose of badblocks set/clear is to manage bad blocks ranges which are ++ * identified by LBA addresses. ++ * ++ * When the caller of badblocks_set() wants to set a range of bad blocks, the ++ * setting range can be acked or unacked. And the setting range may merge, ++ * overwrite, skip the overlaypped already set range, depends on who they are ++ * overlapped or adjacent, and the acknowledgment type of the ranges. It can be ++ * more complicated when the setting range covers multiple already set bad block ++ * ranges, with restritctions of maximum length of each bad range and the bad ++ * table space limitation. 
++ * ++ * It is difficut and unnecessary to take care of all the possible situations, ++ * for setting a large range of bad blocks, we can handle it by dividing the ++ * large range into smaller ones when encounter overlap, max range length or ++ * bad table full conditions. Every time only a smaller piece of the bad range ++ * is handled with a limited number of conditions how it is interacted with ++ * possible overlapped or adjacent already set bad block ranges. Then the hard ++ * complicated problem can be much simpler to habndle in proper way. ++ * ++ * When setting a range of bad blocks to the bad table, the simplified situations ++ * to be considered are, (The already set bad blocks ranges are naming with ++ * prefix E, and the setting bad blocks range is naming with prefix S) ++ * ++ * 1) A setting range is not overlapped or adjacent to any other already set bad ++ * block range. ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ +-------------+ ++ * | E1 | | E2 | ++ * +-------------+ +-------------+ ++ * For this situation if the bad blocks table is not full, just allocate a ++ * free slot from the bad blocks table to mark the setting range S. The ++ * result is, ++ * +-------------+ +--------+ +-------------+ ++ * | E1 | | S | | E2 | ++ * +-------------+ +--------+ +-------------+ ++ * 2) A setting range starts exactly at a start LBA of an already set bad blocks ++ * range. ++ * 2.1) The setting range size < already set range size ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.1.2) If S is uncked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.3) If S is acked setting and E is unacked, range S can overwirte on E. ++ * An extra slot from the bad blocks table will be allocated for S, and head ++ * of E will move to end of the inserted range E. The result is, ++ * +--------+----+ ++ * | S | E | ++ * +--------+----+ ++ * 2.2) The setting range size == already set range size ++ * 2.2.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.2.2) If S is uncked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.2.3) If S is acked setting and E is unacked, range S can overwirte all of ++ bad blocks range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.3) The setting range size > already set range size ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For such situation, the setting range S can be treated as two parts, the ++ * first part (S1) is as same size as the already set range E, the second ++ * part (S2) is the rest of setting range. 
++ * +-------------+-----+ +-------------+ +-----+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +-------------+-----+ ===> +-------------+ +-----+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now we only focus on how to handle the setting range S1 and already set ++ * range E, which are already explained in 1.2), for the rest S2 it will be ++ * handled later in next loop. ++ * 3) A setting range starts before the start LBA of an already set bad blocks ++ * range. ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation, the setting range S can be divided into two parts, the ++ * first (S1) ends at the start LBA of already set range E, the second part ++ * (S2) starts exactly at a start LBA of the already set range E. ++ * +----+---------+ +----+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +----+---------+ ===> +----+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now only the first part S1 should be handled in this loop, which is in ++ * similar condition as 1). The rest part S2 has exact same start LBA address ++ * of the already set range E, they will be handled in next loop in one of ++ * situations in 2). ++ * 4) A setting range starts after the start LBA of an already set bad blocks ++ * range. ++ * 4.1) If the setting range S exactly matches the tail part of already set bad ++ * blocks range E, like the following chart shows, ++ * +---------+ ++ * | S | ++ * +---------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.1) If range S and E have same ackknowledg value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 4.1.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may ++ * overwrite the overlapped range of E, the result is, ++ * +---+---------+ ++ * | E | S | ++ * +---+---------+ ++ * 4.2) If the setting range S stays in middle of an already set range E, like ++ * the following chart shows, ++ * +----+ ++ * | S | ++ * +----+ ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.1) If range S and E have same ackknowledg value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 4.2.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is also, ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will ++ * inserted into middle of E and split previous range E into twp parts (E1 ++ * and E2), the result is, ++ * +----+----+----+ ++ * | E1 | S | E2 | ++ * +----+----+----+ ++ * 4.3) If the setting bad blocks range S is overlapped with an already set bad ++ * blocks range E. The range S starts after the start LBA of range E, and ++ * ends after the end LBA of range E, as the following chart shows, ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation the range S can be divided into two parts, the first ++ * part (S1) ends at end range E, and the second part (S2) has rest range of ++ * origin S. 
++ * +---------+---------+ +---------+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +---------+---------+ ===> +---------+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now in this loop the setting range S1 and already set range E can be ++ * handled as the situations 4), the rest range S2 will be handled in next ++ * loop and ignored in this loop. ++ * 5) A setting bad blocks range S is adjacent to one or more already set bad ++ * blocks range(s), and they are all acked or unacked range. ++ * 5.1) Front merge: If the already set bad blocks range E is before setting ++ * range S and they are adjacent, ++ * +------+ ++ * | S | ++ * +------+ ++ * +-------+ ++ * | E | ++ * +-------+ ++ * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge ++ * values are same, the setting range S can front merges into range E. The ++ * result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting ++ * range S right after already set range E into the bad blocks table. The ++ * result is, ++ * +--------+------+ ++ * | E | S | ++ * +--------+------+ ++ * 6) Special cases which above conditions cannot handle ++ * 6.1) Multiple already set ranges may merge into less ones in a full bad table ++ * +-------------------------------------------------------+ ++ * | S | ++ * +-------------------------------------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+ +-----+ +-----+ ++ * | E1 | | E2 | | E3 | ++ * +-----+ +-----+ +-----+ ++ * In the above example, when the bad blocks table is full, inserting the ++ * first part of setting range S will fail because no more available slot ++ * can be allocated from bad blocks table. In this situation a proper ++ * setting method should be go though all the setting bad blocks range and ++ * look for chance to merge already set ranges into less ones. When there ++ * is available slot from bad blocks table, re-try again to handle more ++ * setting bad blocks ranges as many as possible. ++ * +------------------------+ ++ * | S3 | ++ * +------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+-----+-----+---+-----+--+ ++ * | S1 | S2 | ++ * +-----+-----+-----+---+-----+--+ ++ * The above chart shows although the first part (S3) cannot be inserted due ++ * to no-space in bad blocks table, but the following E1, E2 and E3 ranges ++ * can be merged with rest part of S into less range S1 and S2. Now there is ++ * 1 free slot in bad blocks table. ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * | S3 | S1 | S2 | ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * Since the bad blocks table is not full anymore, re-try again for the ++ * origin setting range S. Now the setting range S3 can be inserted into the ++ * bad blocks table with previous freed slot from multiple ranges merge. ++ * 6.2) Front merge after overwrite ++ * In the following example, in bad blocks table, E1 is an acked bad blocks ++ * range and E2 is an unacked bad blocks range, therefore they are not able ++ * to merge into a larger range. The setting bad blocks range S is acked, ++ * therefore part of E2 can be overwritten by S. 
++ * +--------+ ++ * | S | acknowledged ++ * +--------+ S: 1 ++ * +-------+-------------+ E1: 1 ++ * | E1 | E2 | E2: 0 ++ * +-------+-------------+ ++ * With previosu simplified routines, after overwiting part of E2 with S, ++ * the bad blocks table should be (E3 is remaining part of E2 which is not ++ * overwritten by S), ++ * acknowledged ++ * +-------+--------+----+ S: 1 ++ * | E1 | S | E3 | E1: 1 ++ * +-------+--------+----+ E3: 0 ++ * The above result is correct but not perfect. Range E1 and S in the bad ++ * blocks table are all acked, merging them into a larger one range may ++ * occupy less bad blocks table space and make badblocks_check() faster. ++ * Therefore in such situation, after overwiting range S, the previous range ++ * E1 should be checked for possible front combination. Then the ideal ++ * result can be, ++ * +----------------+----+ acknowledged ++ * | E1 | E3 | E1: 1 ++ * +----------------+----+ E3: 0 ++ * 6.3) Behind merge: If the already set bad blocks range E is behind the setting ++ * range S and they are adjacent. Normally we don't need to care about this ++ * because front merge handles this while going though range S from head to ++ * tail, except for the tail part of range S. When the setting range S are ++ * fully handled, all the above simplified routine doesn't check whether the ++ * tail LBA of range S is adjacent to the next already set range and not able ++ * to them if they are mergeable. ++ * +------+ ++ * | S | ++ * +------+ ++ * +-------+ ++ * | E | ++ * +-------+ ++ * For the above special stiuation, when the setting range S are all handled ++ * and the loop ends, an extra check is necessary for whether next already ++ * set range E is right after S and mergeable. ++ * 6.2.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge ++ * values are same, the setting range S can behind merges into range E. The ++ * result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 6.2.2) Otherwise these two ranges cannot merge, just insert the setting range ++ * S infront of the already set range E in the bad blocks table. The result ++ * is, ++ * +------+-------+ ++ * | S | E | ++ * +------+-------+ ++ * ++ * All the above 5 simplified situations and 3 special cases may cover 99%+ of ++ * the bad block range setting conditions. Maybe there is some rare corner case ++ * is not considered and optimized, it won't hurt if badblocks_set() fails due ++ * to no space, or some ranges are not merged to save bad blocks table space. ++ * ++ * Inside badblocks_set() each loop starts by jumping to re_insert label, every ++ * time for the new loop prev_badblocks() is called to find an already set range ++ * which starts before or at current setting range. Since the setting bad blocks ++ * range is handled from head to tail, most of the cases it is unnecessary to do ++ * the binary search inside prev_badblocks(), it is possible to provide a hint ++ * to prev_badblocks() for a fast path, then the expensive binary search can be ++ * avoided. In my test with the hint to prev_badblocks(), except for the first ++ * loop, all rested calls to prev_badblocks() can go into the fast path and ++ * return correct bad blocks table index immediately. ++ */ ++ + /* + * Find the range starts at-or-before 's' from bad table. 
The search + * starts from index 'hint' and stops at index 'hint_end' from the bad +@@ -390,6 +706,231 @@ static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad + return len; + } + ++static void badblocks_update_acked(struct badblocks *bb) ++{ ++ u64 *p = bb->page; ++ int i; ++ bool unacked = false; ++ ++ if (!bb->unacked_exist) ++ return; ++ ++ for (i = 0; i < bb->count ; i++) { ++ if (!BB_ACK(p[i])) { ++ unacked = true; ++ break; ++ } ++ } ++ ++ if (!unacked) ++ bb->unacked_exist = 0; ++} ++ ++/* Do exact work to set bad block range into the bad block table */ ++static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, ++ int acknowledged) ++{ ++ u64 *p; ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ int len = 0, added = 0; ++ int retried = 0, space_desired = 0; ++ int rv = 0; ++ unsigned long flags; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ /* round the start down, and the end up */ ++ sector_t next = s + sectors; ++ ++ rounddown(s, bb->shift); ++ roundup(next, bb->shift); ++ sectors = next - s; ++ } ++ ++ write_seqlock_irqsave(&bb->lock, flags); ++ ++ bad.orig_start = s; ++ bad.orig_len = sectors; ++ bad.ack = acknowledged; ++ p = bb->page; ++ ++re_insert: ++ bad.start = s; ++ bad.len = sectors; ++ len = 0; ++ ++ if (badblocks_empty(bb)) { ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start before all badblocks */ ++ if (prev < 0) { ++ if (!badblocks_full(bb)) { ++ /* insert on the first */ ++ if (bad.len > (BB_OFFSET(p[0]) - bad.start)) ++ bad.len = BB_OFFSET(p[0]) - bad.start; ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* No sapce, try to merge */ ++ if (overlap_behind(bb, &bad, 0)) { ++ if (can_merge_behind(bb, &bad, 0)) { ++ len = behind_merge(bb, &bad, 0); ++ added++; ++ } else { ++ len = min_t(sector_t, ++ BB_OFFSET(p[0]) - s, sectors); ++ space_desired = 1; ++ } ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* no table space and give up */ ++ goto out; ++ } ++ ++ /* in case p[prev-1] can be merged with p[prev] */ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ hint = prev; ++ } else { ++ int extra = 0; ++ ++ if (!can_front_overwrite(bb, prev, &bad, &extra)) { ++ len = min_t(sector_t, ++ BB_END(p[prev]) - s, sectors); ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ len = front_overwrite(bb, prev, &bad, extra); ++ added++; ++ bb->count += extra; ++ hint = prev; ++ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ hint = prev - 1; ++ } ++ } ++ goto update_sectors; ++ } ++ ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ /* if no space in table, still try to merge in the covered range */ ++ if (badblocks_full(bb)) { ++ /* skip the cannot-merge range */ ++ if (((prev + 1) < bb->count) && ++ overlap_behind(bb, &bad, prev + 1) && ++ ((s + sectors) >= BB_END(p[prev + 1]))) { ++ len = BB_END(p[prev + 1]) - s; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* no retry any 
more */ ++ len = sectors; ++ space_desired = 1; ++ hint = -1; ++ goto update_sectors; ++ } ++ ++ /* cannot merge and there is space in bad table */ ++ if ((prev + 1) < bb->count && ++ overlap_behind(bb, &bad, prev + 1)) ++ bad.len = min_t(sector_t, ++ bad.len, BB_OFFSET(p[prev + 1]) - bad.start); ++ ++ len = insert_at(bb, prev + 1, &bad); ++ bb->count++; ++ added++; ++ hint = prev + 1; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_insert; ++ ++ WARN_ON(sectors < 0); ++ ++ /* Check whether the following already set range can be merged */ ++ if ((prev + 1) < bb->count && ++ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && ++ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && ++ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), ++ BB_ACK(p[prev])); ++ ++ if ((prev + 2) < bb->count) ++ memmove(p + prev + 1, p + prev + 2, ++ (bb->count - (prev + 2)) * 8); ++ bb->count--; ++ } ++ ++ if (space_desired && !badblocks_full(bb)) { ++ s = bad.orig_start; ++ sectors = bad.orig_len; ++ space_desired = 0; ++ if (retried++ < 3) ++ goto re_insert; ++ } ++ ++out: ++ if (added) { ++ set_changed(bb); ++ ++ if (!acknowledged) ++ bb->unacked_exist = 1; ++ else ++ badblocks_update_acked(bb); ++ } ++ ++ write_sequnlock_irqrestore(&bb->lock, flags); ++ ++ if (!added) ++ rv = 1; ++ ++ return rv; ++} ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +@@ -499,26 +1040,6 @@ int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + } + EXPORT_SYMBOL_GPL(badblocks_check); + +-static void badblocks_update_acked(struct badblocks *bb) +-{ +- u64 *p = bb->page; +- int i; +- bool unacked = false; +- +- if (!bb->unacked_exist) +- return; +- +- for (i = 0; i < bb->count ; i++) { +- if (!BB_ACK(p[i])) { +- unacked = true; +- break; +- } +- } +- +- if (!unacked) +- bb->unacked_exist = 0; +-} +- + /** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v4/backup/0004-badblocks-improve-badblocks_clear-for-multiple-range.patch b/for-test/badblocks/v4/backup/0004-badblocks-improve-badblocks_clear-for-multiple-range.patch new file mode 100644 index 0000000..4cbfd5e --- /dev/null +++ b/for-test/badblocks/v4/backup/0004-badblocks-improve-badblocks_clear-for-multiple-range.patch @@ -0,0 +1,401 @@ +From b75e0792f127a99f068d635421ffac52843b488c Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 22:16:10 +0800 +Subject: [PATCH 4/6] badblocks: improve badblocks_clear() for multiple ranges + handling + +With the foundamental ideas and helper routines from badblocks_set() +improvement, clearing bad block for multiple ranges is much simpler. + +With a similar idea from badblocks_set() improvement, this patch +simplifies bad block range clearing into 5 situations. No matter how +complicated the clearing condition is, we just look at the head part +of clearing range with relative already set bad block range from the +bad block table. The rested part will be handled in next run of the +while-loop. + +Based on existing helpers added from badblocks_set(), this patch adds +two more helpers, +- front_clear() + Clear the bad block range from bad block table which is front + overlapped with the clearing range. 
+- front_splitting_clear() + Handle the condition that the clearing range hits middle of an + already set bad block range from bad block table. + +Similar as badblocks_set(), the first part of clearing range is handled +with relative bad block range which is find by prev_badblocks(). In most +cases a valid hint is provided to prev_badblocks() to avoid unnecessary +bad block table iteration. + +This patch also explains the detail algorithm code comments at beginning +of badblocks.c, including which five simplified situations are categried +and how all the bad block range clearing conditions are handled by these +five situations. + +Again, in order to make the code review easier and avoid the code +changes mixed together, this patch does not modify badblock_clear() and +implement another routine called _badblock_clear() for the improvement. +Later patch will delete current code of badblock_clear() and make it as +a wrapper to _badblock_clear(), so the code change can be much clear for +review. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 327 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 327 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index 95dceed0da3c..b9a4cd64b840 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -330,6 +330,123 @@ + * avoided. In my test with the hint to prev_badblocks(), except for the first + * loop, all rested calls to prev_badblocks() can go into the fast path and + * return correct bad blocks table index immediately. ++ * ++ * ++ * Clearing a bad blocks range from the bad block table has similar idea as ++ * setting does, but much more simpler. The only thing needs to be noticed is ++ * when the clearning range hits middle of a bad block range, the existing bad ++ * block range will split into two, and one more item should be added into the ++ * bad block table. The simplified situations to beconsidered are, (The already ++ * set bad blocks ranges in bad block table are naming with prefix E, and the ++ * clearing bad blocks range is naming with prefix C) ++ * ++ * 1) A clearing range is not overlapped to any already set ranges in bad block ++ * table. ++ * +-----+ | +-----+ | +-----+ ++ * | C | | | C | | | C | ++ * +-----+ or +-----+ or +-----+ ++ * +---+ | +----+ +----+ | +---+ ++ * | E | | | E1 | | E2 | | | E | ++ * +---+ | +----+ +----+ | +---+ ++ * For the above situations, no bad block to be cleared and no failure ++ * happens, simply returns 0. ++ * 2) The clearing range hits middle of an already setting bad blocks range in ++ * the bad block table. ++ * +---+ ++ * | C | ++ * +---+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * In this situation if the bad block table is not full, the range E will be ++ * split into two ranges E1 and E2. The result is, ++ * +------+ +------+ ++ * | E1 | | E2 | ++ * +------+ +------+ ++ * 3) The clearing range starts exactly at same LBA as an already set bad block range ++ * from the bad block table. ++ * 3.1) Partially covered at head part ++ * +------------+ ++ * | C | ++ * +------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation, the overlapped already set range will update the ++ * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). 
No ++ * item deleted from bad block table. The result is, ++ * +----+ ++ * | E1 | ++ * +----+ ++ * 3.2) Exact fully covered ++ * +-----------------+ ++ * | C | ++ * +-----------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation the whole bad blocks range E will be cleared and its ++ * corresponded item is deleted from the bad block table. ++ * 4) The clearing range exactly ends at same LBA as an already set bad block ++ * range. ++ * +-------+ ++ * | C | ++ * +-------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For the above situation, the already set range E is updated to shrink its ++ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C). ++ * The result is, ++ * +---------+ ++ * | E | ++ * +---------+ ++ * 5) The clearing range is partially overlapped with an already set bad block ++ * range from the bad block table. ++ * 5.1) The already set bad block range is front overlapped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. The ++ * first part ends at the start LBA of range E, and the second part starts at ++ * same LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part C1 can be handled as condition 1), and the second part C2 can be ++ * handled as condition 3.1) in next loop. ++ * 5.2) The already set bad block range is behind overlaopped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. The ++ * first part C1 ends at same end LBA of range E, and the second part starts ++ * at end LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part clearing range C1 can be handled as condition 4), and ++ * the second part clearing range C2 can be handled as condition 1) in next ++ * loop. ++ * ++ * All bad blocks range clearing can be simplified into the above 5 situations ++ * by only handling the head part of the clearing range in each run of the ++ * while-loop. The idea is similar to bad blocks range setting but much ++ * simpler. + */ + + /* +@@ -931,6 +1048,216 @@ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, + return rv; + } + ++/* ++ * Clear the bad block range from bad block table which is front overlapped ++ * with the clearing range. The return value is how many sectors from an ++ * already set bad block range are cleared. If the whole bad block range is ++ * covered by the clearing range and fully cleared, 'delete' is set as 1 for ++ * the caller to reduce bb->count. 
++ */ ++static int front_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *deleted) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int cleared = 0; ++ ++ *deleted = 0; ++ if (s == BB_OFFSET(p[prev])) { ++ if (BB_LEN(p[prev]) > sectors) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors, ++ BB_LEN(p[prev]) - sectors, ++ BB_ACK(p[prev])); ++ cleared = sectors; ++ } else { ++ /* BB_LEN(p[prev]) <= sectors */ ++ cleared = BB_LEN(p[prev]); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ *deleted = 1; ++ } ++ } else if (s > BB_OFFSET(p[prev])) { ++ if (BB_END(p[prev]) <= (s + sectors)) { ++ cleared = BB_END(p[prev]) - s; ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ } else { ++ /* Splitting is handled in front_splitting_clear() */ ++ BUG(); ++ } ++ } ++ ++ return cleared; ++} ++ ++/* ++ * Handle the condition that the clearing range hits middle of an already set ++ * bad block range from bad block table. In this condition the existing bad ++ * block range is split into two after the middle part is cleared. ++ */ ++static int front_splitting_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ u64 end = BB_END(p[prev]); ++ int ack = BB_ACK(p[prev]); ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ ack); ++ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack); ++ return sectors; ++} ++ ++/* Do the exact work to clear bad block range from the bad block table */ ++static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) ++{ ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ int len = 0, cleared = 0; ++ int rv = 0; ++ u64 *p; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ sector_t target; ++ ++ /* When clearing we round the start up and the end down. ++ * This should not matter as the shift should align with ++ * the block size and no rounding should ever be needed. ++ * However it is better the think a block is bad when it ++ * isn't than to think a block is not bad when it is. 
++ */ ++ target = s + sectors; ++ roundup(s, bb->shift); ++ rounddown(target, bb->shift); ++ sectors = target - s; ++ } ++ ++ write_seqlock_irq(&bb->lock); ++ ++ bad.orig_start = s; ++ bad.orig_len = sectors; ++ bad.ack = true; ++ p = bb->page; ++ ++re_clear: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* Start before all badblocks */ ++ if (prev < 0) { ++ if (overlap_behind(bb, &bad, 0)) { ++ len = BB_OFFSET(p[0]) - s; ++ hint = prev; ++ } else { ++ len = sectors; ++ } ++ /* ++ * Both situations are to clear non-bad range, ++ * should be treated as successful ++ */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Clear will split a bad record but the table is full */ ++ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + sectors))) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if ((BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + bad.len))) { ++ /* Splitting */ ++ if ((bb->count + 1) < MAX_BADBLOCKS) { ++ len = front_splitting_clear(bb, prev, &bad); ++ bb->count += 1; ++ cleared++; ++ } else { ++ /* No space to split, give up */ ++ len = sectors; ++ } ++ } else { ++ int deleted = 0; ++ ++ len = front_clear(bb, prev, &bad, &deleted); ++ bb->count -= deleted; ++ cleared++; ++ hint = prev; ++ } ++ ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Not cover any badblocks range in the table */ ++ len = sectors; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_clear; ++ ++ WARN_ON(sectors < 0); ++ ++ if (cleared) { ++ badblocks_update_acked(bb); ++ set_changed(bb); ++ } ++ ++ write_sequnlock_irq(&bb->lock); ++ ++ if (!cleared) ++ rv = 1; ++ ++ return rv; ++} ++ ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v4/backup/0005-badblocks-improve-badblocks_check-for-multiple-range.patch b/for-test/badblocks/v4/backup/0005-badblocks-improve-badblocks_check-for-multiple-range.patch new file mode 100644 index 0000000..6be1249 --- /dev/null +++ b/for-test/badblocks/v4/backup/0005-badblocks-improve-badblocks_check-for-multiple-range.patch @@ -0,0 +1,177 @@ +From 09092ea11f2a8d319ac57865031190f153d159ae Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 2 Mar 2021 09:27:06 +0800 +Subject: [PATCH 5/6] badblocks: improve badblocks_check() for multiple ranges + handling + +This patch rewrites badblocks_check() with similar coding style as +_badblocks_set() and _badblocks_clear(). The only difference is bad +blocks checking may handle multiple ranges in bad tables now. 
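+
+From the caller's point of view nothing changes. As a quick
+illustration (this sketch is not part of the patch; bb, io_start,
+io_sectors and the two handle_*() helpers are made-up names), the
+return value contract stays as before: 0 when the checking range hits
+no bad range, 1 when it only hits acknowledged bad ranges, and -1 when
+it hits at least one unacknowledged bad range, with the first covered
+bad range reported through first_bad and bad_sectors,
+
+    sector_t first_bad;
+    int bad_sectors;
+
+    switch (badblocks_check(bb, io_start, io_sectors,
+                            &first_bad, &bad_sectors)) {
+    case 0:     /* the whole range is clean */
+        break;
+    case 1:     /* only acknowledged bad range(s) are covered */
+        handle_acked_badblocks(first_bad, bad_sectors);
+        break;
+    case -1:    /* at least one unacknowledged bad range is covered */
+        handle_unacked_badblocks(first_bad, bad_sectors);
+        break;
+    }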
+ +If a checking range covers multiple bad blocks range in bad block table, +like the following condition (C is the checking range, E1, E2, E3 are +three bad block ranges in bad block table), + +------------------------------------+ + | C | + +------------------------------------+ + +----+ +----+ +----+ + | E1 | | E2 | | E3 | + +----+ +----+ +----+ +The improved badblocks_check() algorithm will divid checking range C +into multiple parts, and handle them in 7 runs of a while-loop, + +--+ +----+ +----+ +----+ +----+ +----+ +----+ + |C1| | C2 | | C3 | | C4 | | C5 | | C6 | | C7 | + +--+ +----+ +----+ +----+ +----+ +----+ +----+ + +----+ +----+ +----+ + | E1 | | E2 | | E3 | + +----+ +----+ +----+ +And the start LBA and length of range E1 will be set as first_bad and +bad_sectors for the caller. + +The return value rule is consistent for multiple ranges. For example if +there are following bad block ranges in bad block table, + Index No. Start Len Ack + 0 400 20 1 + 1 500 50 1 + 2 650 20 0 +the return value, first_bad, bad_sectors by calling badblocks_set() with +different checking range can be the following values, + Checking Start, Len Return Value first_bad bad_sectors + 100, 100 0 N/A N/A + 100, 310 1 400 10 + 100, 440 1 400 10 + 100, 540 1 400 10 + 100, 600 -1 400 10 + 100, 800 -1 400 10 + +In order to make code review easier, this patch names the improved bad +block range checking routine as _badblocks_check() and does not change +existing badblock_check() code yet. Later patch will delete old code of +badblocks_check() and make it as a wrapper to call _badblocks_check(). +Then the new added code won't mess up with the old deleted code, it will +be more clear and easier for code review. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 99 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index b9a4cd64b840..5a1ac35b924a 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -1257,6 +1257,105 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) + return rv; + } + ++/* Do the exact work to check bad blocks range from the bad block table */ ++static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, ++ sector_t *first_bad, int *bad_sectors) ++{ ++ int unacked_badblocks, acked_badblocks; ++ int prev = -1, hint = -1, set = 0; ++ struct badblocks_context bad; ++ unsigned int seq; ++ int len, rv; ++ u64 *p; ++ ++ WARN_ON(bb->shift < 0 || sectors == 0); ++ ++ if (bb->shift > 0) { ++ sector_t target; ++ ++ /* round the start down, and the end up */ ++ target = s + sectors; ++ rounddown(s, bb->shift); ++ roundup(target, bb->shift); ++ sectors = target - s; ++ } ++ ++retry: ++ seq = read_seqbegin(&bb->lock); ++ ++ bad.orig_start = s; ++ bad.orig_len = sectors; ++ p = bb->page; ++ unacked_badblocks = 0; ++ acked_badblocks = 0; ++ ++re_check: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if 
(BB_ACK(p[prev])) ++ acked_badblocks++; ++ else ++ unacked_badblocks++; ++ ++ if (BB_END(p[prev]) >= (s + sectors)) ++ len = sectors; ++ else ++ len = BB_END(p[prev]) - s; ++ ++ if (set == 0) { ++ *first_bad = BB_OFFSET(p[prev]); ++ *bad_sectors = BB_LEN(p[prev]); ++ set = 1; ++ } ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* not cover any badblocks range in the table */ ++ len = sectors; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_check; ++ ++ WARN_ON(sectors < 0); ++ ++ if (unacked_badblocks > 0) ++ rv = -1; ++ else if (acked_badblocks > 0) ++ rv = 1; ++ else ++ rv = 0; ++ ++ if (read_seqretry(&bb->lock, seq)) ++ goto retry; ++ ++ return rv; ++} + + /** + * badblocks_check() - check a given range for bad sectors +-- +2.31.1 + diff --git a/for-test/badblocks/v4/backup/0006-badblocks-switch-to-the-improved-badblock-handling-c.patch b/for-test/badblocks/v4/backup/0006-badblocks-switch-to-the-improved-badblock-handling-c.patch new file mode 100644 index 0000000..6d07398 --- /dev/null +++ b/for-test/badblocks/v4/backup/0006-badblocks-switch-to-the-improved-badblock-handling-c.patch @@ -0,0 +1,364 @@ +From f81bac5e10aa50c8245c605c363f7d4de21e318a Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 2 Mar 2021 10:48:43 +0800 +Subject: [PATCH 6/6] badblocks: switch to the improved badblock handling code + +This patch removes old code of badblocks_set(), badblocks_clear() and +badblocks_check(), and make them as wrappers to call _badblocks_set(), +_badblocks_clear() and _badblocks_check(). + +By this change now the badblock handing switch to the improved algorithm +in _badblocks_set(), _badblocks_clear() and _badblocks_check(). + +This patch only contains the changes of old code deletion, new added +code for the improved algorithms are in previous patches. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 310 +--------------------------------------------- + 1 file changed, 3 insertions(+), 307 deletions(-) + +diff --git a/block/badblocks.c b/block/badblocks.c +index 5a1ac35b924a..5ab03cfdc0b7 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -1394,75 +1394,7 @@ static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, + int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) + { +- int hi; +- int lo; +- u64 *p = bb->page; +- int rv; +- sector_t target = s + sectors; +- unsigned seq; +- +- if (bb->shift > 0) { +- /* round the start down, and the end up */ +- s >>= bb->shift; +- target += (1<<bb->shift) - 1; +- target >>= bb->shift; +- sectors = target - s; +- } +- /* 'target' is now the first block after the bad range */ +- +-retry: +- seq = read_seqbegin(&bb->lock); +- lo = 0; +- rv = 0; +- hi = bb->count; +- +- /* Binary search between lo and hi for 'target' +- * i.e. for the last range that starts before 'target' +- */ +- /* INVARIANT: ranges before 'lo' and at-or-after 'hi' +- * are known not to be the last range before target. 
+- * VARIANT: hi-lo is the number of possible +- * ranges, and decreases until it reaches 1 +- */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a < target) +- /* This could still be the one, earlier ranges +- * could not. +- */ +- lo = mid; +- else +- /* This and later ranges are definitely out. */ +- hi = mid; +- } +- /* 'lo' might be the last that started before target, but 'hi' isn't */ +- if (hi > lo) { +- /* need to check all range that end after 's' to see if +- * any are unacknowledged. +- */ +- while (lo >= 0 && +- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { +- if (BB_OFFSET(p[lo]) < target) { +- /* starts before the end, and finishes after +- * the start, so they must overlap +- */ +- if (rv != -1 && BB_ACK(p[lo])) +- rv = 1; +- else +- rv = -1; +- *first_bad = BB_OFFSET(p[lo]); +- *bad_sectors = BB_LEN(p[lo]); +- } +- lo--; +- } +- } +- +- if (read_seqretry(&bb->lock, seq)) +- goto retry; +- +- return rv; ++ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); + } + EXPORT_SYMBOL_GPL(badblocks_check); + +@@ -1484,154 +1416,7 @@ EXPORT_SYMBOL_GPL(badblocks_check); + int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) + { +- u64 *p; +- int lo, hi; +- int rv = 0; +- unsigned long flags; +- +- if (bb->shift < 0) +- /* badblocks are disabled */ +- return 1; +- +- if (bb->shift) { +- /* round the start down, and the end up */ +- sector_t next = s + sectors; +- +- s >>= bb->shift; +- next += (1<<bb->shift) - 1; +- next >>= bb->shift; +- sectors = next - s; +- } +- +- write_seqlock_irqsave(&bb->lock, flags); +- +- p = bb->page; +- lo = 0; +- hi = bb->count; +- /* Find the last range that starts at-or-before 's' */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a <= s) +- lo = mid; +- else +- hi = mid; +- } +- if (hi > lo && BB_OFFSET(p[lo]) > s) +- hi = lo; +- +- if (hi > lo) { +- /* we found a range that might merge with the start +- * of our new range +- */ +- sector_t a = BB_OFFSET(p[lo]); +- sector_t e = a + BB_LEN(p[lo]); +- int ack = BB_ACK(p[lo]); +- +- if (e >= s) { +- /* Yes, we can merge with a previous range */ +- if (s == a && s + sectors >= e) +- /* new range covers old */ +- ack = acknowledged; +- else +- ack = ack && acknowledged; +- +- if (e < s + sectors) +- e = s + sectors; +- if (e - a <= BB_MAX_LEN) { +- p[lo] = BB_MAKE(a, e-a, ack); +- s = e; +- } else { +- /* does not all fit in one range, +- * make p[lo] maximal +- */ +- if (BB_LEN(p[lo]) != BB_MAX_LEN) +- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); +- s = a + BB_MAX_LEN; +- } +- sectors = e - s; +- } +- } +- if (sectors && hi < bb->count) { +- /* 'hi' points to the first range that starts after 's'. 
+- * Maybe we can merge with the start of that range +- */ +- sector_t a = BB_OFFSET(p[hi]); +- sector_t e = a + BB_LEN(p[hi]); +- int ack = BB_ACK(p[hi]); +- +- if (a <= s + sectors) { +- /* merging is possible */ +- if (e <= s + sectors) { +- /* full overlap */ +- e = s + sectors; +- ack = acknowledged; +- } else +- ack = ack && acknowledged; +- +- a = s; +- if (e - a <= BB_MAX_LEN) { +- p[hi] = BB_MAKE(a, e-a, ack); +- s = e; +- } else { +- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); +- s = a + BB_MAX_LEN; +- } +- sectors = e - s; +- lo = hi; +- hi++; +- } +- } +- if (sectors == 0 && hi < bb->count) { +- /* we might be able to combine lo and hi */ +- /* Note: 's' is at the end of 'lo' */ +- sector_t a = BB_OFFSET(p[hi]); +- int lolen = BB_LEN(p[lo]); +- int hilen = BB_LEN(p[hi]); +- int newlen = lolen + hilen - (s - a); +- +- if (s >= a && newlen < BB_MAX_LEN) { +- /* yes, we can combine them */ +- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); +- +- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); +- memmove(p + hi, p + hi + 1, +- (bb->count - hi - 1) * 8); +- bb->count--; +- } +- } +- while (sectors) { +- /* didn't merge (it all). +- * Need to add a range just before 'hi' +- */ +- if (bb->count >= MAX_BADBLOCKS) { +- /* No room for more */ +- rv = 1; +- break; +- } else { +- int this_sectors = sectors; +- +- memmove(p + hi + 1, p + hi, +- (bb->count - hi) * 8); +- bb->count++; +- +- if (this_sectors > BB_MAX_LEN) +- this_sectors = BB_MAX_LEN; +- p[hi] = BB_MAKE(s, this_sectors, acknowledged); +- sectors -= this_sectors; +- s += this_sectors; +- } +- } +- +- bb->changed = 1; +- if (!acknowledged) +- bb->unacked_exist = 1; +- else +- badblocks_update_acked(bb); +- write_sequnlock_irqrestore(&bb->lock, flags); +- +- return rv; ++ return _badblocks_set(bb, s, sectors, acknowledged); + } + EXPORT_SYMBOL_GPL(badblocks_set); + +@@ -1651,96 +1436,7 @@ EXPORT_SYMBOL_GPL(badblocks_set); + */ + int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) + { +- u64 *p; +- int lo, hi; +- sector_t target = s + sectors; +- int rv = 0; +- +- if (bb->shift > 0) { +- /* When clearing we round the start up and the end down. +- * This should not matter as the shift should align with +- * the block size and no rounding should ever be needed. +- * However it is better the think a block is bad when it +- * isn't than to think a block is not bad when it is. +- */ +- s += (1<<bb->shift) - 1; +- s >>= bb->shift; +- target >>= bb->shift; +- sectors = target - s; +- } +- +- write_seqlock_irq(&bb->lock); +- +- p = bb->page; +- lo = 0; +- hi = bb->count; +- /* Find the last range that starts before 'target' */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a < target) +- lo = mid; +- else +- hi = mid; +- } +- if (hi > lo) { +- /* p[lo] is the last range that could overlap the +- * current range. Earlier ranges could also overlap, +- * but only this one can overlap the end of the range. 
+- */
+- if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
+- (BB_OFFSET(p[lo]) < target)) {
+- /* Partial overlap, leave the tail of this range */
+- int ack = BB_ACK(p[lo]);
+- sector_t a = BB_OFFSET(p[lo]);
+- sector_t end = a + BB_LEN(p[lo]);
+-
+- if (a < s) {
+- /* we need to split this range */
+- if (bb->count >= MAX_BADBLOCKS) {
+- rv = -ENOSPC;
+- goto out;
+- }
+- memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+- bb->count++;
+- p[lo] = BB_MAKE(a, s-a, ack);
+- lo++;
+- }
+- p[lo] = BB_MAKE(target, end - target, ack);
+- /* there is no longer an overlap */
+- hi = lo;
+- lo--;
+- }
+- while (lo >= 0 &&
+- (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
+- (BB_OFFSET(p[lo]) < target)) {
+- /* This range does overlap */
+- if (BB_OFFSET(p[lo]) < s) {
+- /* Keep the early parts of this range. */
+- int ack = BB_ACK(p[lo]);
+- sector_t start = BB_OFFSET(p[lo]);
+-
+- p[lo] = BB_MAKE(start, s - start, ack);
+- /* now low doesn't overlap, so.. */
+- break;
+- }
+- lo--;
+- }
+- /* 'lo' is strictly before, 'hi' is strictly after,
+- * anything between needs to be discarded
+- */
+- if (hi - lo > 1) {
+- memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
+- bb->count -= (hi - lo - 1);
+- }
+- }
+-
+- badblocks_update_acked(bb);
+- bb->changed = 1;
+-out:
+- write_sequnlock_irq(&bb->lock);
+- return rv;
++ return _badblocks_clear(bb, s, sectors);
+ }
+ EXPORT_SYMBOL_GPL(badblocks_clear);
+
+--
+2.31.1
+
diff --git a/for-test/badblocks/v4/v4-0000-cover-letter.patch b/for-test/badblocks/v4/v4-0000-cover-letter.patch
new file mode 100644
index 0000000..c02f896
--- /dev/null
+++ b/for-test/badblocks/v4/v4-0000-cover-letter.patch
@@ -0,0 +1,70 @@
+From 839dec5ce2a8e6fae537d8eaa5bc4c7ae89e8a49 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 2 Dec 2021 19:05:12 +0800
+Subject: [RESEND PATCH v4 0/6] badblocks improvement for multiple bad block ranges
+
+Hi Dan,
+
+This is the v4 effort to improve the badblocks code APIs to handle
+multiple ranges in the bad block table.
+
+Compared to the v3 series, the v4 series addresses code review comments
+from Geliang Tang,
+- Declare local variables in reverse Xmas tree order.
+- Drop orig_start and orig_len from struct badblocks_context.
+- Fix typos in code comments.
+- In badblocks_set(), avoid one unnecessary loop by setting variable
+  hint to prev (was prev - 1 in the v3 series).
+
+There is NO in-memory or on-disk format change in the whole series; all
+existing APIs and data structures stay as they are. This series only
+improves the code algorithm to handle more corner cases, and the
+interfaces remain the same and consistent for all existing callers (md
+raid and nvdimm drivers).
+
+The original motivation of the change is a requirement from our
+customer: the current badblocks routines don't handle multiple ranges.
+For example, if the bad block setting range covers multiple ranges from
+the bad block table, only the first two bad block ranges are merged and
+the remaining ranges are left intact. The expected behavior is that all
+the covered ranges are handled.
+
+All the patches are tested with modified user space code and the code
+logic works as expected. The modified user space testing code is
+provided in the last patch, as an example of how the improved code is
+tested.
+
+The whole change is divided into 6 patches to make the code review
+clearer and easier. If people prefer, I can post a single large patch
+once the code review is complete.
+
+Please review the code and respond. Thank you all in advance.
+ +Coly Li + +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Richard Fan <richard.fan@suse.com> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + +Coly Li (6): + badblocks: add more helper structure and routines in badblocks.h + badblocks: add helper routines for badblock ranges handling + badblocks: improvement badblocks_set() for multiple ranges handling + badblocks: improve badblocks_clear() for multiple ranges handling + badblocks: improve badblocks_check() for multiple ranges handling + badblocks: switch to the improved badblock handling code +Coly Li (1): + test: user space code to test badblocks APIs + + block/badblocks.c | 1602 ++++++++++++++++++++++++++++++------- + include/linux/badblocks.h | 30 + + 2 files changed, 1337 insertions(+), 295 deletions(-) + +-- +2.31.1 + diff --git a/for-test/badblocks/v4/v4-0001-badblocks-add-more-helper-structure-and-routines-.patch b/for-test/badblocks/v4/v4-0001-badblocks-add-more-helper-structure-and-routines-.patch new file mode 100644 index 0000000..f008556 --- /dev/null +++ b/for-test/badblocks/v4/v4-0001-badblocks-add-more-helper-structure-and-routines-.patch @@ -0,0 +1,91 @@ +From 4b3441cc612192914fdf57a8ae3f71479ff3793f Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 2 Dec 2021 15:29:38 +0800 +Subject: [PATCH v4 1/6] badblocks: add more helper structure and routines in + badblocks.h + +This patch adds the following helper structure and routines into +badblocks.h, +- struct badblocks_context + This structure is used in improved badblocks code for bad table + iteration. +- BB_END() + The macro to calculate end LBA of a bad range record from bad + table. +- badblocks_full() and badblocks_empty() + The inline routines to check whether bad table is full or empty. +- set_changed() and clear_changed() + The inline routines to set and clear 'changed' tag from struct + badblocks. + +These new helper structure and routines can help to make the code more +clear, they will be used in the improved badblocks code in following +patches. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + include/linux/badblocks.h | 30 ++++++++++++++++++++++++++++++ + 1 file changed, 30 insertions(+) + +diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h +index 2426276b9bd3..670f2dae692f 100644 +--- a/include/linux/badblocks.h ++++ b/include/linux/badblocks.h +@@ -15,6 +15,7 @@ + #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) + #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) + #define BB_ACK(x) (!!((x) & BB_ACK_MASK)) ++#define BB_END(x) (BB_OFFSET(x) + BB_LEN(x)) + #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) + + /* Bad block numbers are stored sorted in a single page. 
+@@ -41,6 +42,12 @@ struct badblocks { + sector_t size; /* in sectors */ + }; + ++struct badblocks_context { ++ sector_t start; ++ sector_t len; ++ int ack; ++}; ++ + int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors); + int badblocks_set(struct badblocks *bb, sector_t s, int sectors, +@@ -63,4 +70,27 @@ static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb) + } + badblocks_exit(bb); + } ++ ++static inline int badblocks_full(struct badblocks *bb) ++{ ++ return (bb->count >= MAX_BADBLOCKS); ++} ++ ++static inline int badblocks_empty(struct badblocks *bb) ++{ ++ return (bb->count == 0); ++} ++ ++static inline void set_changed(struct badblocks *bb) ++{ ++ if (bb->changed != 1) ++ bb->changed = 1; ++} ++ ++static inline void clear_changed(struct badblocks *bb) ++{ ++ if (bb->changed != 0) ++ bb->changed = 0; ++} ++ + #endif +-- +2.31.1 + diff --git a/for-test/badblocks/v4/v4-0002-badblocks-add-helper-routines-for-badblock-ranges.patch b/for-test/badblocks/v4/v4-0002-badblocks-add-helper-routines-for-badblock-ranges.patch new file mode 100644 index 0000000..46116bb --- /dev/null +++ b/for-test/badblocks/v4/v4-0002-badblocks-add-helper-routines-for-badblock-ranges.patch @@ -0,0 +1,457 @@ +From 69aa03e6aa9eb441a3b4bc7c3d017c064d6d821b Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 17:16:57 +0800 +Subject: [PATCH v4 2/6] badblocks: add helper routines for badblock ranges + handling + +This patch adds several helper routines to improve badblock ranges +handling. These helper routines will be used later in the improved +version of badblocks_set()/badblocks_clear()/badblocks_check(). + +- Helpers prev_by_hint() and prev_badblocks() are used to find the bad + range from bad table which the searching range starts at or after. + +- The following helpers are to decide the relative layout between the + manipulating range and existing bad block range from bad table. + - can_merge_behind() + Return 'true' if the manipulating range can backward merge with the + bad block range. + - can_merge_front() + Return 'true' if the manipulating range can forward merge with the + bad block range. + - can_combine_front() + Return 'true' if two adjacent bad block ranges before the + manipulating range can be merged. + - overlap_front() + Return 'true' if the manipulating range exactly overlaps with the + bad block range in front of its range. + - overlap_behind() + Return 'true' if the manipulating range exactly overlaps with the + bad block range behind its range. + - can_front_overwrite() + Return 'true' if the manipulating range can forward overwrite the + bad block range in front of its range. + +- The following helpers are to add the manipulating range into the bad + block table. Different routine is called with the specific relative + layout between the manipulating range and other bad block range in the + bad block table. + - behind_merge() + Merge the manipulating range with the bad block range behind its + range, and return the number of merged length in unit of sector. + - front_merge() + Merge the manipulating range with the bad block range in front of + its range, and return the number of merged length in unit of sector. + - front_combine() + Combine the two adjacent bad block ranges before the manipulating + range into a larger one. + - front_overwrite() + Overwrite partial of whole bad block range which is in front of the + manipulating range. 
The overwrite may split existing bad block range + and generate more bad block ranges into the bad block table. + - insert_at() + Insert the manipulating range at a specific location in the bad + block table. + +All the above helpers are used in later patches to improve the bad block +ranges handling for badblocks_set()/badblocks_clear()/badblocks_check(). + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 374 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 374 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index d39056630d9c..e216c6791b4b 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -16,6 +16,380 @@ + #include <linux/types.h> + #include <linux/slab.h> + ++/* ++ * Find the range starts at-or-before 's' from bad table. The search ++ * starts from index 'hint' and stops at index 'hint_end' from the bad ++ * table. ++ */ ++static int prev_by_hint(struct badblocks *bb, sector_t s, int hint) ++{ ++ int hint_end = hint + 2; ++ u64 *p = bb->page; ++ int ret = -1; ++ ++ while ((hint < hint_end) && ((hint + 1) <= bb->count) && ++ (BB_OFFSET(p[hint]) <= s)) { ++ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) { ++ ret = hint; ++ break; ++ } ++ hint++; ++ } ++ ++ return ret; ++} ++ ++/* ++ * Find the range starts at-or-before bad->start. If 'hint' is provided ++ * (hint >= 0) then search in the bad table from hint firstly. It is ++ * very probably the wanted bad range can be found from the hint index, ++ * then the unnecessary while-loop iteration can be avoided. ++ */ ++static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad, ++ int hint) ++{ ++ sector_t s = bad->start; ++ int ret = -1; ++ int lo, hi; ++ u64 *p; ++ ++ if (!bb->count) ++ goto out; ++ ++ if (hint >= 0) { ++ ret = prev_by_hint(bb, s, hint); ++ if (ret >= 0) ++ goto out; ++ } ++ ++ lo = 0; ++ hi = bb->count; ++ p = bb->page; ++ ++ while (hi - lo > 1) { ++ int mid = (lo + hi)/2; ++ sector_t a = BB_OFFSET(p[mid]); ++ ++ if (a <= s) ++ lo = mid; ++ else ++ hi = mid; ++ } ++ ++ if (BB_OFFSET(p[lo]) <= s) ++ ret = lo; ++out: ++ return ret; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be backward merged ++ * with the bad range (from the bad table) index by 'behind'. ++ */ ++static bool can_merge_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ ++ if ((s <= BB_OFFSET(p[behind])) && ++ ((s + sectors) >= BB_OFFSET(p[behind])) && ++ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) && ++ BB_ACK(p[behind]) == ack) ++ return true; ++ return false; ++} ++ ++/* ++ * Do backward merge for range indicated by 'bad' and the bad range ++ * (from the bad table) indexed by 'behind'. The return value is merged ++ * sectors from bad->len. 
++ */ ++static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_OFFSET(p[behind])); ++ WARN_ON((s + sectors) < BB_OFFSET(p[behind])); ++ ++ if (s < BB_OFFSET(p[behind])) { ++ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); ++ ++ merged = min_t(sector_t, sectors, BB_OFFSET(p[behind]) - s); ++ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, ack); ++ } else { ++ merged = min_t(sector_t, sectors, BB_LEN(p[behind])); ++ } ++ ++ WARN_ON(merged == 0); ++ ++ return merged; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be forward ++ * merged with the bad range (from the bad table) indexed by 'prev'. ++ */ ++static bool can_merge_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ ++ if (BB_ACK(p[prev]) == ack && ++ (s < BB_END(p[prev]) || ++ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN)))) ++ return true; ++ return false; ++} ++ ++/* ++ * Do forward merge for range indicated by 'bad' and the bad range ++ * (from bad table) indexed by 'prev'. The return value is sectors ++ * merged from bad->len. ++ */ ++static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_END(p[prev])); ++ ++ if (s < BB_END(p[prev])) { ++ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s); ++ } else { ++ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev])); ++ if ((prev + 1) < bb->count && ++ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) { ++ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]); ++ } ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + merged, ack); ++ } ++ ++ return merged; ++} ++ ++/* ++ * 'Combine' is a special case which can_merge_front() is not able to ++ * handle: If a bad range (indexed by 'prev' from bad table) exactly ++ * starts as bad->start, and the bad range ahead of 'prev' (indexed by ++ * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and ++ * the sum of their lengths does not exceed BB_MAX_LEN limitation, then ++ * these two bad range (from bad table) can be combined. ++ * ++ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad ++ * table can be combined. ++ */ ++static bool can_combine_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if ((prev > 0) && ++ (BB_OFFSET(p[prev]) == bad->start) && ++ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) && ++ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) && ++ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev]))) ++ return true; ++ return false; ++} ++ ++/* ++ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad ++ * table) into one larger bad range, and the new range is indexed by ++ * 'prev - 1'. 
++ */ ++static void front_combine(struct badblocks *bb, int prev) ++{ ++ u64 *p = bb->page; ++ ++ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]), ++ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]), ++ BB_ACK(p[prev])); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8); ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly forward ++ * overlapped with the bad range (from bad table) indexed by 'front'. ++ * Exactly forward overlap means the bad range (from bad table) indexed ++ * by 'prev' does not cover the whole range indicated by 'bad'. ++ */ ++static bool overlap_front(struct badblocks *bb, int front, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start >= BB_OFFSET(p[front]) && ++ bad->start < BB_END(p[front])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly backward ++ * overlapped with the bad range (from bad table) indexed by 'behind'. ++ */ ++static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start < BB_OFFSET(p[behind]) && ++ (bad->start + bad->len) > BB_OFFSET(p[behind])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can overwrite the bad ++ * range (from bad table) indexed by 'prev'. ++ * ++ * The range indicated by 'bad' can overwrite the bad range indexed by ++ * 'prev' when, ++ * 1) The whole range indicated by 'bad' can cover partial or whole bad ++ * range (from bad table) indexed by 'prev'. ++ * 2) The ack value of 'bad' is larger or equal to the ack value of bad ++ * range 'prev'. ++ * ++ * If the overwriting doesn't cover the whole bad range (from bad table) ++ * indexed by 'prev', new range might be split from existing bad range, ++ * 1) The overwrite covers head or tail part of existing bad range, 1 ++ * extra bad range will be split and added into the bad table. ++ * 2) The overwrite covers middle of existing bad range, 2 extra bad ++ * ranges will be split (ahead and after the overwritten range) and ++ * added into the bad table. ++ * The number of extra split ranges of the overwriting is stored in ++ * 'extra' and returned for the caller. ++ */ ++static bool can_front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *extra) ++{ ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(!overlap_front(bb, prev, bad)); ++ ++ if (BB_ACK(p[prev]) >= bad->ack) ++ return false; ++ ++ if (BB_END(p[prev]) <= (bad->start + bad->len)) { ++ len = BB_END(p[prev]) - bad->start; ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 0; ++ else ++ *extra = 1; ++ ++ bad->len = len; ++ } else { ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 1; ++ else ++ /* ++ * prev range will be split into two, beside the overwritten ++ * one, an extra slot needed from bad table. ++ */ ++ *extra = 2; ++ } ++ ++ if ((bb->count + (*extra)) >= MAX_BADBLOCKS) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Do the overwrite from the range indicated by 'bad' to the bad range ++ * (from bad table) indexed by 'prev'. ++ * The previously called can_front_overwrite() will provide how many ++ * extra bad range(s) might be split and added into the bad table. All ++ * the splitting cases in the bad table will be handled here. 
++ */ ++static int front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int extra) ++{ ++ u64 *p = bb->page; ++ sector_t orig_end = BB_END(p[prev]); ++ int orig_ack = BB_ACK(p[prev]); ++ int n = extra; ++ ++ switch (extra) { ++ case 0: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]), ++ bad->ack); ++ break; ++ case 1: ++ if (BB_OFFSET(p[prev]) == bad->start) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->len, bad->ack); ++ memmove(p + prev + 2, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start + bad->len, ++ orig_end - BB_END(p[prev]), ++ orig_ack); ++ } else { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ memmove(p + prev + 1 + n, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ } ++ break; ++ case 2: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ memmove(p + prev + 1 + n, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]), ++ orig_end - BB_END(p[prev + 1]), ++ BB_ACK(p[prev])); ++ break; ++ default: ++ break; ++ } ++ ++ return bad->len; ++} ++ ++/* ++ * Explicitly insert a range indicated by 'bad' to the bad table, where ++ * the location is indexed by 'at'. ++ */ ++static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ int ack = bad->ack; ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(badblocks_full(bb)); ++ ++ len = min_t(sector_t, sectors, BB_MAX_LEN); ++ if (at < bb->count) ++ memmove(p + at + 1, p + at, (bb->count - at) * 8); ++ p[at] = BB_MAKE(s, len, ack); ++ ++ return len; ++} ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v4/v4-0003-badblocks-improvement-badblocks_set-for-multiple-.patch b/for-test/badblocks/v4/v4-0003-badblocks-improvement-badblocks_set-for-multiple-.patch new file mode 100644 index 0000000..cd732d0 --- /dev/null +++ b/for-test/badblocks/v4/v4-0003-badblocks-improvement-badblocks_set-for-multiple-.patch @@ -0,0 +1,661 @@ +From c6d337537fae982c4d24ce626436e32a2f71e5f8 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 2 Dec 2021 15:57:50 +0800 +Subject: [PATCH v4 3/6] badblocks: improve badblocks_set() for multiple ranges handling + +Recently I received a bug report that current badblocks code does not +properly handle multiple ranges. For example, + badblocks_set(bb, 32, 1, true); + badblocks_set(bb, 34, 1, true); + badblocks_set(bb, 36, 1, true); + badblocks_set(bb, 32, 12, true); +Then indeed badblocks_show() reports, + 32 3 + 36 1 +But the expected bad blocks table should be, + 32 12 +Obviously only the first 2 ranges are merged and badblocks_set() returns +and ignores the rest setting range. + +This behavior is improper, if the caller of badblocks_set() wants to set +a range of blocks into bad blocks table, all of the blocks in the range +should be handled even the previous part encountering failure. + +The desired way to set bad blocks range by badblocks_set() is, +- Set as many as blocks in the setting range into bad blocks table. +- Merge the bad blocks ranges and occupy as less as slots in the bad + blocks table. +- Fast. 
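+
+For illustration only, the expectation from the example above can be
+written as a small user space style check (this sketch is not the test
+code shipped with this series; it assumes block/badblocks.c and
+include/linux/badblocks.h are compiled into a user space harness, and
+that bb points to an already initialized struct badblocks with shift
+set to 0 so no rounding happens),
+
+    u64 *p = bb->page;
+
+    badblocks_set(bb, 32, 1, true);
+    badblocks_set(bb, 34, 1, true);
+    badblocks_set(bb, 36, 1, true);
+    badblocks_set(bb, 32, 12, true);
+
+    /* expect one merged, acknowledged range: start 32, length 12 */
+    assert(bb->count == 1);
+    assert(BB_OFFSET(p[0]) == 32);
+    assert(BB_LEN(p[0]) == 12);
+    assert(BB_ACK(p[0]));
+
+Here assert() stands for whatever check the harness provides; in kernel
+context the same expectation shows up as a single "32 12" line in
+badblocks_show() output.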
+ +Indeed the above proposal is complicated, especially with the following +restrictions, +- The setting bad blocks range can be acknowledged or not acknowledged. +- The bad blocks table size is limited. +- Memory allocation should be avoided. + +The basic idea of the patch is to categorize all possible bad blocks +range setting combinations into to much less simplified and more less +special conditions. Inside badblocks_set() there is an implicit loop +composed by jumping between labels 're_insert' and 'update_sectors'. No +matter how large the setting bad blocks range is, in every loop just a +minimized range from the head is handled by a pre-defined behavior from +one of the categorized conditions. The logic is simple and code flow is +manageable. + +The different relative layout between the setting range and existing bad +block range are checked and handled (merge, combine, overwrite, insert) +by the helpers in previous patch. This patch is to make all the helpers +work together with the above idea. + +This patch only has the algorithm improvement for badblocks_set(). There +are following patches contain improvement for badblocks_clear() and +badblocks_check(). But the algorithm in badblocks_set() is fundamental +and typical, other improvement in clear and check routines are based on +all the helpers and ideas in this patch. + +In order to make the change to be more clear for code review, this patch +does not directly modify existing badblocks_set(), and just add a new +one named _badblocks_set(). Later patch will remove current existing +badblocks_set() code and make it as a wrapper of _badblocks_set(). So +the new added change won't be mixed with deleted code, the code review +can be easier. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 560 ++++++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 540 insertions(+), 20 deletions(-) + +diff --git a/block/badblocks.c b/block/badblocks.c +index e216c6791b4b..13eaad18be15 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -16,6 +16,322 @@ + #include <linux/types.h> + #include <linux/slab.h> + ++/* ++ * The purpose of badblocks set/clear is to manage bad blocks ranges which are ++ * identified by LBA addresses. ++ * ++ * When the caller of badblocks_set() wants to set a range of bad blocks, the ++ * setting range can be acked or unacked. And the setting range may merge, ++ * overwrite, skip the overlapped already set range, depends on who they are ++ * overlapped or adjacent, and the acknowledgment type of the ranges. It can be ++ * more complicated when the setting range covers multiple already set bad block ++ * ranges, with restrictions of maximum length of each bad range and the bad ++ * table space limitation. ++ * ++ * It is difficult and unnecessary to take care of all the possible situations, ++ * for setting a large range of bad blocks, we can handle it by dividing the ++ * large range into smaller ones when encounter overlap, max range length or ++ * bad table full conditions. Every time only a smaller piece of the bad range ++ * is handled with a limited number of conditions how it is interacted with ++ * possible overlapped or adjacent already set bad block ranges. Then the hard ++ * complicated problem can be much simpler to handle in proper way. 
++ * ++ * When setting a range of bad blocks to the bad table, the simplified situations ++ * to be considered are, (The already set bad blocks ranges are naming with ++ * prefix E, and the setting bad blocks range is naming with prefix S) ++ * ++ * 1) A setting range is not overlapped or adjacent to any other already set bad ++ * block range. ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ +-------------+ ++ * | E1 | | E2 | ++ * +-------------+ +-------------+ ++ * For this situation if the bad blocks table is not full, just allocate a ++ * free slot from the bad blocks table to mark the setting range S. The ++ * result is, ++ * +-------------+ +--------+ +-------------+ ++ * | E1 | | S | | E2 | ++ * +-------------+ +--------+ +-------------+ ++ * 2) A setting range starts exactly at a start LBA of an already set bad blocks ++ * range. ++ * 2.1) The setting range size < already set range size ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E. ++ * An extra slot from the bad blocks table will be allocated for S, and head ++ * of E will move to end of the inserted range S. The result is, ++ * +--------+----+ ++ * | S | E | ++ * +--------+----+ ++ * 2.2) The setting range size == already set range size ++ * 2.2.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of ++ bad blocks range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.3) The setting range size > already set range size ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For such situation, the setting range S can be treated as two parts, the ++ * first part (S1) is as same size as the already set range E, the second ++ * part (S2) is the rest of setting range. ++ * +-------------+-----+ +-------------+ +-----+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +-------------+-----+ ===> +-------------+ +-----+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now we only focus on how to handle the setting range S1 and already set ++ * range E, which are already explained in 2.2), for the rest S2 it will be ++ * handled later in next loop. ++ * 3) A setting range starts before the start LBA of an already set bad blocks ++ * range. ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation, the setting range S can be divided into two parts, the ++ * first (S1) ends at the start LBA of already set range E, the second part ++ * (S2) starts exactly at a start LBA of the already set range E. 
++ * +----+---------+ +----+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +----+---------+ ===> +----+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now only the first part S1 should be handled in this loop, which is in ++ * similar condition as 1). The rest part S2 has exact same start LBA address ++ * of the already set range E, they will be handled in next loop in one of ++ * situations in 2). ++ * 4) A setting range starts after the start LBA of an already set bad blocks ++ * range. ++ * 4.1) If the setting range S exactly matches the tail part of already set bad ++ * blocks range E, like the following chart shows, ++ * +---------+ ++ * | S | ++ * +---------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 4.1.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may ++ * overwrite the overlapped range of E, the result is, ++ * +---+---------+ ++ * | E | S | ++ * +---+---------+ ++ * 4.2) If the setting range S stays in middle of an already set range E, like ++ * the following chart shows, ++ * +----+ ++ * | S | ++ * +----+ ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 4.2.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is also, ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will ++ * inserted into middle of E and split previous range E into twp parts (E1 ++ * and E2), the result is, ++ * +----+----+----+ ++ * | E1 | S | E2 | ++ * +----+----+----+ ++ * 4.3) If the setting bad blocks range S is overlapped with an already set bad ++ * blocks range E. The range S starts after the start LBA of range E, and ++ * ends after the end LBA of range E, as the following chart shows, ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation the range S can be divided into two parts, the first ++ * part (S1) ends at end range E, and the second part (S2) has rest range of ++ * origin S. ++ * +---------+---------+ +---------+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +---------+---------+ ===> +---------+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now in this loop the setting range S1 and already set range E can be ++ * handled as the situations 4), the rest range S2 will be handled in next ++ * loop and ignored in this loop. ++ * 5) A setting bad blocks range S is adjacent to one or more already set bad ++ * blocks range(s), and they are all acked or unacked range. 
++ * 5.1) Front merge: If the already set bad blocks range E is before setting ++ * range S and they are adjacent, ++ * +------+ ++ * | S | ++ * +------+ ++ * +-------+ ++ * | E | ++ * +-------+ ++ * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge ++ * values are same, the setting range S can front merges into range E. The ++ * result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting ++ * range S right after already set range E into the bad blocks table. The ++ * result is, ++ * +--------+------+ ++ * | E | S | ++ * +--------+------+ ++ * 6) Special cases which above conditions cannot handle ++ * 6.1) Multiple already set ranges may merge into less ones in a full bad table ++ * +-------------------------------------------------------+ ++ * | S | ++ * +-------------------------------------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+ +-----+ +-----+ ++ * | E1 | | E2 | | E3 | ++ * +-----+ +-----+ +-----+ ++ * In the above example, when the bad blocks table is full, inserting the ++ * first part of setting range S will fail because no more available slot ++ * can be allocated from bad blocks table. In this situation a proper ++ * setting method should be go though all the setting bad blocks range and ++ * look for chance to merge already set ranges into less ones. When there ++ * is available slot from bad blocks table, re-try again to handle more ++ * setting bad blocks ranges as many as possible. ++ * +------------------------+ ++ * | S3 | ++ * +------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+-----+-----+---+-----+--+ ++ * | S1 | S2 | ++ * +-----+-----+-----+---+-----+--+ ++ * The above chart shows although the first part (S3) cannot be inserted due ++ * to no-space in bad blocks table, but the following E1, E2 and E3 ranges ++ * can be merged with rest part of S into less range S1 and S2. Now there is ++ * 1 free slot in bad blocks table. ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * | S3 | S1 | S2 | ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * Since the bad blocks table is not full anymore, re-try again for the ++ * origin setting range S. Now the setting range S3 can be inserted into the ++ * bad blocks table with previous freed slot from multiple ranges merge. ++ * 6.2) Front merge after overwrite ++ * In the following example, in bad blocks table, E1 is an acked bad blocks ++ * range and E2 is an unacked bad blocks range, therefore they are not able ++ * to merge into a larger range. The setting bad blocks range S is acked, ++ * therefore part of E2 can be overwritten by S. ++ * +--------+ ++ * | S | acknowledged ++ * +--------+ S: 1 ++ * +-------+-------------+ E1: 1 ++ * | E1 | E2 | E2: 0 ++ * +-------+-------------+ ++ * With previous simplified routines, after overwriting part of E2 with S, ++ * the bad blocks table should be (E3 is remaining part of E2 which is not ++ * overwritten by S), ++ * acknowledged ++ * +-------+--------+----+ S: 1 ++ * | E1 | S | E3 | E1: 1 ++ * +-------+--------+----+ E3: 0 ++ * The above result is correct but not perfect. Range E1 and S in the bad ++ * blocks table are all acked, merging them into a larger one range may ++ * occupy less bad blocks table space and make badblocks_check() faster. ++ * Therefore in such situation, after overwriting range S, the previous range ++ * E1 should be checked for possible front combination. 
Then the ideal
++ * result can be,
++ * +----------------+----+    acknowledged
++ * |       E1       | E3 |    E1: 1
++ * +----------------+----+    E3: 0
++ * 6.3) Behind merge: If the already set bad blocks range E is behind the setting
++ * range S and they are adjacent. Normally we don't need to care about this
++ * because front merge handles this while going through range S from head to
++ * tail, except for the tail part of range S. When the setting range S is
++ * fully handled, none of the above simplified routines checks whether the
++ * tail LBA of range S is adjacent to the next already set range, so they
++ * are not merged even if they are mergeable.
++ * +------+
++ * |  S   |
++ * +------+
++ *         +-------+
++ *         |   E   |
++ *         +-------+
++ * For the above special situation, when the setting range S is fully handled
++ * and the loop ends, an extra check is necessary to see whether the next
++ * already set range E is right after S and mergeable.
++ * 6.2.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge
++ * values are same, the setting range S can be behind merged into range E. The
++ * result is,
++ * +--------------+
++ * |      S       |
++ * +--------------+
++ * 6.2.2) Otherwise these two ranges cannot merge, just insert the setting range
++ * S in front of the already set range E in the bad blocks table. The result
++ * is,
++ * +------+-------+
++ * |  S   |   E   |
++ * +------+-------+
++ *
++ * All the above 5 simplified situations and 3 special cases may cover 99%+ of
++ * the bad block range setting conditions. There may be some rare corner cases
++ * which are not considered or optimized, but it won't hurt if badblocks_set()
++ * fails due to no space, or if some ranges are not merged to save bad blocks
++ * table space.
++ *
++ * Inside badblocks_set() each loop starts by jumping to the re_insert label;
++ * at the beginning of every loop prev_badblocks() is called to find an already
++ * set range which starts before or at the current setting range. Since the
++ * setting bad blocks range is handled from head to tail, in most cases it is
++ * unnecessary to do the binary search inside prev_badblocks(); providing a
++ * hint to prev_badblocks() enables a fast path, so the expensive binary search
++ * can be avoided. In my test with the hint to prev_badblocks(), except for the
++ * first loop, all subsequent calls to prev_badblocks() go into the fast path
++ * and return the correct bad blocks table index immediately.
++ */
++
+ /*
+ * Find the range starts at-or-before 's' from bad table.
The search + * starts from index 'hint' and stops at index 'hint_end' from the bad +@@ -390,6 +706,230 @@ static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad + return len; + } + ++static void badblocks_update_acked(struct badblocks *bb) ++{ ++ bool unacked = false; ++ u64 *p = bb->page; ++ int i; ++ ++ if (!bb->unacked_exist) ++ return; ++ ++ for (i = 0; i < bb->count ; i++) { ++ if (!BB_ACK(p[i])) { ++ unacked = true; ++ break; ++ } ++ } ++ ++ if (!unacked) ++ bb->unacked_exist = 0; ++} ++ ++/* Do exact work to set bad block range into the bad block table */ ++static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, ++ int acknowledged) ++{ ++ int retried = 0, space_desired = 0; ++ int orig_len, len = 0, added = 0; ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ sector_t orig_start; ++ unsigned long flags; ++ int rv = 0; ++ u64 *p; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ /* round the start down, and the end up */ ++ sector_t next = s + sectors; ++ ++ rounddown(s, bb->shift); ++ roundup(next, bb->shift); ++ sectors = next - s; ++ } ++ ++ write_seqlock_irqsave(&bb->lock, flags); ++ ++ orig_start = s; ++ orig_len = sectors; ++ bad.ack = acknowledged; ++ p = bb->page; ++ ++re_insert: ++ bad.start = s; ++ bad.len = sectors; ++ len = 0; ++ ++ if (badblocks_empty(bb)) { ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start before all badblocks */ ++ if (prev < 0) { ++ if (!badblocks_full(bb)) { ++ /* insert on the first */ ++ if (bad.len > (BB_OFFSET(p[0]) - bad.start)) ++ bad.len = BB_OFFSET(p[0]) - bad.start; ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* No sapce, try to merge */ ++ if (overlap_behind(bb, &bad, 0)) { ++ if (can_merge_behind(bb, &bad, 0)) { ++ len = behind_merge(bb, &bad, 0); ++ added++; ++ } else { ++ len = min_t(sector_t, ++ BB_OFFSET(p[0]) - s, sectors); ++ space_desired = 1; ++ } ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* no table space and give up */ ++ goto out; ++ } ++ ++ /* in case p[prev-1] can be merged with p[prev] */ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ } else { ++ int extra = 0; ++ ++ if (!can_front_overwrite(bb, prev, &bad, &extra)) { ++ len = min_t(sector_t, ++ BB_END(p[prev]) - s, sectors); ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ len = front_overwrite(bb, prev, &bad, extra); ++ added++; ++ bb->count += extra; ++ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ } ++ } ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ /* if no space in table, still try to merge in the covered range */ ++ if (badblocks_full(bb)) { ++ /* skip the cannot-merge range */ ++ if (((prev + 1) < bb->count) && ++ overlap_behind(bb, &bad, prev + 1) && ++ ((s + sectors) >= BB_END(p[prev + 1]))) { ++ len = BB_END(p[prev + 1]) - s; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* no retry any more */ ++ 
len = sectors; ++ space_desired = 1; ++ hint = -1; ++ goto update_sectors; ++ } ++ ++ /* cannot merge and there is space in bad table */ ++ if ((prev + 1) < bb->count && ++ overlap_behind(bb, &bad, prev + 1)) ++ bad.len = min_t(sector_t, ++ bad.len, BB_OFFSET(p[prev + 1]) - bad.start); ++ ++ len = insert_at(bb, prev + 1, &bad); ++ bb->count++; ++ added++; ++ hint = prev + 1; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_insert; ++ ++ WARN_ON(sectors < 0); ++ ++ /* Check whether the following already set range can be merged */ ++ if ((prev + 1) < bb->count && ++ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && ++ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && ++ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), ++ BB_ACK(p[prev])); ++ ++ if ((prev + 2) < bb->count) ++ memmove(p + prev + 1, p + prev + 2, ++ (bb->count - (prev + 2)) * 8); ++ bb->count--; ++ } ++ ++ if (space_desired && !badblocks_full(bb)) { ++ s = orig_start; ++ sectors = orig_len; ++ space_desired = 0; ++ if (retried++ < 3) ++ goto re_insert; ++ } ++ ++out: ++ if (added) { ++ set_changed(bb); ++ ++ if (!acknowledged) ++ bb->unacked_exist = 1; ++ else ++ badblocks_update_acked(bb); ++ } ++ ++ write_sequnlock_irqrestore(&bb->lock, flags); ++ ++ if (!added) ++ rv = 1; ++ ++ return rv; ++} ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +@@ -499,26 +1039,6 @@ int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + } + EXPORT_SYMBOL_GPL(badblocks_check); + +-static void badblocks_update_acked(struct badblocks *bb) +-{ +- u64 *p = bb->page; +- int i; +- bool unacked = false; +- +- if (!bb->unacked_exist) +- return; +- +- for (i = 0; i < bb->count ; i++) { +- if (!BB_ACK(p[i])) { +- unacked = true; +- break; +- } +- } +- +- if (!unacked) +- bb->unacked_exist = 0; +-} +- + /** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v4/v4-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch b/for-test/badblocks/v4/v4-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch new file mode 100644 index 0000000..ad5cfc3 --- /dev/null +++ b/for-test/badblocks/v4/v4-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch @@ -0,0 +1,399 @@ +From a7120f4e3a771de6f6c682798b0e9ebf3c6fcb49 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 22:16:10 +0800 +Subject: [PATCH v4 4/6] badblocks: improve badblocks_clear() for multiple + ranges handling + +With the fundamental ideas and helper routines from badblocks_set() +improvement, clearing bad block for multiple ranges is much simpler. + +With a similar idea from badblocks_set() improvement, this patch +simplifies bad block range clearing into 5 situations. No matter how +complicated the clearing condition is, we just look at the head part +of clearing range with relative already set bad block range from the +bad block table. The rested part will be handled in next run of the +while-loop. + +Based on existing helpers added from badblocks_set(), this patch adds +two more helpers, +- front_clear() + Clear the bad block range from bad block table which is front + overlapped with the clearing range. 
+- front_splitting_clear() + Handle the condition that the clearing range hits middle of an + already set bad block range from bad block table. + +Similar as badblocks_set(), the first part of clearing range is handled +with relative bad block range which is find by prev_badblocks(). In most +cases a valid hint is provided to prev_badblocks() to avoid unnecessary +bad block table iteration. + +This patch also explains the detail algorithm code comments at beginning +of badblocks.c, including which five simplified situations are +categrized and how all the bad block range clearing conditions are +handled by these five situations. + +Again, in order to make the code review easier and avoid the code +changes mixed together, this patch does not modify badblock_clear() and +implement another routine called _badblock_clear() for the improvement. +Later patch will delete current code of badblock_clear() and make it as +a wrapper to _badblock_clear(), so the code change can be much clear for +review. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 325 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 325 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index 13eaad18be15..c188b2e98140 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -330,6 +330,123 @@ + * avoided. In my test with the hint to prev_badblocks(), except for the first + * loop, all rested calls to prev_badblocks() can go into the fast path and + * return correct bad blocks table index immediately. ++ * ++ * ++ * Clearing a bad blocks range from the bad block table has similar idea as ++ * setting does, but much more simpler. The only thing needs to be noticed is ++ * when the clearing range hits middle of a bad block range, the existing bad ++ * block range will split into two, and one more item should be added into the ++ * bad block table. The simplified situations to be considered are, (The already ++ * set bad blocks ranges in bad block table are naming with prefix E, and the ++ * clearing bad blocks range is naming with prefix C) ++ * ++ * 1) A clearing range is not overlapped to any already set ranges in bad block ++ * table. ++ * +-----+ | +-----+ | +-----+ ++ * | C | | | C | | | C | ++ * +-----+ or +-----+ or +-----+ ++ * +---+ | +----+ +----+ | +---+ ++ * | E | | | E1 | | E2 | | | E | ++ * +---+ | +----+ +----+ | +---+ ++ * For the above situations, no bad block to be cleared and no failure ++ * happens, simply returns 0. ++ * 2) The clearing range hits middle of an already setting bad blocks range in ++ * the bad block table. ++ * +---+ ++ * | C | ++ * +---+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * In this situation if the bad block table is not full, the range E will be ++ * split into two ranges E1 and E2. The result is, ++ * +------+ +------+ ++ * | E1 | | E2 | ++ * +------+ +------+ ++ * 3) The clearing range starts exactly at same LBA as an already set bad block range ++ * from the bad block table. ++ * 3.1) Partially covered at head part ++ * +------------+ ++ * | C | ++ * +------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation, the overlapped already set range will update the ++ * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). 
No ++ * item deleted from bad block table. The result is, ++ * +----+ ++ * | E1 | ++ * +----+ ++ * 3.2) Exact fully covered ++ * +-----------------+ ++ * | C | ++ * +-----------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation the whole bad blocks range E will be cleared and its ++ * corresponded item is deleted from the bad block table. ++ * 4) The clearing range exactly ends at same LBA as an already set bad block ++ * range. ++ * +-------+ ++ * | C | ++ * +-------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For the above situation, the already set range E is updated to shrink its ++ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C). ++ * The result is, ++ * +---------+ ++ * | E | ++ * +---------+ ++ * 5) The clearing range is partially overlapped with an already set bad block ++ * range from the bad block table. ++ * 5.1) The already set bad block range is front overlapped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. The ++ * first part ends at the start LBA of range E, and the second part starts at ++ * same LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part C1 can be handled as condition 1), and the second part C2 can be ++ * handled as condition 3.1) in next loop. ++ * 5.2) The already set bad block range is behind overlaopped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. The ++ * first part C1 ends at same end LBA of range E, and the second part starts ++ * at end LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part clearing range C1 can be handled as condition 4), and ++ * the second part clearing range C2 can be handled as condition 1) in next ++ * loop. ++ * ++ * All bad blocks range clearing can be simplified into the above 5 situations ++ * by only handling the head part of the clearing range in each run of the ++ * while-loop. The idea is similar to bad blocks range setting but much ++ * simpler. + */ + + /* +@@ -930,6 +1047,214 @@ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, + return rv; + } + ++/* ++ * Clear the bad block range from bad block table which is front overlapped ++ * with the clearing range. The return value is how many sectors from an ++ * already set bad block range are cleared. If the whole bad block range is ++ * covered by the clearing range and fully cleared, 'delete' is set as 1 for ++ * the caller to reduce bb->count. 
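++ *
++ * A worked example (the numbers are illustrative only, they are not taken
++ * from this patch): if p[prev] covers sectors [100, 160) and the clearing
++ * range starts at sector 100 with length 40, p[prev] is shrunk to
++ * [140, 160) and 40 is returned. If the clearing range were [100, 200)
++ * instead, the whole record is removed, 60 is returned and '*deleted' is
++ * set to 1.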
++ */ ++static int front_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *deleted) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int cleared = 0; ++ ++ *deleted = 0; ++ if (s == BB_OFFSET(p[prev])) { ++ if (BB_LEN(p[prev]) > sectors) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors, ++ BB_LEN(p[prev]) - sectors, ++ BB_ACK(p[prev])); ++ cleared = sectors; ++ } else { ++ /* BB_LEN(p[prev]) <= sectors */ ++ cleared = BB_LEN(p[prev]); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ *deleted = 1; ++ } ++ } else if (s > BB_OFFSET(p[prev])) { ++ if (BB_END(p[prev]) <= (s + sectors)) { ++ cleared = BB_END(p[prev]) - s; ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ } else { ++ /* Splitting is handled in front_splitting_clear() */ ++ BUG(); ++ } ++ } ++ ++ return cleared; ++} ++ ++/* ++ * Handle the condition that the clearing range hits middle of an already set ++ * bad block range from bad block table. In this condition the existing bad ++ * block range is split into two after the middle part is cleared. ++ */ ++static int front_splitting_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ u64 end = BB_END(p[prev]); ++ int ack = BB_ACK(p[prev]); ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ ack); ++ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack); ++ return sectors; ++} ++ ++/* Do the exact work to clear bad block range from the bad block table */ ++static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) ++{ ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ int len = 0, cleared = 0; ++ int rv = 0; ++ u64 *p; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ sector_t target; ++ ++ /* When clearing we round the start up and the end down. ++ * This should not matter as the shift should align with ++ * the block size and no rounding should ever be needed. ++ * However it is better the think a block is bad when it ++ * isn't than to think a block is not bad when it is. 
++ */ ++ target = s + sectors; ++ roundup(s, bb->shift); ++ rounddown(target, bb->shift); ++ sectors = target - s; ++ } ++ ++ write_seqlock_irq(&bb->lock); ++ ++ bad.ack = true; ++ p = bb->page; ++ ++re_clear: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* Start before all badblocks */ ++ if (prev < 0) { ++ if (overlap_behind(bb, &bad, 0)) { ++ len = BB_OFFSET(p[0]) - s; ++ hint = prev; ++ } else { ++ len = sectors; ++ } ++ /* ++ * Both situations are to clear non-bad range, ++ * should be treated as successful ++ */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Clear will split a bad record but the table is full */ ++ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + sectors))) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if ((BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + bad.len))) { ++ /* Splitting */ ++ if ((bb->count + 1) < MAX_BADBLOCKS) { ++ len = front_splitting_clear(bb, prev, &bad); ++ bb->count += 1; ++ cleared++; ++ } else { ++ /* No space to split, give up */ ++ len = sectors; ++ } ++ } else { ++ int deleted = 0; ++ ++ len = front_clear(bb, prev, &bad, &deleted); ++ bb->count -= deleted; ++ cleared++; ++ hint = prev; ++ } ++ ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Not cover any badblocks range in the table */ ++ len = sectors; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_clear; ++ ++ WARN_ON(sectors < 0); ++ ++ if (cleared) { ++ badblocks_update_acked(bb); ++ set_changed(bb); ++ } ++ ++ write_sequnlock_irq(&bb->lock); ++ ++ if (!cleared) ++ rv = 1; ++ ++ return rv; ++} ++ ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v4/v4-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch b/for-test/badblocks/v4/v4-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch new file mode 100644 index 0000000..e519560 --- /dev/null +++ b/for-test/badblocks/v4/v4-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch @@ -0,0 +1,175 @@ +From 88b4c165ef9827f0febe7a527faea2a0d99feb66 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 2 Dec 2021 16:13:35 +0800 +Subject: [PATCH v4 5/6] badblocks: improve badblocks_check() for multiple + ranges handling + +This patch rewrites badblocks_check() with similar coding style as +_badblocks_set() and _badblocks_clear(). The only difference is bad +blocks checking may handle multiple ranges in bad tables now. 
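+
+For illustration only (this wrapper is not part of the patch and its name is
+made up; only badblocks_check() itself comes from this series), a minimal
+sketch of how a caller might consume the return convention detailed below:
+
+	static bool io_range_usable(struct badblocks *bb, sector_t s, int sectors)
+	{
+		sector_t first_bad;
+		int bad_sectors;
+
+		switch (badblocks_check(bb, s, sectors, &first_bad, &bad_sectors)) {
+		case 0:
+			return true;	/* no bad blocks touch [s, s + sectors) */
+		case 1:
+			return false;	/* only acknowledged bad blocks overlap */
+		default:
+			return false;	/* -1: unacknowledged bad blocks overlap */
+		}
+	}
+
+When the result is non-zero, first_bad and bad_sectors report the first bad
+range that was hit, as described below.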
+ +If a checking range covers multiple bad blocks range in bad block table, +like the following condition (C is the checking range, E1, E2, E3 are +three bad block ranges in bad block table), + +------------------------------------+ + | C | + +------------------------------------+ + +----+ +----+ +----+ + | E1 | | E2 | | E3 | + +----+ +----+ +----+ +The improved badblocks_check() algorithm will divide checking range C +into multiple parts, and handle them in 7 runs of a while-loop, + +--+ +----+ +----+ +----+ +----+ +----+ +----+ + |C1| | C2 | | C3 | | C4 | | C5 | | C6 | | C7 | + +--+ +----+ +----+ +----+ +----+ +----+ +----+ + +----+ +----+ +----+ + | E1 | | E2 | | E3 | + +----+ +----+ +----+ +And the start LBA and length of range E1 will be set as first_bad and +bad_sectors for the caller. + +The return value rule is consistent for multiple ranges. For example if +there are following bad block ranges in bad block table, + Index No. Start Len Ack + 0 400 20 1 + 1 500 50 1 + 2 650 20 0 +the return value, first_bad, bad_sectors by calling badblocks_set() with +different checking range can be the following values, + Checking Start, Len Return Value first_bad bad_sectors + 100, 100 0 N/A N/A + 100, 310 1 400 10 + 100, 440 1 400 10 + 100, 540 1 400 10 + 100, 600 -1 400 10 + 100, 800 -1 400 10 + +In order to make code review easier, this patch names the improved bad +block range checking routine as _badblocks_check() and does not change +existing badblock_check() code yet. Later patch will delete old code of +badblocks_check() and make it as a wrapper to call _badblocks_check(). +Then the new added code won't mess up with the old deleted code, it will +be more clear and easier for code review. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 97 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index c188b2e98140..f16c54925275 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -1254,6 +1254,103 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) + return rv; + } + ++/* Do the exact work to check bad blocks range from the bad block table */ ++static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, ++ sector_t *first_bad, int *bad_sectors) ++{ ++ int unacked_badblocks, acked_badblocks; ++ int prev = -1, hint = -1, set = 0; ++ struct badblocks_context bad; ++ unsigned int seq; ++ int len, rv; ++ u64 *p; ++ ++ WARN_ON(bb->shift < 0 || sectors == 0); ++ ++ if (bb->shift > 0) { ++ sector_t target; ++ ++ /* round the start down, and the end up */ ++ target = s + sectors; ++ rounddown(s, bb->shift); ++ roundup(target, bb->shift); ++ sectors = target - s; ++ } ++ ++retry: ++ seq = read_seqbegin(&bb->lock); ++ ++ p = bb->page; ++ unacked_badblocks = 0; ++ acked_badblocks = 0; ++ ++re_check: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if (BB_ACK(p[prev])) ++ acked_badblocks++; ++ else ++ 
unacked_badblocks++; ++ ++ if (BB_END(p[prev]) >= (s + sectors)) ++ len = sectors; ++ else ++ len = BB_END(p[prev]) - s; ++ ++ if (set == 0) { ++ *first_bad = BB_OFFSET(p[prev]); ++ *bad_sectors = BB_LEN(p[prev]); ++ set = 1; ++ } ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* not cover any badblocks range in the table */ ++ len = sectors; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_check; ++ ++ WARN_ON(sectors < 0); ++ ++ if (unacked_badblocks > 0) ++ rv = -1; ++ else if (acked_badblocks > 0) ++ rv = 1; ++ else ++ rv = 0; ++ ++ if (read_seqretry(&bb->lock, seq)) ++ goto retry; ++ ++ return rv; ++} + + /** + * badblocks_check() - check a given range for bad sectors +-- +2.31.1 + diff --git a/for-test/badblocks/v4/v4-0006-badblocks-switch-to-the-improved-badblock-handlin.patch b/for-test/badblocks/v4/v4-0006-badblocks-switch-to-the-improved-badblock-handlin.patch new file mode 100644 index 0000000..17b7597 --- /dev/null +++ b/for-test/badblocks/v4/v4-0006-badblocks-switch-to-the-improved-badblock-handlin.patch @@ -0,0 +1,365 @@ +From 839dec5ce2a8e6fae537d8eaa5bc4c7ae89e8a49 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 2 Mar 2021 10:48:43 +0800 +Subject: [PATCH v4 6/6] badblocks: switch to the improved badblock handling + code + +This patch removes old code of badblocks_set(), badblocks_clear() and +badblocks_check(), and make them as wrappers to call _badblocks_set(), +_badblocks_clear() and _badblocks_check(). + +By this change now the badblock handing switch to the improved algorithm +in _badblocks_set(), _badblocks_clear() and _badblocks_check(). + +This patch only contains the changes of old code deletion, new added +code for the improved algorithms are in previous patches. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 310 +--------------------------------------------- + 1 file changed, 3 insertions(+), 307 deletions(-) + +diff --git a/block/badblocks.c b/block/badblocks.c +index f16c54925275..4838750811ca 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -1389,75 +1389,7 @@ static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, + int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) + { +- int hi; +- int lo; +- u64 *p = bb->page; +- int rv; +- sector_t target = s + sectors; +- unsigned seq; +- +- if (bb->shift > 0) { +- /* round the start down, and the end up */ +- s >>= bb->shift; +- target += (1<<bb->shift) - 1; +- target >>= bb->shift; +- sectors = target - s; +- } +- /* 'target' is now the first block after the bad range */ +- +-retry: +- seq = read_seqbegin(&bb->lock); +- lo = 0; +- rv = 0; +- hi = bb->count; +- +- /* Binary search between lo and hi for 'target' +- * i.e. for the last range that starts before 'target' +- */ +- /* INVARIANT: ranges before 'lo' and at-or-after 'hi' +- * are known not to be the last range before target. 
+- * VARIANT: hi-lo is the number of possible +- * ranges, and decreases until it reaches 1 +- */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a < target) +- /* This could still be the one, earlier ranges +- * could not. +- */ +- lo = mid; +- else +- /* This and later ranges are definitely out. */ +- hi = mid; +- } +- /* 'lo' might be the last that started before target, but 'hi' isn't */ +- if (hi > lo) { +- /* need to check all range that end after 's' to see if +- * any are unacknowledged. +- */ +- while (lo >= 0 && +- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { +- if (BB_OFFSET(p[lo]) < target) { +- /* starts before the end, and finishes after +- * the start, so they must overlap +- */ +- if (rv != -1 && BB_ACK(p[lo])) +- rv = 1; +- else +- rv = -1; +- *first_bad = BB_OFFSET(p[lo]); +- *bad_sectors = BB_LEN(p[lo]); +- } +- lo--; +- } +- } +- +- if (read_seqretry(&bb->lock, seq)) +- goto retry; +- +- return rv; ++ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); + } + EXPORT_SYMBOL_GPL(badblocks_check); + +@@ -1479,154 +1411,7 @@ EXPORT_SYMBOL_GPL(badblocks_check); + int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) + { +- u64 *p; +- int lo, hi; +- int rv = 0; +- unsigned long flags; +- +- if (bb->shift < 0) +- /* badblocks are disabled */ +- return 1; +- +- if (bb->shift) { +- /* round the start down, and the end up */ +- sector_t next = s + sectors; +- +- s >>= bb->shift; +- next += (1<<bb->shift) - 1; +- next >>= bb->shift; +- sectors = next - s; +- } +- +- write_seqlock_irqsave(&bb->lock, flags); +- +- p = bb->page; +- lo = 0; +- hi = bb->count; +- /* Find the last range that starts at-or-before 's' */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a <= s) +- lo = mid; +- else +- hi = mid; +- } +- if (hi > lo && BB_OFFSET(p[lo]) > s) +- hi = lo; +- +- if (hi > lo) { +- /* we found a range that might merge with the start +- * of our new range +- */ +- sector_t a = BB_OFFSET(p[lo]); +- sector_t e = a + BB_LEN(p[lo]); +- int ack = BB_ACK(p[lo]); +- +- if (e >= s) { +- /* Yes, we can merge with a previous range */ +- if (s == a && s + sectors >= e) +- /* new range covers old */ +- ack = acknowledged; +- else +- ack = ack && acknowledged; +- +- if (e < s + sectors) +- e = s + sectors; +- if (e - a <= BB_MAX_LEN) { +- p[lo] = BB_MAKE(a, e-a, ack); +- s = e; +- } else { +- /* does not all fit in one range, +- * make p[lo] maximal +- */ +- if (BB_LEN(p[lo]) != BB_MAX_LEN) +- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); +- s = a + BB_MAX_LEN; +- } +- sectors = e - s; +- } +- } +- if (sectors && hi < bb->count) { +- /* 'hi' points to the first range that starts after 's'. 
+- * Maybe we can merge with the start of that range +- */ +- sector_t a = BB_OFFSET(p[hi]); +- sector_t e = a + BB_LEN(p[hi]); +- int ack = BB_ACK(p[hi]); +- +- if (a <= s + sectors) { +- /* merging is possible */ +- if (e <= s + sectors) { +- /* full overlap */ +- e = s + sectors; +- ack = acknowledged; +- } else +- ack = ack && acknowledged; +- +- a = s; +- if (e - a <= BB_MAX_LEN) { +- p[hi] = BB_MAKE(a, e-a, ack); +- s = e; +- } else { +- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); +- s = a + BB_MAX_LEN; +- } +- sectors = e - s; +- lo = hi; +- hi++; +- } +- } +- if (sectors == 0 && hi < bb->count) { +- /* we might be able to combine lo and hi */ +- /* Note: 's' is at the end of 'lo' */ +- sector_t a = BB_OFFSET(p[hi]); +- int lolen = BB_LEN(p[lo]); +- int hilen = BB_LEN(p[hi]); +- int newlen = lolen + hilen - (s - a); +- +- if (s >= a && newlen < BB_MAX_LEN) { +- /* yes, we can combine them */ +- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); +- +- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); +- memmove(p + hi, p + hi + 1, +- (bb->count - hi - 1) * 8); +- bb->count--; +- } +- } +- while (sectors) { +- /* didn't merge (it all). +- * Need to add a range just before 'hi' +- */ +- if (bb->count >= MAX_BADBLOCKS) { +- /* No room for more */ +- rv = 1; +- break; +- } else { +- int this_sectors = sectors; +- +- memmove(p + hi + 1, p + hi, +- (bb->count - hi) * 8); +- bb->count++; +- +- if (this_sectors > BB_MAX_LEN) +- this_sectors = BB_MAX_LEN; +- p[hi] = BB_MAKE(s, this_sectors, acknowledged); +- sectors -= this_sectors; +- s += this_sectors; +- } +- } +- +- bb->changed = 1; +- if (!acknowledged) +- bb->unacked_exist = 1; +- else +- badblocks_update_acked(bb); +- write_sequnlock_irqrestore(&bb->lock, flags); +- +- return rv; ++ return _badblocks_set(bb, s, sectors, acknowledged); + } + EXPORT_SYMBOL_GPL(badblocks_set); + +@@ -1646,96 +1431,7 @@ EXPORT_SYMBOL_GPL(badblocks_set); + */ + int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) + { +- u64 *p; +- int lo, hi; +- sector_t target = s + sectors; +- int rv = 0; +- +- if (bb->shift > 0) { +- /* When clearing we round the start up and the end down. +- * This should not matter as the shift should align with +- * the block size and no rounding should ever be needed. +- * However it is better the think a block is bad when it +- * isn't than to think a block is not bad when it is. +- */ +- s += (1<<bb->shift) - 1; +- s >>= bb->shift; +- target >>= bb->shift; +- sectors = target - s; +- } +- +- write_seqlock_irq(&bb->lock); +- +- p = bb->page; +- lo = 0; +- hi = bb->count; +- /* Find the last range that starts before 'target' */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a < target) +- lo = mid; +- else +- hi = mid; +- } +- if (hi > lo) { +- /* p[lo] is the last range that could overlap the +- * current range. Earlier ranges could also overlap, +- * but only this one can overlap the end of the range. 
+- */ +- if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) && +- (BB_OFFSET(p[lo]) < target)) { +- /* Partial overlap, leave the tail of this range */ +- int ack = BB_ACK(p[lo]); +- sector_t a = BB_OFFSET(p[lo]); +- sector_t end = a + BB_LEN(p[lo]); +- +- if (a < s) { +- /* we need to split this range */ +- if (bb->count >= MAX_BADBLOCKS) { +- rv = -ENOSPC; +- goto out; +- } +- memmove(p+lo+1, p+lo, (bb->count - lo) * 8); +- bb->count++; +- p[lo] = BB_MAKE(a, s-a, ack); +- lo++; +- } +- p[lo] = BB_MAKE(target, end - target, ack); +- /* there is no longer an overlap */ +- hi = lo; +- lo--; +- } +- while (lo >= 0 && +- (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) && +- (BB_OFFSET(p[lo]) < target)) { +- /* This range does overlap */ +- if (BB_OFFSET(p[lo]) < s) { +- /* Keep the early parts of this range. */ +- int ack = BB_ACK(p[lo]); +- sector_t start = BB_OFFSET(p[lo]); +- +- p[lo] = BB_MAKE(start, s - start, ack); +- /* now low doesn't overlap, so.. */ +- break; +- } +- lo--; +- } +- /* 'lo' is strictly before, 'hi' is strictly after, +- * anything between needs to be discarded +- */ +- if (hi - lo > 1) { +- memmove(p+lo+1, p+hi, (bb->count - hi) * 8); +- bb->count -= (hi - lo - 1); +- } +- } +- +- badblocks_update_acked(bb); +- bb->changed = 1; +-out: +- write_sequnlock_irq(&bb->lock); +- return rv; ++ return _badblocks_clear(bb, s, sectors); + } + EXPORT_SYMBOL_GPL(badblocks_clear); + +-- +2.31.1 + diff --git a/for-test/badblocks/v2/v2-0007-test-user-space-code-to-test-badblocks-APIs.patch b/for-test/badblocks/v4/v4-0007-test-user-space-code-to-test-badblocks-APIs.patch index 091d4d3..c354234 100644 --- a/for-test/badblocks/v2/v2-0007-test-user-space-code-to-test-badblocks-APIs.patch +++ b/for-test/badblocks/v4/v4-0007-test-user-space-code-to-test-badblocks-APIs.patch @@ -255,19 +255,19 @@ index 0000000..ca52647 + * + * When the caller of badblocks_set() wants to set a range of bad blocks, the + * setting range can be acked or unacked. And the setting range may merge, -+ * overwrite, skip the overlaypped already set range, depends on who they are ++ * overwrite, skip the overlapped already set range, depends on who they are + * overlapped or adjacent, and the acknowledgment type of the ranges. It can be + * more complicated when the setting range covers multiple already set bad block -+ * ranges, with restritctions of maximum length of each bad range and the bad ++ * ranges, with restrictions of maximum length of each bad range and the bad + * table space limitation. + * -+ * It is difficut and unnecessary to take care of all the possible situations, ++ * It is difficult and unnecessary to take care of all the possible situations, + * for setting a large range of bad blocks, we can handle it by dividing the + * large range into smaller ones when encounter overlap, max range length or + * bad table full conditions. Every time only a smaller piece of the bad range + * is handled with a limited number of conditions how it is interacted with + * possible overlapped or adjacent already set bad block ranges. Then the hard -+ * complicated problem can be much simpler to habndle in proper way. ++ * complicated problem can be much simpler to handle in proper way. 
+ * + * When setting a range of bad blocks to the bad table, the simplified situations + * to be considered are, (The already set bad blocks ranges are naming with @@ -301,12 +301,12 @@ index 0000000..ca52647 + * +-------------+ + * | S | + * +-------------+ -+ * 2.1.2) If S is uncked setting and E is acked, the setting will be dinied, and ++ * 2.1.2) If S is unacked setting and E is acked, the setting will be dinied, and + * the result is, + * +-------------+ + * | E | + * +-------------+ -+ * 2.1.3) If S is acked setting and E is unacked, range S can overwirte on E. ++ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E. + * An extra slot from the bad blocks table will be allocated for S, and head + * of E will move to end of the inserted range E. The result is, + * +--------+----+ @@ -318,12 +318,12 @@ index 0000000..ca52647 + * +-------------+ + * | S | + * +-------------+ -+ * 2.2.2) If S is uncked setting and E is acked, the setting will be dinied, and ++ * 2.2.2) If S is unacked setting and E is acked, the setting will be dinied, and + * the result is, + * +-------------+ + * | E | + * +-------------+ -+ * 2.2.3) If S is acked setting and E is unacked, range S can overwirte all of ++ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of + bad blocks range E. The result is, + * +-------------+ + * | S | @@ -378,7 +378,7 @@ index 0000000..ca52647 + * +-------------+ + * | E | + * +-------------+ -+ * 4.1.1) If range S and E have same ackknowledg value (both acked or unacked), ++ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked), + * they will be merged into one, the result is, + * +-------------+ + * | S | @@ -401,7 +401,7 @@ index 0000000..ca52647 + * +--------------+ + * | E | + * +--------------+ -+ * 4.2.1) If range S and E have same ackknowledg value (both acked or unacked), ++ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked), + * they will be merged into one, the result is, + * +--------------+ + * | S | @@ -504,7 +504,7 @@ index 0000000..ca52647 + * +-------+-------------+ E1: 1 + * | E1 | E2 | E2: 0 + * +-------+-------------+ -+ * With previosu simplified routines, after overwiting part of E2 with S, ++ * With previous simplified routines, after overwriting part of E2 with S, + * the bad blocks table should be (E3 is remaining part of E2 which is not + * overwritten by S), + * acknowledged @@ -514,7 +514,7 @@ index 0000000..ca52647 + * The above result is correct but not perfect. Range E1 and S in the bad + * blocks table are all acked, merging them into a larger one range may + * occupy less bad blocks table space and make badblocks_check() faster. -+ * Therefore in such situation, after overwiting range S, the previous range ++ * Therefore in such situation, after overwriting range S, the previous range + * E1 should be checked for possible front combination. Then the ideal + * result can be, + * +----------------+----+ acknowledged @@ -533,7 +533,7 @@ index 0000000..ca52647 + * +-------+ + * | E | + * +-------+ -+ * For the above special stiuation, when the setting range S are all handled ++ * For the above special situation, when the setting range S are all handled + * and the loop ends, an extra check is necessary for whether next already + * set range E is right after S and mergeable. 
+ * 6.2.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge @@ -543,7 +543,7 @@ index 0000000..ca52647 + * | S | + * +--------------+ + * 6.2.2) Otherwise these two ranges cannot merge, just insert the setting range -+ * S infront of the already set range E in the bad blocks table. The result ++ * S in front of the already set range E in the bad blocks table. The result + * is, + * +------+-------+ + * | S | E | @@ -567,9 +567,9 @@ index 0000000..ca52647 + * + * Clearing a bad blocks range from the bad block table has similar idea as + * setting does, but much more simpler. The only thing needs to be noticed is -+ * when the clearning range hits middle of a bad block range, the existing bad ++ * when the clearing range hits middle of a bad block range, the existing bad + * block range will split into two, and one more item should be added into the -+ * bad block table. The simplified situations to beconsidered are, (The already ++ * bad block table. The simplified situations to be considered are, (The already + * set bad blocks ranges in bad block table are naming with prefix E, and the + * clearing bad blocks range is naming with prefix C) + * diff --git a/for-test/badblocks/v5/v5-0000-cover-letter.patch b/for-test/badblocks/v5/v5-0000-cover-letter.patch new file mode 100644 index 0000000..efd498c --- /dev/null +++ b/for-test/badblocks/v5/v5-0000-cover-letter.patch @@ -0,0 +1,70 @@ +From d1f471dc0f862dfc71d3bbebc60631f83208217f Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 10 Dec 2021 15:27:33 +0800 +Subject: [PATCH v5 0/7] badblocks improvement for multiple bad block ranges + +Hi folks, + +This is the v5 effort to improve badblocks code APIs to handle multiple +ranges in bad block table. + +Comparing to previous v4 series, the changes in v5 series include, +- Typos in code comments which are pointed out by Geliang Tang and + Wols Lists. +- Drop extra local variables in helper routines which suggested by + Geliang Tang. +- Change the user space testing code with all above changes. + +There is NO in-memory or on-disk format change in the whole series, all +existing API and data structures are consistent. This series just only +improve the code algorithm to handle more corner cases, the interfaces +are same and consistency to all existing callers (md raid and nvdimm +drivers). + +The original motivation of the change is from the requirement from our +customer, that current badblocks routines don't handle multiple ranges. +For example if the bad block setting range covers multiple ranges from +bad block table, only the first two bad block ranges merged and rested +ranges are intact. The expected behavior should be all the covered +ranges to be handled. + +All the patches are tested by modified user space code and the code +logic works as expected. The modified user space testing code is +provided in the last patch. The testing code is an example how the +improved code is tested. + +The whole change is divided into 6 patches to make the code review more +clear and easier. If people prefer, I'd like to post a single large +patch finally after the code review accomplished. + +Please review the code and response. Thank you all in advance. 
+ +Coly Li + +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Richard Fan <richard.fan@suse.com> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +Cc: Wols Lists <antlists@youngman.org.uk> +--- + +Coly Li (6): + badblocks: add more helper structure and routines in badblocks.h + badblocks: add helper routines for badblock ranges handling + badblocks: improve badblocks_set() for multiple ranges handling + badblocks: improve badblocks_clear() for multiple ranges handling + badblocks: improve badblocks_check() for multiple ranges handling + badblocks: switch to the improved badblock handling code +Coly Li (1): + test: user space code to test badblocks APIs + + block/badblocks.c | 1604 ++++++++++++++++++++++++++++++------- + include/linux/badblocks.h | 30 + + 2 files changed, 1339 insertions(+), 295 deletions(-) + +-- +2.31.1 + diff --git a/for-test/badblocks/v5/v5-0001-badblocks-add-more-helper-structure-and-routines-.patch b/for-test/badblocks/v5/v5-0001-badblocks-add-more-helper-structure-and-routines-.patch new file mode 100644 index 0000000..d66b0c8 --- /dev/null +++ b/for-test/badblocks/v5/v5-0001-badblocks-add-more-helper-structure-and-routines-.patch @@ -0,0 +1,91 @@ +From d5352d6d537923232aa274cc753366a7851a1f13 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 2 Dec 2021 15:29:38 +0800 +Subject: [PATCH v5 1/6] badblocks: add more helper structure and routines in + badblocks.h + +This patch adds the following helper structure and routines into +badblocks.h, +- struct badblocks_context + This structure is used in improved badblocks code for bad table + iteration. +- BB_END() + The macro to calculate end LBA of a bad range record from bad + table. +- badblocks_full() and badblocks_empty() + The inline routines to check whether bad table is full or empty. +- set_changed() and clear_changed() + The inline routines to set and clear 'changed' tag from struct + badblocks. + +These new helper structure and routines can help to make the code more +clear, they will be used in the improved badblocks code in following +patches. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + include/linux/badblocks.h | 30 ++++++++++++++++++++++++++++++ + 1 file changed, 30 insertions(+) + +diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h +index 2426276b9bd3..670f2dae692f 100644 +--- a/include/linux/badblocks.h ++++ b/include/linux/badblocks.h +@@ -15,6 +15,7 @@ + #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) + #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) + #define BB_ACK(x) (!!((x) & BB_ACK_MASK)) ++#define BB_END(x) (BB_OFFSET(x) + BB_LEN(x)) + #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) + + /* Bad block numbers are stored sorted in a single page. 
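+
+(A small user-space sketch, not part of this patch, showing the 64-bit
+encoding these macros imply. The mask literals below are written out locally
+for the illustration and are derived from BB_MAKE(): bit 63 is the ack flag,
+bits 62..9 hold the start sector, bits 8..0 hold the length minus one.)
+
+	#include <stdio.h>
+	#include <stdint.h>
+
+	typedef uint64_t u64;
+
+	#define BB_OFFSET(x)	(((x) & 0x7ffffffffffffe00ULL) >> 9)
+	#define BB_LEN(x)	(((x) & 0x00000000000001ffULL) + 1)
+	#define BB_ACK(x)	(!!((x) & 0x8000000000000000ULL))
+	#define BB_END(x)	(BB_OFFSET(x) + BB_LEN(x))
+	#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
+
+	int main(void)
+	{
+		/* 16 bad sectors starting at sector 4096, acknowledged */
+		u64 bb = BB_MAKE(4096ULL, 16, 1);
+
+		printf("offset=%llu len=%llu end=%llu ack=%d\n",
+		       (unsigned long long)BB_OFFSET(bb),
+		       (unsigned long long)BB_LEN(bb),
+		       (unsigned long long)BB_END(bb),
+		       BB_ACK(bb));
+		/* prints: offset=4096 len=16 end=4112 ack=1 */
+		return 0;
+	}
+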
+@@ -41,6 +42,12 @@ struct badblocks { + sector_t size; /* in sectors */ + }; + ++struct badblocks_context { ++ sector_t start; ++ sector_t len; ++ int ack; ++}; ++ + int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors); + int badblocks_set(struct badblocks *bb, sector_t s, int sectors, +@@ -63,4 +70,27 @@ static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb) + } + badblocks_exit(bb); + } ++ ++static inline int badblocks_full(struct badblocks *bb) ++{ ++ return (bb->count >= MAX_BADBLOCKS); ++} ++ ++static inline int badblocks_empty(struct badblocks *bb) ++{ ++ return (bb->count == 0); ++} ++ ++static inline void set_changed(struct badblocks *bb) ++{ ++ if (bb->changed != 1) ++ bb->changed = 1; ++} ++ ++static inline void clear_changed(struct badblocks *bb) ++{ ++ if (bb->changed != 0) ++ bb->changed = 0; ++} ++ + #endif +-- +2.31.1 + diff --git a/for-test/badblocks/v5/v5-0002-badblocks-add-helper-routines-for-badblock-ranges.patch b/for-test/badblocks/v5/v5-0002-badblocks-add-helper-routines-for-badblock-ranges.patch new file mode 100644 index 0000000..fc084aa --- /dev/null +++ b/for-test/badblocks/v5/v5-0002-badblocks-add-helper-routines-for-badblock-ranges.patch @@ -0,0 +1,459 @@ +From 2accaa280961524bc5eea98399906d199eea2568 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 17:16:57 +0800 +Subject: [PATCH v5 2/6] badblocks: add helper routines for badblock ranges + handling + +This patch adds several helper routines to improve badblock ranges +handling. These helper routines will be used later in the improved +version of badblocks_set()/badblocks_clear()/badblocks_check(). + +- Helpers prev_by_hint() and prev_badblocks() are used to find the bad + range from bad table which the searching range starts at or after. + +- The following helpers are to decide the relative layout between the + manipulating range and existing bad block range from bad table. + - can_merge_behind() + Return 'true' if the manipulating range can backward merge with the + bad block range. + - can_merge_front() + Return 'true' if the manipulating range can forward merge with the + bad block range. + - can_combine_front() + Return 'true' if two adjacent bad block ranges before the + manipulating range can be merged. + - overlap_front() + Return 'true' if the manipulating range exactly overlaps with the + bad block range in front of its range. + - overlap_behind() + Return 'true' if the manipulating range exactly overlaps with the + bad block range behind its range. + - can_front_overwrite() + Return 'true' if the manipulating range can forward overwrite the + bad block range in front of its range. + +- The following helpers are to add the manipulating range into the bad + block table. Different routine is called with the specific relative + layout between the manipulating range and other bad block range in the + bad block table. + - behind_merge() + Merge the manipulating range with the bad block range behind its + range, and return the number of merged length in unit of sector. + - front_merge() + Merge the manipulating range with the bad block range in front of + its range, and return the number of merged length in unit of sector. + - front_combine() + Combine the two adjacent bad block ranges before the manipulating + range into a larger one. + - front_overwrite() + Overwrite partial of whole bad block range which is in front of the + manipulating range. 
The overwrite may split existing bad block range + and generate more bad block ranges into the bad block table. + - insert_at() + Insert the manipulating range at a specific location in the bad + block table. + +All the above helpers are used in later patches to improve the bad block +ranges handling for badblocks_set()/badblocks_clear()/badblocks_check(). + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 376 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 376 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index d39056630d9c..30958cc4469f 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -16,6 +16,382 @@ + #include <linux/types.h> + #include <linux/slab.h> + ++/* ++ * Find the range starts at-or-before 's' from bad table. The search ++ * starts from index 'hint' and stops at index 'hint_end' from the bad ++ * table. ++ */ ++static int prev_by_hint(struct badblocks *bb, sector_t s, int hint) ++{ ++ int hint_end = hint + 2; ++ u64 *p = bb->page; ++ int ret = -1; ++ ++ while ((hint < hint_end) && ((hint + 1) <= bb->count) && ++ (BB_OFFSET(p[hint]) <= s)) { ++ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) { ++ ret = hint; ++ break; ++ } ++ hint++; ++ } ++ ++ return ret; ++} ++ ++/* ++ * Find the range starts at-or-before bad->start. If 'hint' is provided ++ * (hint >= 0) then search in the bad table from hint firstly. It is ++ * very probably the wanted bad range can be found from the hint index, ++ * then the unnecessary while-loop iteration can be avoided. ++ */ ++static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad, ++ int hint) ++{ ++ sector_t s = bad->start; ++ int ret = -1; ++ int lo, hi; ++ u64 *p; ++ ++ if (!bb->count) ++ goto out; ++ ++ if (hint >= 0) { ++ ret = prev_by_hint(bb, s, hint); ++ if (ret >= 0) ++ goto out; ++ } ++ ++ lo = 0; ++ hi = bb->count; ++ p = bb->page; ++ ++ while (hi - lo > 1) { ++ int mid = (lo + hi)/2; ++ sector_t a = BB_OFFSET(p[mid]); ++ ++ if (a <= s) ++ lo = mid; ++ else ++ hi = mid; ++ } ++ ++ if (BB_OFFSET(p[lo]) <= s) ++ ret = lo; ++out: ++ return ret; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be backward merged ++ * with the bad range (from the bad table) index by 'behind'. ++ */ ++static bool can_merge_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ ++ if ((s <= BB_OFFSET(p[behind])) && ++ ((s + sectors) >= BB_OFFSET(p[behind])) && ++ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) && ++ BB_ACK(p[behind]) == bad->ack) ++ return true; ++ return false; ++} ++ ++/* ++ * Do backward merge for range indicated by 'bad' and the bad range ++ * (from the bad table) indexed by 'behind'. The return value is merged ++ * sectors from bad->len. 
++ */ ++static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_OFFSET(p[behind])); ++ WARN_ON((s + sectors) < BB_OFFSET(p[behind])); ++ ++ if (s < BB_OFFSET(p[behind])) { ++ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); ++ ++ merged = min_t(sector_t, sectors, BB_OFFSET(p[behind]) - s); ++ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack); ++ } else { ++ merged = min_t(sector_t, sectors, BB_LEN(p[behind])); ++ } ++ ++ WARN_ON(merged == 0); ++ ++ return merged; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be forward ++ * merged with the bad range (from the bad table) indexed by 'prev'. ++ */ ++static bool can_merge_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ ++ if (BB_ACK(p[prev]) == bad->ack && ++ (s < BB_END(p[prev]) || ++ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN)))) ++ return true; ++ return false; ++} ++ ++/* ++ * Do forward merge for range indicated by 'bad' and the bad range ++ * (from bad table) indexed by 'prev'. The return value is sectors ++ * merged from bad->len. ++ */ ++static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_END(p[prev])); ++ ++ if (s < BB_END(p[prev])) { ++ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s); ++ } else { ++ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev])); ++ if ((prev + 1) < bb->count && ++ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) { ++ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]); ++ } ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + merged, bad->ack); ++ } ++ ++ return merged; ++} ++ ++/* ++ * 'Combine' is a special case which can_merge_front() is not able to ++ * handle: If a bad range (indexed by 'prev' from bad table) exactly ++ * starts as bad->start, and the bad range ahead of 'prev' (indexed by ++ * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and ++ * the sum of their lengths does not exceed BB_MAX_LEN limitation, then ++ * these two bad range (from bad table) can be combined. ++ * ++ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad ++ * table can be combined. ++ */ ++static bool can_combine_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if ((prev > 0) && ++ (BB_OFFSET(p[prev]) == bad->start) && ++ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) && ++ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) && ++ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev]))) ++ return true; ++ return false; ++} ++ ++/* ++ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad ++ * table) into one larger bad range, and the new range is indexed by ++ * 'prev - 1'. ++ */ ++static void front_combine(struct badblocks *bb, int prev) ++{ ++ u64 *p = bb->page; ++ ++ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]), ++ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]), ++ BB_ACK(p[prev])); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8); ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly forward ++ * overlapped with the bad range (from bad table) indexed by 'front'. 
++ * Exactly forward overlap means the bad range (from bad table) indexed ++ * by 'prev' does not cover the whole range indicated by 'bad'. ++ */ ++static bool overlap_front(struct badblocks *bb, int front, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start >= BB_OFFSET(p[front]) && ++ bad->start < BB_END(p[front])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly backward ++ * overlapped with the bad range (from bad table) indexed by 'behind'. ++ */ ++static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start < BB_OFFSET(p[behind]) && ++ (bad->start + bad->len) > BB_OFFSET(p[behind])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can overwrite the bad ++ * range (from bad table) indexed by 'prev'. ++ * ++ * The range indicated by 'bad' can overwrite the bad range indexed by ++ * 'prev' when, ++ * 1) The whole range indicated by 'bad' can cover partial or whole bad ++ * range (from bad table) indexed by 'prev'. ++ * 2) The ack value of 'bad' is larger or equal to the ack value of bad ++ * range 'prev'. ++ * ++ * If the overwriting doesn't cover the whole bad range (from bad table) ++ * indexed by 'prev', new range might be split from existing bad range, ++ * 1) The overwrite covers head or tail part of existing bad range, 1 ++ * extra bad range will be split and added into the bad table. ++ * 2) The overwrite covers middle of existing bad range, 2 extra bad ++ * ranges will be split (ahead and after the overwritten range) and ++ * added into the bad table. ++ * The number of extra split ranges of the overwriting is stored in ++ * 'extra' and returned for the caller. ++ */ ++static bool can_front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *extra) ++{ ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(!overlap_front(bb, prev, bad)); ++ ++ if (BB_ACK(p[prev]) >= bad->ack) ++ return false; ++ ++ if (BB_END(p[prev]) <= (bad->start + bad->len)) { ++ len = BB_END(p[prev]) - bad->start; ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 0; ++ else ++ *extra = 1; ++ ++ bad->len = len; ++ } else { ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 1; ++ else ++ /* ++ * prev range will be split into two, beside the overwritten ++ * one, an extra slot needed from bad table. ++ */ ++ *extra = 2; ++ } ++ ++ if ((bb->count + (*extra)) >= MAX_BADBLOCKS) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Do the overwrite from the range indicated by 'bad' to the bad range ++ * (from bad table) indexed by 'prev'. ++ * The previously called can_front_overwrite() will provide how many ++ * extra bad range(s) might be split and added into the bad table. All ++ * the splitting cases in the bad table will be handled here. 
++ */ ++static int front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int extra) ++{ ++ u64 *p = bb->page; ++ sector_t orig_end = BB_END(p[prev]); ++ int orig_ack = BB_ACK(p[prev]); ++ ++ switch (extra) { ++ case 0: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]), ++ bad->ack); ++ break; ++ case 1: ++ if (BB_OFFSET(p[prev]) == bad->start) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->len, bad->ack); ++ memmove(p + prev + 2, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start + bad->len, ++ orig_end - BB_END(p[prev]), ++ orig_ack); ++ } else { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ /* ++ * prev +2 -> prev + 1 + 1, which is for, ++ * 1) prev + 1: the slot index of the previous one ++ * 2) + 1: one more slot for extra being 1. ++ */ ++ memmove(p + prev + 2, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ } ++ break; ++ case 2: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ /* ++ * prev + 3 -> prev + 1 + 2, which is for, ++ * 1) prev + 1: the slot index of the previous one ++ * 2) + 2: two more slots for extra being 2. ++ */ ++ memmove(p + prev + 3, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]), ++ orig_end - BB_END(p[prev + 1]), ++ BB_ACK(p[prev])); ++ break; ++ default: ++ break; ++ } ++ ++ return bad->len; ++} ++ ++/* ++ * Explicitly insert a range indicated by 'bad' to the bad table, where ++ * the location is indexed by 'at'. ++ */ ++static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(badblocks_full(bb)); ++ ++ len = min_t(sector_t, bad->len, BB_MAX_LEN); ++ if (at < bb->count) ++ memmove(p + at + 1, p + at, (bb->count - at) * 8); ++ p[at] = BB_MAKE(bad->start, len, bad->ack); ++ ++ return len; ++} ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v5/v5-0003-badblocks-improve-badblocks_set-for-multiple-rang.patch b/for-test/badblocks/v5/v5-0003-badblocks-improve-badblocks_set-for-multiple-rang.patch new file mode 100644 index 0000000..d5e7ce8 --- /dev/null +++ b/for-test/badblocks/v5/v5-0003-badblocks-improve-badblocks_set-for-multiple-rang.patch @@ -0,0 +1,663 @@ +From cdb864aa796ef2e65a99561b50561c7beec8ab58 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 2 Dec 2021 15:57:50 +0800 +Subject: [PATCH v5 3/6] badblocks: improve badblocks_set() for multiple ranges + handling + +Recently I received a bug report that current badblocks code does not +properly handle multiple ranges. For example, + badblocks_set(bb, 32, 1, true); + badblocks_set(bb, 34, 1, true); + badblocks_set(bb, 36, 1, true); + badblocks_set(bb, 32, 12, true); +Then indeed badblocks_show() reports, + 32 3 + 36 1 +But the expected bad blocks table should be, + 32 12 +Obviously only the first 2 ranges are merged and badblocks_set() returns +and ignores the rest setting range. + +This behavior is improper, if the caller of badblocks_set() wants to set +a range of blocks into bad blocks table, all of the blocks in the range +should be handled even the previous part encountering failure. 
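For reference, a minimal reproducer sketch of the report above. This is not part of the patch: it assumes a kernel context where the badblocks API (badblocks_set(), badblocks_show()) is available and where bb was already initialized with badblocks_init(); the wrapper function name and the pr_info() dump are illustrative only.

/*
 * Sketch only: replay the four badblocks_set() calls from the bug report
 * and dump the resulting bad blocks table.  With the old code the dump
 * shows "32 3" and "36 1"; the expected output is a single "32 12" range.
 */
static void badblocks_merge_repro(struct badblocks *bb)
{
	char *page = (char *)__get_free_page(GFP_KERNEL);

	if (!page)
		return;

	badblocks_set(bb, 32, 1, true);
	badblocks_set(bb, 34, 1, true);
	badblocks_set(bb, 36, 1, true);
	badblocks_set(bb, 32, 12, true);

	badblocks_show(bb, page, 0);	/* unack == 0: dump every range */
	pr_info("%s", page);

	free_page((unsigned long)page);
}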
+
+The desired way to set a bad blocks range by badblocks_set() is,
+- Set as many blocks as possible from the setting range into the bad
+  blocks table.
+- Merge the bad blocks ranges and occupy as few slots as possible in
+  the bad blocks table.
+- Fast.
+
+Indeed the above proposal is complicated, especially with the following
+restrictions,
+- The setting bad blocks range can be acknowledged or not acknowledged.
+- The bad blocks table size is limited.
+- Memory allocation should be avoided.
+
+The basic idea of the patch is to categorize all possible bad blocks
+range setting combinations into a much smaller set of simpler
+conditions. Inside badblocks_set() there is an implicit loop composed
+by jumping between labels 're_insert' and 'update_sectors'. No matter
+how large the setting bad blocks range is, in every loop iteration just
+a minimized range from the head is handled by a pre-defined behavior
+from one of the categorized conditions. The logic is simple and the
+code flow is manageable.
+
+The different relative layouts between the setting range and existing
+bad block ranges are checked and handled (merge, combine, overwrite,
+insert) by the helpers from the previous patch. This patch makes all
+the helpers work together with the above idea.
+
+This patch only contains the algorithm improvement for badblocks_set().
+The following patches contain the improvements for badblocks_clear()
+and badblocks_check(). But the algorithm in badblocks_set() is
+fundamental and typical, the improvements in the clear and check
+routines are based on all the helpers and ideas in this patch.
+
+In order to make the change clearer for code review, this patch does
+not directly modify the existing badblocks_set(), and just adds a new
+routine named _badblocks_set(). A later patch will remove the current
+badblocks_set() code and turn it into a wrapper of _badblocks_set(), so
+the newly added code is not mixed with the deleted code and the review
+can be easier.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Geliang Tang <geliang.tang@suse.com>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Vishal L Verma <vishal.l.verma@intel.com>
+Cc: Wols Lists <antlists@youngman.org.uk>
+---
+ block/badblocks.c | 560 ++++++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 540 insertions(+), 20 deletions(-)
+
+diff --git a/block/badblocks.c b/block/badblocks.c
+index 30958cc4469f..f45f82646bb7 100644
+--- a/block/badblocks.c
++++ b/block/badblocks.c
+@@ -16,6 +16,322 @@
+ #include <linux/types.h>
+ #include <linux/slab.h>
+
++/*
++ * The purpose of badblocks set/clear is to manage bad blocks ranges which are
++ * identified by LBA addresses.
++ *
++ * When the caller of badblocks_set() wants to set a range of bad blocks, the
++ * setting range can be acked or unacked. The setting range may merge with,
++ * overwrite, or skip the overlapped already set range, depending on how the
++ * ranges overlap or are adjacent, and on the acknowledgment type of the
++ * ranges. It can be more complicated when the setting range covers multiple
++ * already set bad block ranges, with the restrictions of maximum length of
++ * each bad range and the bad table space limitation.
++ *
++ * It is difficult and unnecessary to take care of all the possible situations.
++ * For setting a large range of bad blocks, we can handle it by dividing the
++ * large range into smaller ones when encountering overlap, max range length or
++ * bad table full conditions.
Every time only a smaller piece of the bad range ++ * is handled with a limited number of conditions how it is interacted with ++ * possible overlapped or adjacent already set bad block ranges. Then the hard ++ * complicated problem can be much simpler to handle in proper way. ++ * ++ * When setting a range of bad blocks to the bad table, the simplified situations ++ * to be considered are, (The already set bad blocks ranges are naming with ++ * prefix E, and the setting bad blocks range is naming with prefix S) ++ * ++ * 1) A setting range is not overlapped or adjacent to any other already set bad ++ * block range. ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ +-------------+ ++ * | E1 | | E2 | ++ * +-------------+ +-------------+ ++ * For this situation if the bad blocks table is not full, just allocate a ++ * free slot from the bad blocks table to mark the setting range S. The ++ * result is, ++ * +-------------+ +--------+ +-------------+ ++ * | E1 | | S | | E2 | ++ * +-------------+ +--------+ +-------------+ ++ * 2) A setting range starts exactly at a start LBA of an already set bad blocks ++ * range. ++ * 2.1) The setting range size < already set range size ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E. ++ * An extra slot from the bad blocks table will be allocated for S, and head ++ * of E will move to end of the inserted range S. The result is, ++ * +--------+----+ ++ * | S | E | ++ * +--------+----+ ++ * 2.2) The setting range size == already set range size ++ * 2.2.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of ++ bad blocks range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.3) The setting range size > already set range size ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For such situation, the setting range S can be treated as two parts, the ++ * first part (S1) is as same size as the already set range E, the second ++ * part (S2) is the rest of setting range. ++ * +-------------+-----+ +-------------+ +-----+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +-------------+-----+ ===> +-------------+ +-----+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now we only focus on how to handle the setting range S1 and already set ++ * range E, which are already explained in 2.2), for the rest S2 it will be ++ * handled later in next loop. ++ * 3) A setting range starts before the start LBA of an already set bad blocks ++ * range. 
++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation, the setting range S can be divided into two parts, the ++ * first (S1) ends at the start LBA of already set range E, the second part ++ * (S2) starts exactly at a start LBA of the already set range E. ++ * +----+---------+ +----+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +----+---------+ ===> +----+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now only the first part S1 should be handled in this loop, which is in ++ * similar condition as 1). The rest part S2 has exact same start LBA address ++ * of the already set range E, they will be handled in next loop in one of ++ * situations in 2). ++ * 4) A setting range starts after the start LBA of an already set bad blocks ++ * range. ++ * 4.1) If the setting range S exactly matches the tail part of already set bad ++ * blocks range E, like the following chart shows, ++ * +---------+ ++ * | S | ++ * +---------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 4.1.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may ++ * overwrite the overlapped range of E, the result is, ++ * +---+---------+ ++ * | E | S | ++ * +---+---------+ ++ * 4.2) If the setting range S stays in middle of an already set range E, like ++ * the following chart shows, ++ * +----+ ++ * | S | ++ * +----+ ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 4.2.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is also, ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will ++ * inserted into middle of E and split previous range E into twp parts (E1 ++ * and E2), the result is, ++ * +----+----+----+ ++ * | E1 | S | E2 | ++ * +----+----+----+ ++ * 4.3) If the setting bad blocks range S is overlapped with an already set bad ++ * blocks range E. The range S starts after the start LBA of range E, and ++ * ends after the end LBA of range E, as the following chart shows, ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation the range S can be divided into two parts, the first ++ * part (S1) ends at end range E, and the second part (S2) has rest range of ++ * origin S. ++ * +---------+---------+ +---------+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +---------+---------+ ===> +---------+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now in this loop the setting range S1 and already set range E can be ++ * handled as the situations 4), the rest range S2 will be handled in next ++ * loop and ignored in this loop. 
++ * 5) A setting bad blocks range S is adjacent to one or more already set bad ++ * blocks range(s), and they are all acked or unacked range. ++ * 5.1) Front merge: If the already set bad blocks range E is before setting ++ * range S and they are adjacent, ++ * +------+ ++ * | S | ++ * +------+ ++ * +-------+ ++ * | E | ++ * +-------+ ++ * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge ++ * values are same, the setting range S can front merges into range E. The ++ * result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting ++ * range S right after already set range E into the bad blocks table. The ++ * result is, ++ * +--------+------+ ++ * | E | S | ++ * +--------+------+ ++ * 6) Special cases which above conditions cannot handle ++ * 6.1) Multiple already set ranges may merge into less ones in a full bad table ++ * +-------------------------------------------------------+ ++ * | S | ++ * +-------------------------------------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+ +-----+ +-----+ ++ * | E1 | | E2 | | E3 | ++ * +-----+ +-----+ +-----+ ++ * In the above example, when the bad blocks table is full, inserting the ++ * first part of setting range S will fail because no more available slot ++ * can be allocated from bad blocks table. In this situation a proper ++ * setting method should be go though all the setting bad blocks range and ++ * look for chance to merge already set ranges into less ones. When there ++ * is available slot from bad blocks table, re-try again to handle more ++ * setting bad blocks ranges as many as possible. ++ * +------------------------+ ++ * | S3 | ++ * +------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+-----+-----+---+-----+--+ ++ * | S1 | S2 | ++ * +-----+-----+-----+---+-----+--+ ++ * The above chart shows although the first part (S3) cannot be inserted due ++ * to no-space in bad blocks table, but the following E1, E2 and E3 ranges ++ * can be merged with rest part of S into less range S1 and S2. Now there is ++ * 1 free slot in bad blocks table. ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * | S3 | S1 | S2 | ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * Since the bad blocks table is not full anymore, re-try again for the ++ * origin setting range S. Now the setting range S3 can be inserted into the ++ * bad blocks table with previous freed slot from multiple ranges merge. ++ * 6.2) Front merge after overwrite ++ * In the following example, in bad blocks table, E1 is an acked bad blocks ++ * range and E2 is an unacked bad blocks range, therefore they are not able ++ * to merge into a larger range. The setting bad blocks range S is acked, ++ * therefore part of E2 can be overwritten by S. ++ * +--------+ ++ * | S | acknowledged ++ * +--------+ S: 1 ++ * +-------+-------------+ E1: 1 ++ * | E1 | E2 | E2: 0 ++ * +-------+-------------+ ++ * With previous simplified routines, after overwriting part of E2 with S, ++ * the bad blocks table should be (E3 is remaining part of E2 which is not ++ * overwritten by S), ++ * acknowledged ++ * +-------+--------+----+ S: 1 ++ * | E1 | S | E3 | E1: 1 ++ * +-------+--------+----+ E3: 0 ++ * The above result is correct but not perfect. Range E1 and S in the bad ++ * blocks table are all acked, merging them into a larger one range may ++ * occupy less bad blocks table space and make badblocks_check() faster. 
++ * Therefore in such situation, after overwriting range S, the previous range ++ * E1 should be checked for possible front combination. Then the ideal ++ * result can be, ++ * +----------------+----+ acknowledged ++ * | E1 | E3 | E1: 1 ++ * +----------------+----+ E3: 0 ++ * 6.3) Behind merge: If the already set bad blocks range E is behind the setting ++ * range S and they are adjacent. Normally we don't need to care about this ++ * because front merge handles this while going though range S from head to ++ * tail, except for the tail part of range S. When the setting range S are ++ * fully handled, all the above simplified routine doesn't check whether the ++ * tail LBA of range S is adjacent to the next already set range and not able ++ * to them if they are mergeable. ++ * +------+ ++ * | S | ++ * +------+ ++ * +-------+ ++ * | E | ++ * +-------+ ++ * For the above special situation, when the setting range S are all handled ++ * and the loop ends, an extra check is necessary for whether next already ++ * set range E is right after S and mergeable. ++ * 6.2.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge ++ * values are same, the setting range S can behind merges into range E. The ++ * result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 6.2.2) Otherwise these two ranges cannot merge, just insert the setting range ++ * S in front of the already set range E in the bad blocks table. The result ++ * is, ++ * +------+-------+ ++ * | S | E | ++ * +------+-------+ ++ * ++ * All the above 5 simplified situations and 3 special cases may cover 99%+ of ++ * the bad block range setting conditions. Maybe there is some rare corner case ++ * is not considered and optimized, it won't hurt if badblocks_set() fails due ++ * to no space, or some ranges are not merged to save bad blocks table space. ++ * ++ * Inside badblocks_set() each loop starts by jumping to re_insert label, every ++ * time for the new loop prev_badblocks() is called to find an already set range ++ * which starts before or at current setting range. Since the setting bad blocks ++ * range is handled from head to tail, most of the cases it is unnecessary to do ++ * the binary search inside prev_badblocks(), it is possible to provide a hint ++ * to prev_badblocks() for a fast path, then the expensive binary search can be ++ * avoided. In my test with the hint to prev_badblocks(), except for the first ++ * loop, all rested calls to prev_badblocks() can go into the fast path and ++ * return correct bad blocks table index immediately. ++ */ ++ + /* + * Find the range starts at-or-before 's' from bad table. 
The search + * starts from index 'hint' and stops at index 'hint_end' from the bad +@@ -392,6 +708,230 @@ static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad + return len; + } + ++static void badblocks_update_acked(struct badblocks *bb) ++{ ++ bool unacked = false; ++ u64 *p = bb->page; ++ int i; ++ ++ if (!bb->unacked_exist) ++ return; ++ ++ for (i = 0; i < bb->count ; i++) { ++ if (!BB_ACK(p[i])) { ++ unacked = true; ++ break; ++ } ++ } ++ ++ if (!unacked) ++ bb->unacked_exist = 0; ++} ++ ++/* Do exact work to set bad block range into the bad block table */ ++static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, ++ int acknowledged) ++{ ++ int retried = 0, space_desired = 0; ++ int orig_len, len = 0, added = 0; ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ sector_t orig_start; ++ unsigned long flags; ++ int rv = 0; ++ u64 *p; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ /* round the start down, and the end up */ ++ sector_t next = s + sectors; ++ ++ rounddown(s, bb->shift); ++ roundup(next, bb->shift); ++ sectors = next - s; ++ } ++ ++ write_seqlock_irqsave(&bb->lock, flags); ++ ++ orig_start = s; ++ orig_len = sectors; ++ bad.ack = acknowledged; ++ p = bb->page; ++ ++re_insert: ++ bad.start = s; ++ bad.len = sectors; ++ len = 0; ++ ++ if (badblocks_empty(bb)) { ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start before all badblocks */ ++ if (prev < 0) { ++ if (!badblocks_full(bb)) { ++ /* insert on the first */ ++ if (bad.len > (BB_OFFSET(p[0]) - bad.start)) ++ bad.len = BB_OFFSET(p[0]) - bad.start; ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* No sapce, try to merge */ ++ if (overlap_behind(bb, &bad, 0)) { ++ if (can_merge_behind(bb, &bad, 0)) { ++ len = behind_merge(bb, &bad, 0); ++ added++; ++ } else { ++ len = min_t(sector_t, ++ BB_OFFSET(p[0]) - s, sectors); ++ space_desired = 1; ++ } ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* no table space and give up */ ++ goto out; ++ } ++ ++ /* in case p[prev-1] can be merged with p[prev] */ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ } else { ++ int extra = 0; ++ ++ if (!can_front_overwrite(bb, prev, &bad, &extra)) { ++ len = min_t(sector_t, ++ BB_END(p[prev]) - s, sectors); ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ len = front_overwrite(bb, prev, &bad, extra); ++ added++; ++ bb->count += extra; ++ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ } ++ } ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ /* if no space in table, still try to merge in the covered range */ ++ if (badblocks_full(bb)) { ++ /* skip the cannot-merge range */ ++ if (((prev + 1) < bb->count) && ++ overlap_behind(bb, &bad, prev + 1) && ++ ((s + sectors) >= BB_END(p[prev + 1]))) { ++ len = BB_END(p[prev + 1]) - s; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* no retry any more */ ++ 
len = sectors; ++ space_desired = 1; ++ hint = -1; ++ goto update_sectors; ++ } ++ ++ /* cannot merge and there is space in bad table */ ++ if ((prev + 1) < bb->count && ++ overlap_behind(bb, &bad, prev + 1)) ++ bad.len = min_t(sector_t, ++ bad.len, BB_OFFSET(p[prev + 1]) - bad.start); ++ ++ len = insert_at(bb, prev + 1, &bad); ++ bb->count++; ++ added++; ++ hint = prev + 1; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_insert; ++ ++ WARN_ON(sectors < 0); ++ ++ /* Check whether the following already set range can be merged */ ++ if ((prev + 1) < bb->count && ++ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && ++ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && ++ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), ++ BB_ACK(p[prev])); ++ ++ if ((prev + 2) < bb->count) ++ memmove(p + prev + 1, p + prev + 2, ++ (bb->count - (prev + 2)) * 8); ++ bb->count--; ++ } ++ ++ if (space_desired && !badblocks_full(bb)) { ++ s = orig_start; ++ sectors = orig_len; ++ space_desired = 0; ++ if (retried++ < 3) ++ goto re_insert; ++ } ++ ++out: ++ if (added) { ++ set_changed(bb); ++ ++ if (!acknowledged) ++ bb->unacked_exist = 1; ++ else ++ badblocks_update_acked(bb); ++ } ++ ++ write_sequnlock_irqrestore(&bb->lock, flags); ++ ++ if (!added) ++ rv = 1; ++ ++ return rv; ++} ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +@@ -501,26 +1041,6 @@ int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + } + EXPORT_SYMBOL_GPL(badblocks_check); + +-static void badblocks_update_acked(struct badblocks *bb) +-{ +- u64 *p = bb->page; +- int i; +- bool unacked = false; +- +- if (!bb->unacked_exist) +- return; +- +- for (i = 0; i < bb->count ; i++) { +- if (!BB_ACK(p[i])) { +- unacked = true; +- break; +- } +- } +- +- if (!unacked) +- bb->unacked_exist = 0; +-} +- + /** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v5/v5-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch b/for-test/badblocks/v5/v5-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch new file mode 100644 index 0000000..e3c38b8 --- /dev/null +++ b/for-test/badblocks/v5/v5-0004-badblocks-improve-badblocks_clear-for-multiple-ra.patch @@ -0,0 +1,399 @@ +From ea2a8ebd59b23e8c12febd3bcf5bebea24d63461 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 1 Mar 2021 22:16:10 +0800 +Subject: [PATCH v5 4/6] badblocks: improve badblocks_clear() for multiple + ranges handling + +With the fundamental ideas and helper routines from badblocks_set() +improvement, clearing bad block for multiple ranges is much simpler. + +With a similar idea from badblocks_set() improvement, this patch +simplifies bad block range clearing into 5 situations. No matter how +complicated the clearing condition is, we just look at the head part +of clearing range with relative already set bad block range from the +bad block table. The rested part will be handled in next run of the +while-loop. + +Based on existing helpers added from badblocks_set(), this patch adds +two more helpers, +- front_clear() + Clear the bad block range from bad block table which is front + overlapped with the clearing range. 
+- front_splitting_clear() + Handle the condition that the clearing range hits middle of an + already set bad block range from bad block table. + +Similar as badblocks_set(), the first part of clearing range is handled +with relative bad block range which is find by prev_badblocks(). In most +cases a valid hint is provided to prev_badblocks() to avoid unnecessary +bad block table iteration. + +This patch also explains the detail algorithm code comments at beginning +of badblocks.c, including which five simplified situations are +categrized and how all the bad block range clearing conditions are +handled by these five situations. + +Again, in order to make the code review easier and avoid the code +changes mixed together, this patch does not modify badblock_clear() and +implement another routine called _badblock_clear() for the improvement. +Later patch will delete current code of badblock_clear() and make it as +a wrapper to _badblock_clear(), so the code change can be much clear for +review. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 325 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 325 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index f45f82646bb7..3e1bb593a2bb 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -330,6 +330,123 @@ + * avoided. In my test with the hint to prev_badblocks(), except for the first + * loop, all rested calls to prev_badblocks() can go into the fast path and + * return correct bad blocks table index immediately. ++ * ++ * ++ * Clearing a bad blocks range from the bad block table has similar idea as ++ * setting does, but much more simpler. The only thing needs to be noticed is ++ * when the clearing range hits middle of a bad block range, the existing bad ++ * block range will split into two, and one more item should be added into the ++ * bad block table. The simplified situations to be considered are, (The already ++ * set bad blocks ranges in bad block table are naming with prefix E, and the ++ * clearing bad blocks range is naming with prefix C) ++ * ++ * 1) A clearing range is not overlapped to any already set ranges in bad block ++ * table. ++ * +-----+ | +-----+ | +-----+ ++ * | C | | | C | | | C | ++ * +-----+ or +-----+ or +-----+ ++ * +---+ | +----+ +----+ | +---+ ++ * | E | | | E1 | | E2 | | | E | ++ * +---+ | +----+ +----+ | +---+ ++ * For the above situations, no bad block to be cleared and no failure ++ * happens, simply returns 0. ++ * 2) The clearing range hits middle of an already setting bad blocks range in ++ * the bad block table. ++ * +---+ ++ * | C | ++ * +---+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * In this situation if the bad block table is not full, the range E will be ++ * split into two ranges E1 and E2. The result is, ++ * +------+ +------+ ++ * | E1 | | E2 | ++ * +------+ +------+ ++ * 3) The clearing range starts exactly at same LBA as an already set bad block range ++ * from the bad block table. ++ * 3.1) Partially covered at head part ++ * +------------+ ++ * | C | ++ * +------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation, the overlapped already set range will update the ++ * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). 
No ++ * item deleted from bad block table. The result is, ++ * +----+ ++ * | E1 | ++ * +----+ ++ * 3.2) Exact fully covered ++ * +-----------------+ ++ * | C | ++ * +-----------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation the whole bad blocks range E will be cleared and its ++ * corresponded item is deleted from the bad block table. ++ * 4) The clearing range exactly ends at same LBA as an already set bad block ++ * range. ++ * +-------+ ++ * | C | ++ * +-------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For the above situation, the already set range E is updated to shrink its ++ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C). ++ * The result is, ++ * +---------+ ++ * | E | ++ * +---------+ ++ * 5) The clearing range is partially overlapped with an already set bad block ++ * range from the bad block table. ++ * 5.1) The already set bad block range is front overlapped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. The ++ * first part ends at the start LBA of range E, and the second part starts at ++ * same LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part C1 can be handled as condition 1), and the second part C2 can be ++ * handled as condition 3.1) in next loop. ++ * 5.2) The already set bad block range is behind overlaopped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. The ++ * first part C1 ends at same end LBA of range E, and the second part starts ++ * at end LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part clearing range C1 can be handled as condition 4), and ++ * the second part clearing range C2 can be handled as condition 1) in next ++ * loop. ++ * ++ * All bad blocks range clearing can be simplified into the above 5 situations ++ * by only handling the head part of the clearing range in each run of the ++ * while-loop. The idea is similar to bad blocks range setting but much ++ * simpler. + */ + + /* +@@ -932,6 +1049,214 @@ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, + return rv; + } + ++/* ++ * Clear the bad block range from bad block table which is front overlapped ++ * with the clearing range. The return value is how many sectors from an ++ * already set bad block range are cleared. If the whole bad block range is ++ * covered by the clearing range and fully cleared, 'delete' is set as 1 for ++ * the caller to reduce bb->count. 
++ */ ++static int front_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *deleted) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int cleared = 0; ++ ++ *deleted = 0; ++ if (s == BB_OFFSET(p[prev])) { ++ if (BB_LEN(p[prev]) > sectors) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors, ++ BB_LEN(p[prev]) - sectors, ++ BB_ACK(p[prev])); ++ cleared = sectors; ++ } else { ++ /* BB_LEN(p[prev]) <= sectors */ ++ cleared = BB_LEN(p[prev]); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ *deleted = 1; ++ } ++ } else if (s > BB_OFFSET(p[prev])) { ++ if (BB_END(p[prev]) <= (s + sectors)) { ++ cleared = BB_END(p[prev]) - s; ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ } else { ++ /* Splitting is handled in front_splitting_clear() */ ++ BUG(); ++ } ++ } ++ ++ return cleared; ++} ++ ++/* ++ * Handle the condition that the clearing range hits middle of an already set ++ * bad block range from bad block table. In this condition the existing bad ++ * block range is split into two after the middle part is cleared. ++ */ ++static int front_splitting_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ u64 end = BB_END(p[prev]); ++ int ack = BB_ACK(p[prev]); ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ ack); ++ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack); ++ return sectors; ++} ++ ++/* Do the exact work to clear bad block range from the bad block table */ ++static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) ++{ ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ int len = 0, cleared = 0; ++ int rv = 0; ++ u64 *p; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ sector_t target; ++ ++ /* When clearing we round the start up and the end down. ++ * This should not matter as the shift should align with ++ * the block size and no rounding should ever be needed. ++ * However it is better the think a block is bad when it ++ * isn't than to think a block is not bad when it is. 
++ */ ++ target = s + sectors; ++ roundup(s, bb->shift); ++ rounddown(target, bb->shift); ++ sectors = target - s; ++ } ++ ++ write_seqlock_irq(&bb->lock); ++ ++ bad.ack = true; ++ p = bb->page; ++ ++re_clear: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* Start before all badblocks */ ++ if (prev < 0) { ++ if (overlap_behind(bb, &bad, 0)) { ++ len = BB_OFFSET(p[0]) - s; ++ hint = prev; ++ } else { ++ len = sectors; ++ } ++ /* ++ * Both situations are to clear non-bad range, ++ * should be treated as successful ++ */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Clear will split a bad record but the table is full */ ++ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + sectors))) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if ((BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + bad.len))) { ++ /* Splitting */ ++ if ((bb->count + 1) < MAX_BADBLOCKS) { ++ len = front_splitting_clear(bb, prev, &bad); ++ bb->count += 1; ++ cleared++; ++ } else { ++ /* No space to split, give up */ ++ len = sectors; ++ } ++ } else { ++ int deleted = 0; ++ ++ len = front_clear(bb, prev, &bad, &deleted); ++ bb->count -= deleted; ++ cleared++; ++ hint = prev; ++ } ++ ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Not cover any badblocks range in the table */ ++ len = sectors; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_clear; ++ ++ WARN_ON(sectors < 0); ++ ++ if (cleared) { ++ badblocks_update_acked(bb); ++ set_changed(bb); ++ } ++ ++ write_sequnlock_irq(&bb->lock); ++ ++ if (!cleared) ++ rv = 1; ++ ++ return rv; ++} ++ ++ + /** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information +-- +2.31.1 + diff --git a/for-test/badblocks/v5/v5-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch b/for-test/badblocks/v5/v5-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch new file mode 100644 index 0000000..f7ba71a --- /dev/null +++ b/for-test/badblocks/v5/v5-0005-badblocks-improve-badblocks_check-for-multiple-ra.patch @@ -0,0 +1,175 @@ +From 25e6c8d14293c3b45fcf239df7c88e05f1ee70bf Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 2 Dec 2021 16:13:35 +0800 +Subject: [PATCH v5 5/6] badblocks: improve badblocks_check() for multiple + ranges handling + +This patch rewrites badblocks_check() with similar coding style as +_badblocks_set() and _badblocks_clear(). The only difference is bad +blocks checking may handle multiple ranges in bad tables now. 
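Before walking through the multi-range example below, a minimal caller sketch may help; it shows how the return value, first_bad and bad_sectors of badblocks_check() are typically consumed. The function name, the passed-in parameters and the log messages are illustrative assumptions, not taken from the patch.

/*
 * Sketch only: typical consumption of the badblocks_check() result.
 * 0 means no known bad blocks intersect the checked range, 1 means only
 * acknowledged bad blocks intersect it, -1 means at least one
 * unacknowledged bad block intersects it.
 */
static void check_range_example(struct badblocks *bb, sector_t s, int nr_sectors)
{
	sector_t first_bad;
	int bad_sectors;

	switch (badblocks_check(bb, s, nr_sectors, &first_bad, &bad_sectors)) {
	case 0:
		/* no known bad blocks in [s, s + nr_sectors) */
		break;
	case 1:
		/* only acknowledged bad blocks overlap the range */
		pr_debug("acked bad range at %llu len %d\n",
			 (unsigned long long)first_bad, bad_sectors);
		break;
	case -1:
		/* at least one unacknowledged bad block overlaps the range */
		pr_warn("unacked bad range at %llu len %d\n",
			(unsigned long long)first_bad, bad_sectors);
		break;
	}
}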
+ +If a checking range covers multiple bad blocks range in bad block table, +like the following condition (C is the checking range, E1, E2, E3 are +three bad block ranges in bad block table), + +------------------------------------+ + | C | + +------------------------------------+ + +----+ +----+ +----+ + | E1 | | E2 | | E3 | + +----+ +----+ +----+ +The improved badblocks_check() algorithm will divide checking range C +into multiple parts, and handle them in 7 runs of a while-loop, + +--+ +----+ +----+ +----+ +----+ +----+ +----+ + |C1| | C2 | | C3 | | C4 | | C5 | | C6 | | C7 | + +--+ +----+ +----+ +----+ +----+ +----+ +----+ + +----+ +----+ +----+ + | E1 | | E2 | | E3 | + +----+ +----+ +----+ +And the start LBA and length of range E1 will be set as first_bad and +bad_sectors for the caller. + +The return value rule is consistent for multiple ranges. For example if +there are following bad block ranges in bad block table, + Index No. Start Len Ack + 0 400 20 1 + 1 500 50 1 + 2 650 20 0 +the return value, first_bad, bad_sectors by calling badblocks_set() with +different checking range can be the following values, + Checking Start, Len Return Value first_bad bad_sectors + 100, 100 0 N/A N/A + 100, 310 1 400 10 + 100, 440 1 400 10 + 100, 540 1 400 10 + 100, 600 -1 400 10 + 100, 800 -1 400 10 + +In order to make code review easier, this patch names the improved bad +block range checking routine as _badblocks_check() and does not change +existing badblock_check() code yet. Later patch will delete old code of +badblocks_check() and make it as a wrapper to call _badblocks_check(). +Then the new added code won't mess up with the old deleted code, it will +be more clear and easier for code review. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 97 insertions(+) + +diff --git a/block/badblocks.c b/block/badblocks.c +index 3e1bb593a2bb..bfade2434c74 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -1256,6 +1256,103 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) + return rv; + } + ++/* Do the exact work to check bad blocks range from the bad block table */ ++static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, ++ sector_t *first_bad, int *bad_sectors) ++{ ++ int unacked_badblocks, acked_badblocks; ++ int prev = -1, hint = -1, set = 0; ++ struct badblocks_context bad; ++ unsigned int seq; ++ int len, rv; ++ u64 *p; ++ ++ WARN_ON(bb->shift < 0 || sectors == 0); ++ ++ if (bb->shift > 0) { ++ sector_t target; ++ ++ /* round the start down, and the end up */ ++ target = s + sectors; ++ rounddown(s, bb->shift); ++ roundup(target, bb->shift); ++ sectors = target - s; ++ } ++ ++retry: ++ seq = read_seqbegin(&bb->lock); ++ ++ p = bb->page; ++ unacked_badblocks = 0; ++ acked_badblocks = 0; ++ ++re_check: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if (BB_ACK(p[prev])) ++ acked_badblocks++; ++ else ++ 
unacked_badblocks++; ++ ++ if (BB_END(p[prev]) >= (s + sectors)) ++ len = sectors; ++ else ++ len = BB_END(p[prev]) - s; ++ ++ if (set == 0) { ++ *first_bad = BB_OFFSET(p[prev]); ++ *bad_sectors = BB_LEN(p[prev]); ++ set = 1; ++ } ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* not cover any badblocks range in the table */ ++ len = sectors; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_check; ++ ++ WARN_ON(sectors < 0); ++ ++ if (unacked_badblocks > 0) ++ rv = -1; ++ else if (acked_badblocks > 0) ++ rv = 1; ++ else ++ rv = 0; ++ ++ if (read_seqretry(&bb->lock, seq)) ++ goto retry; ++ ++ return rv; ++} + + /** + * badblocks_check() - check a given range for bad sectors +-- +2.31.1 + diff --git a/for-test/badblocks/v5/v5-0006-badblocks-switch-to-the-improved-badblock-handlin.patch b/for-test/badblocks/v5/v5-0006-badblocks-switch-to-the-improved-badblock-handlin.patch new file mode 100644 index 0000000..837c7fe --- /dev/null +++ b/for-test/badblocks/v5/v5-0006-badblocks-switch-to-the-improved-badblock-handlin.patch @@ -0,0 +1,365 @@ +From d1f471dc0f862dfc71d3bbebc60631f83208217f Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 2 Mar 2021 10:48:43 +0800 +Subject: [PATCH v5 6/6] badblocks: switch to the improved badblock handling + code + +This patch removes old code of badblocks_set(), badblocks_clear() and +badblocks_check(), and make them as wrappers to call _badblocks_set(), +_badblocks_clear() and _badblocks_check(). + +By this change now the badblock handing switch to the improved algorithm +in _badblocks_set(), _badblocks_clear() and _badblocks_check(). + +This patch only contains the changes of old code deletion, new added +code for the improved algorithms are in previous patches. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + block/badblocks.c | 310 +--------------------------------------------- + 1 file changed, 3 insertions(+), 307 deletions(-) + +diff --git a/block/badblocks.c b/block/badblocks.c +index bfade2434c74..78f2af9295e6 100644 +--- a/block/badblocks.c ++++ b/block/badblocks.c +@@ -1391,75 +1391,7 @@ static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, + int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) + { +- int hi; +- int lo; +- u64 *p = bb->page; +- int rv; +- sector_t target = s + sectors; +- unsigned seq; +- +- if (bb->shift > 0) { +- /* round the start down, and the end up */ +- s >>= bb->shift; +- target += (1<<bb->shift) - 1; +- target >>= bb->shift; +- sectors = target - s; +- } +- /* 'target' is now the first block after the bad range */ +- +-retry: +- seq = read_seqbegin(&bb->lock); +- lo = 0; +- rv = 0; +- hi = bb->count; +- +- /* Binary search between lo and hi for 'target' +- * i.e. for the last range that starts before 'target' +- */ +- /* INVARIANT: ranges before 'lo' and at-or-after 'hi' +- * are known not to be the last range before target. 
+- * VARIANT: hi-lo is the number of possible +- * ranges, and decreases until it reaches 1 +- */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a < target) +- /* This could still be the one, earlier ranges +- * could not. +- */ +- lo = mid; +- else +- /* This and later ranges are definitely out. */ +- hi = mid; +- } +- /* 'lo' might be the last that started before target, but 'hi' isn't */ +- if (hi > lo) { +- /* need to check all range that end after 's' to see if +- * any are unacknowledged. +- */ +- while (lo >= 0 && +- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { +- if (BB_OFFSET(p[lo]) < target) { +- /* starts before the end, and finishes after +- * the start, so they must overlap +- */ +- if (rv != -1 && BB_ACK(p[lo])) +- rv = 1; +- else +- rv = -1; +- *first_bad = BB_OFFSET(p[lo]); +- *bad_sectors = BB_LEN(p[lo]); +- } +- lo--; +- } +- } +- +- if (read_seqretry(&bb->lock, seq)) +- goto retry; +- +- return rv; ++ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); + } + EXPORT_SYMBOL_GPL(badblocks_check); + +@@ -1481,154 +1413,7 @@ EXPORT_SYMBOL_GPL(badblocks_check); + int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) + { +- u64 *p; +- int lo, hi; +- int rv = 0; +- unsigned long flags; +- +- if (bb->shift < 0) +- /* badblocks are disabled */ +- return 1; +- +- if (bb->shift) { +- /* round the start down, and the end up */ +- sector_t next = s + sectors; +- +- s >>= bb->shift; +- next += (1<<bb->shift) - 1; +- next >>= bb->shift; +- sectors = next - s; +- } +- +- write_seqlock_irqsave(&bb->lock, flags); +- +- p = bb->page; +- lo = 0; +- hi = bb->count; +- /* Find the last range that starts at-or-before 's' */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a <= s) +- lo = mid; +- else +- hi = mid; +- } +- if (hi > lo && BB_OFFSET(p[lo]) > s) +- hi = lo; +- +- if (hi > lo) { +- /* we found a range that might merge with the start +- * of our new range +- */ +- sector_t a = BB_OFFSET(p[lo]); +- sector_t e = a + BB_LEN(p[lo]); +- int ack = BB_ACK(p[lo]); +- +- if (e >= s) { +- /* Yes, we can merge with a previous range */ +- if (s == a && s + sectors >= e) +- /* new range covers old */ +- ack = acknowledged; +- else +- ack = ack && acknowledged; +- +- if (e < s + sectors) +- e = s + sectors; +- if (e - a <= BB_MAX_LEN) { +- p[lo] = BB_MAKE(a, e-a, ack); +- s = e; +- } else { +- /* does not all fit in one range, +- * make p[lo] maximal +- */ +- if (BB_LEN(p[lo]) != BB_MAX_LEN) +- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); +- s = a + BB_MAX_LEN; +- } +- sectors = e - s; +- } +- } +- if (sectors && hi < bb->count) { +- /* 'hi' points to the first range that starts after 's'. 
+- * Maybe we can merge with the start of that range +- */ +- sector_t a = BB_OFFSET(p[hi]); +- sector_t e = a + BB_LEN(p[hi]); +- int ack = BB_ACK(p[hi]); +- +- if (a <= s + sectors) { +- /* merging is possible */ +- if (e <= s + sectors) { +- /* full overlap */ +- e = s + sectors; +- ack = acknowledged; +- } else +- ack = ack && acknowledged; +- +- a = s; +- if (e - a <= BB_MAX_LEN) { +- p[hi] = BB_MAKE(a, e-a, ack); +- s = e; +- } else { +- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); +- s = a + BB_MAX_LEN; +- } +- sectors = e - s; +- lo = hi; +- hi++; +- } +- } +- if (sectors == 0 && hi < bb->count) { +- /* we might be able to combine lo and hi */ +- /* Note: 's' is at the end of 'lo' */ +- sector_t a = BB_OFFSET(p[hi]); +- int lolen = BB_LEN(p[lo]); +- int hilen = BB_LEN(p[hi]); +- int newlen = lolen + hilen - (s - a); +- +- if (s >= a && newlen < BB_MAX_LEN) { +- /* yes, we can combine them */ +- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); +- +- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); +- memmove(p + hi, p + hi + 1, +- (bb->count - hi - 1) * 8); +- bb->count--; +- } +- } +- while (sectors) { +- /* didn't merge (it all). +- * Need to add a range just before 'hi' +- */ +- if (bb->count >= MAX_BADBLOCKS) { +- /* No room for more */ +- rv = 1; +- break; +- } else { +- int this_sectors = sectors; +- +- memmove(p + hi + 1, p + hi, +- (bb->count - hi) * 8); +- bb->count++; +- +- if (this_sectors > BB_MAX_LEN) +- this_sectors = BB_MAX_LEN; +- p[hi] = BB_MAKE(s, this_sectors, acknowledged); +- sectors -= this_sectors; +- s += this_sectors; +- } +- } +- +- bb->changed = 1; +- if (!acknowledged) +- bb->unacked_exist = 1; +- else +- badblocks_update_acked(bb); +- write_sequnlock_irqrestore(&bb->lock, flags); +- +- return rv; ++ return _badblocks_set(bb, s, sectors, acknowledged); + } + EXPORT_SYMBOL_GPL(badblocks_set); + +@@ -1648,96 +1433,7 @@ EXPORT_SYMBOL_GPL(badblocks_set); + */ + int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) + { +- u64 *p; +- int lo, hi; +- sector_t target = s + sectors; +- int rv = 0; +- +- if (bb->shift > 0) { +- /* When clearing we round the start up and the end down. +- * This should not matter as the shift should align with +- * the block size and no rounding should ever be needed. +- * However it is better the think a block is bad when it +- * isn't than to think a block is not bad when it is. +- */ +- s += (1<<bb->shift) - 1; +- s >>= bb->shift; +- target >>= bb->shift; +- sectors = target - s; +- } +- +- write_seqlock_irq(&bb->lock); +- +- p = bb->page; +- lo = 0; +- hi = bb->count; +- /* Find the last range that starts before 'target' */ +- while (hi - lo > 1) { +- int mid = (lo + hi) / 2; +- sector_t a = BB_OFFSET(p[mid]); +- +- if (a < target) +- lo = mid; +- else +- hi = mid; +- } +- if (hi > lo) { +- /* p[lo] is the last range that could overlap the +- * current range. Earlier ranges could also overlap, +- * but only this one can overlap the end of the range. 
+- */ +- if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) && +- (BB_OFFSET(p[lo]) < target)) { +- /* Partial overlap, leave the tail of this range */ +- int ack = BB_ACK(p[lo]); +- sector_t a = BB_OFFSET(p[lo]); +- sector_t end = a + BB_LEN(p[lo]); +- +- if (a < s) { +- /* we need to split this range */ +- if (bb->count >= MAX_BADBLOCKS) { +- rv = -ENOSPC; +- goto out; +- } +- memmove(p+lo+1, p+lo, (bb->count - lo) * 8); +- bb->count++; +- p[lo] = BB_MAKE(a, s-a, ack); +- lo++; +- } +- p[lo] = BB_MAKE(target, end - target, ack); +- /* there is no longer an overlap */ +- hi = lo; +- lo--; +- } +- while (lo >= 0 && +- (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) && +- (BB_OFFSET(p[lo]) < target)) { +- /* This range does overlap */ +- if (BB_OFFSET(p[lo]) < s) { +- /* Keep the early parts of this range. */ +- int ack = BB_ACK(p[lo]); +- sector_t start = BB_OFFSET(p[lo]); +- +- p[lo] = BB_MAKE(start, s - start, ack); +- /* now low doesn't overlap, so.. */ +- break; +- } +- lo--; +- } +- /* 'lo' is strictly before, 'hi' is strictly after, +- * anything between needs to be discarded +- */ +- if (hi - lo > 1) { +- memmove(p+lo+1, p+hi, (bb->count - hi) * 8); +- bb->count -= (hi - lo - 1); +- } +- } +- +- badblocks_update_acked(bb); +- bb->changed = 1; +-out: +- write_sequnlock_irq(&bb->lock); +- return rv; ++ return _badblocks_clear(bb, s, sectors); + } + EXPORT_SYMBOL_GPL(badblocks_clear); + +-- +2.31.1 + diff --git a/for-test/badblocks/v5/v5-0007-test-user-space-code-to-test-badblocks-APIs.patch b/for-test/badblocks/v5/v5-0007-test-user-space-code-to-test-badblocks-APIs.patch new file mode 100644 index 0000000..790b136 --- /dev/null +++ b/for-test/badblocks/v5/v5-0007-test-user-space-code-to-test-badblocks-APIs.patch @@ -0,0 +1,2303 @@ +From 249fc077edbeacb388b7aea11f1f2ce4c0a242c5 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 10 Dec 2021 14:30:26 +0800 +Subject: [PATCH v5] test: user space code to test badblocks APIs + +This is the user space test code to verifiy badblocks API, not part of +kernel patch, don't review this patch. + +Except for badblocks_show(), the rested code logic for badblocks_set(), +badblocks_clear(), badblocks_check() are identical to the kernel code. + +The basic idea of the testing code follows the following steps, +1) Generate a random bad blocks range (start offset and length), for + random set or clear operation. See write_badblocks_file() for this. +2) Call badblocks_set() or badblocks_clear() APIs, and record the state + in a log file named with seq- prefix. See write_badblocks_log() for + this. +3) Write sectors into dummy disk file for the corresponding bad blocks + range. E.g. the unacknowledged bad blocks setting writes value 1, + the acknowledged bad blocks setting writes value 2, and the clear + setting writes value 0. See _write_diskfile() for this. +4) Compare all bad blocks ranges with the dummy disk file, if the sector + from the dummy disk file has unexpected value against the correspond- + ing bad block range, stop the loop of testing and ask people to do + manual verification from the seq-* log files. verify_badblocks_file() + does the verification. + +With this testing code, most of simple conditions are verified, only the +complicated situations require manual check. + +There are 3 parameters can be modified in this test code, +- MAX_BB_TEST_TRIES + How many times of the bad blocks set/clear and verification loop, the +loop may exit earlier if verify_badblocks_file() encounters unexpected +sector value and requires manual check. 
+- MAX_SET_SIZE + The max size of random badblocks set range. A larger range may fill +up all 512 badblock slots earlier. +- MAX_CLN_SIZE + The max size of random badblocks clear range. A larger range may +prevent all 512 badblock slots from being full filled. + +Of course the testing code is not perfect, this is the try-best effort +to verify simple conditions of bad blocks setting/clearing with random +generated ranges. For complicated situations, manual check by people are +still necessary. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Geliang Tang <geliang.tang@suse.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: NeilBrown <neilb@suse.de> +Cc: Richard Fan <richard.fan@suse.com> +Cc: Vishal L Verma <vishal.l.verma@intel.com> +--- + Makefile | 4 + + badblocks.c | 2222 +++++++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 2226 insertions(+) + create mode 100644 Makefile + create mode 100644 badblocks.c + +diff --git a/Makefile b/Makefile +new file mode 100644 +index 0000000..2287363 +--- /dev/null ++++ b/Makefile +@@ -0,0 +1,4 @@ ++badblocks: badblocks.o ++ gcc -o badblocks -g3 -Wall badblocks.c ++clean: ++ rm -f badblocks badblocks.o +diff --git a/badblocks.c b/badblocks.c +new file mode 100644 +index 0000000..e5b2cd0 +--- /dev/null ++++ b/badblocks.c +@@ -0,0 +1,2222 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Bad block management ++ * ++ * - Heavily based on MD badblocks code from Neil Brown ++ * ++ * Copyright (c) 2015, Intel Corporation. ++ * ++ * Improvement for handling multiple ranges by Coly Li <colyli@suse.de> ++ */ ++ ++#define _GNU_SOURCE /* See feature_test_macros(7) */ ++#include <stdlib.h> ++#include <linux/types.h> ++#include <stdio.h> ++#include <errno.h> ++#include <string.h> ++#include <limits.h> ++#include <assert.h> ++#include <unistd.h> ++#include <sys/types.h> ++#include <sys/stat.h> ++#include <fcntl.h> ++ ++extern int errno; ++ ++#define PAGE_SIZE 4096 ++typedef unsigned long long sector_t; ++typedef unsigned long long u64; ++typedef _Bool bool; ++ ++#define BB_LEN_MASK (0x00000000000001FFULL) ++#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) ++#define BB_ACK_MASK (0x8000000000000000ULL) ++#define BB_MAX_LEN 512 ++#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) ++#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) ++#define BB_END(x) (BB_OFFSET(x) + BB_LEN(x)) ++#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) ++#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) ++ ++/* Bad block numbers are stored in a single page. ++ * 64bits is used for each block or extent. ++ * 54 bits are sector number, 9 bits are extent size, ++ * 1 bit is an 'acknowledged' flag. ++ */ ++#define MAX_BADBLOCKS (PAGE_SIZE/8) ++#define GFP_KERNEL 0 ++#define true 1 ++#define false 0 ++ ++#define WARN_ON(condition) ({ \ ++ if (!!(condition)) \ ++ printf("warning on %s:%d\n", __func__, __LINE__); \ ++}) ++ ++#define BUG() ({printf("BUG on %s:%d\n", __func__, __LINE__); exit(1);}) ++ ++struct device { ++ int val; ++}; ++ ++struct badblocks { ++ struct device *dev; ++ int count; /* count of bad blocks */ ++ int unacked_exist; /* there probably are unacknowledged ++ * bad blocks. 
This is only cleared ++ * when a read discovers none ++ */ ++ int shift; /* shift from sectors to block size ++ * a -ve shift means badblocks are ++ * disabled.*/ ++ u64 *page; /* badblock list */ ++ int changed; ++ unsigned long lock; ++ sector_t sector; ++ sector_t size; /* in sectors */ ++}; ++ ++struct badblocks_context { ++ sector_t start; ++ sector_t len; ++ sector_t orig_start; ++ sector_t orig_len; ++ int ack; ++ int first_prev; ++}; ++ ++int badblocks_check(struct badblocks *bb, sector_t s, int sectors, ++ sector_t *first_bad, int *bad_sectors); ++int badblocks_set(struct badblocks *bb, sector_t s, int sectors, ++ int acknowledged); ++int badblocks_clear(struct badblocks *bb, sector_t s, int sectors); ++void ack_all_badblocks(struct badblocks *bb); ++ssize_t badblocks_show(struct badblocks *bb, int unack); ++ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, ++ int unack); ++int badblocks_init(struct badblocks *bb, int enable); ++void badblocks_exit(struct badblocks *bb); ++ ++static inline void* kzalloc(int size, int flag) ++{ ++ void * p = malloc(size); ++ memset(p, 0, size); ++ return p; ++} ++ ++static inline void kfree(void* page) ++{ ++ free(page); ++} ++ ++#define roundup(x, y) ( \ ++{ \ ++ typeof(y) __y = y; \ ++ (((x) + (__y - 1)) / __y) * __y; \ ++} \ ++) ++ ++#define rounddown(x, y) ( \ ++{ \ ++ typeof(x) __x = (x); \ ++ __x - (__x % (y)); \ ++} \ ++) ++ ++#define fallthrough do{}while(0) ++ ++/** ++ * min - return minimum of two values of the same or compatible types ++ * @x: first value ++ * @y: second value ++ */ ++#define min(x, y) ((x) < (y) ? (x) : (y)) ++#define min_t(t, x, y) ((x) < (y) ? (x) : (y)) ++ ++#define write_seqlock_irqsave(_lock, _flags) ((_flags) = *(_lock)) ++#define write_sequnlock_irqrestore(_lock, _flags) ((*(_lock)) = (_flags)) ++#define write_seqlock_irq(lock) do{}while(0) ++#define write_sequnlock_irq(lock) do{}while(0) ++#define read_seqbegin(lock) 1 ++#define read_seqretry(lock, seq) (!!((seq) && 0)) ++#define seqlock_init(lock) do{}while(0) ++#define EXPORT_SYMBOL_GPL(sym) ++ ++static void *devm_kzalloc(struct device *dev, int size, int flags) ++{ ++ void * buf = malloc(size); ++ if (buf) ++ memset(buf, 0, size); ++ return buf; ++} ++ ++static void devm_kfree(struct device *dev, void *mem) ++{ ++ free(mem); ++} ++ ++static inline int badblocks_full(struct badblocks *bb) ++{ ++ return (bb->count >= MAX_BADBLOCKS); ++} ++ ++static inline int badblocks_empty(struct badblocks *bb) ++{ ++ return (bb->count == 0); ++} ++ ++static inline void set_changed(struct badblocks *bb) ++{ ++ if (bb->changed != 1) ++ bb->changed = 1; ++} ++ ++/* ++ * The purpose of badblocks set/clear is to manage bad blocks ranges which are ++ * identified by LBA addresses. ++ * ++ * When the caller of badblocks_set() wants to set a range of bad blocks, the ++ * setting range can be acked or unacked. And the setting range may merge, ++ * overwrite, skip the overlapped already set range, depends on who they are ++ * overlapped or adjacent, and the acknowledgment type of the ranges. It can be ++ * more complicated when the setting range covers multiple already set bad block ++ * ranges, with restrictions of maximum length of each bad range and the bad ++ * table space limitation. 
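As an aside (not part of the patch): the 64-bit entry layout described above is easy to check in isolation. The sketch below simply reuses the BB_* macros quoted earlier in this file to pack and decode one entry; the offset/length values are made up for illustration, and note that the 9-bit length field stores length-1, so 0..511 encodes 1..512 sectors.

/* Illustrative only: pack and decode a single bad-table entry. */
#include <stdio.h>

typedef unsigned long long u64;

#define BB_LEN_MASK	(0x00000000000001FFULL)
#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
#define BB_ACK_MASK	(0x8000000000000000ULL)
#define BB_MAX_LEN	512
#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
#define BB_END(x)	(BB_OFFSET(x) + BB_LEN(x))
#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))

int main(void)
{
	/* Example values: an acknowledged 512-sector bad range at sector 4096. */
	u64 e = BB_MAKE(4096ULL, BB_MAX_LEN, 1);

	/* Prints: offset=4096 len=512 end=4608 ack=1 */
	printf("offset=%llu len=%llu end=%llu ack=%d\n",
	       BB_OFFSET(e), (u64)BB_LEN(e), BB_END(e), BB_ACK(e));
	return 0;
}
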
++ * ++ * It is difficult and unnecessary to take care of all the possible situations, ++ * for setting a large range of bad blocks, we can handle it by dividing the ++ * large range into smaller ones when encounter overlap, max range length or ++ * bad table full conditions. Every time only a smaller piece of the bad range ++ * is handled with a limited number of conditions how it is interacted with ++ * possible overlapped or adjacent already set bad block ranges. Then the hard ++ * complicated problem can be much simpler to handle in proper way. ++ * ++ * When setting a range of bad blocks to the bad table, the simplified situations ++ * to be considered are, (The already set bad blocks ranges are naming with ++ * prefix E, and the setting bad blocks range is naming with prefix S) ++ * ++ * 1) A setting range is not overlapped or adjacent to any other already set bad ++ * block range. ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ +-------------+ ++ * | E1 | | E2 | ++ * +-------------+ +-------------+ ++ * For this situation if the bad blocks table is not full, just allocate a ++ * free slot from the bad blocks table to mark the setting range S. The ++ * result is, ++ * +-------------+ +--------+ +-------------+ ++ * | E1 | | S | | E2 | ++ * +-------------+ +--------+ +-------------+ ++ * 2) A setting range starts exactly at a start LBA of an already set bad blocks ++ * range. ++ * 2.1) The setting range size < already set range size ++ * +--------+ ++ * | S | ++ * +--------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E. ++ * An extra slot from the bad blocks table will be allocated for S, and head ++ * of E will move to end of the inserted range S. The result is, ++ * +--------+----+ ++ * | S | E | ++ * +--------+----+ ++ * 2.2) The setting range size == already set range size ++ * 2.2.1) If S and E are both acked or unacked range, the setting range S can ++ * be merged into existing bad range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and ++ * the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of ++ bad blocks range E. The result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 2.3) The setting range size > already set range size ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For such situation, the setting range S can be treated as two parts, the ++ * first part (S1) is as same size as the already set range E, the second ++ * part (S2) is the rest of setting range. 
++ * +-------------+-----+ +-------------+ +-----+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +-------------+-----+ ===> +-------------+ +-----+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now we only focus on how to handle the setting range S1 and already set ++ * range E, which are already explained in 2.2), for the rest S2 it will be ++ * handled later in next loop. ++ * 3) A setting range starts before the start LBA of an already set bad blocks ++ * range. ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation, the setting range S can be divided into two parts, the ++ * first (S1) ends at the start LBA of already set range E, the second part ++ * (S2) starts exactly at a start LBA of the already set range E. ++ * +----+---------+ +----+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +----+---------+ ===> +----+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now only the first part S1 should be handled in this loop, which is in ++ * similar condition as 1). The rest part S2 has exact same start LBA address ++ * of the already set range E, they will be handled in next loop in one of ++ * situations in 2). ++ * 4) A setting range starts after the start LBA of an already set bad blocks ++ * range. ++ * 4.1) If the setting range S exactly matches the tail part of already set bad ++ * blocks range E, like the following chart shows, ++ * +---------+ ++ * | S | ++ * +---------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +-------------+ ++ * | S | ++ * +-------------+ ++ * 4.1.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is, ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may ++ * overwrite the overlapped range of E, the result is, ++ * +---+---------+ ++ * | E | S | ++ * +---+---------+ ++ * 4.2) If the setting range S stays in middle of an already set range E, like ++ * the following chart shows, ++ * +----+ ++ * | S | ++ * +----+ ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked), ++ * they will be merged into one, the result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 4.2.2) If range E is acked and the setting range S is unacked, the setting ++ * request of S will be rejected, the result is also, ++ * +--------------+ ++ * | E | ++ * +--------------+ ++ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will ++ * inserted into middle of E and split previous range E into twp parts (E1 ++ * and E2), the result is, ++ * +----+----+----+ ++ * | E1 | S | E2 | ++ * +----+----+----+ ++ * 4.3) If the setting bad blocks range S is overlapped with an already set bad ++ * blocks range E. The range S starts after the start LBA of range E, and ++ * ends after the end LBA of range E, as the following chart shows, ++ * +-------------------+ ++ * | S | ++ * +-------------------+ ++ * +-------------+ ++ * | E | ++ * +-------------+ ++ * For this situation the range S can be divided into two parts, the first ++ * part (S1) ends at end range E, and the second part (S2) has rest range of ++ * origin S. 
++ * +---------+---------+ +---------+ +---------+ ++ * | S1 | S2 | | S1 | | S2 | ++ * +---------+---------+ ===> +---------+ +---------+ ++ * +-------------+ +-------------+ ++ * | E | | E | ++ * +-------------+ +-------------+ ++ * Now in this loop the setting range S1 and already set range E can be ++ * handled as the situations 4), the rest range S2 will be handled in next ++ * loop and ignored in this loop. ++ * 5) A setting bad blocks range S is adjacent to one or more already set bad ++ * blocks range(s), and they are all acked or unacked range. ++ * 5.1) Front merge: If the already set bad blocks range E is before setting ++ * range S and they are adjacent, ++ * +------+ ++ * | S | ++ * +------+ ++ * +-------+ ++ * | E | ++ * +-------+ ++ * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge ++ * values are same, the setting range S can front merges into range E. The ++ * result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting ++ * range S right after already set range E into the bad blocks table. The ++ * result is, ++ * +--------+------+ ++ * | E | S | ++ * +--------+------+ ++ * 6) Special cases which above conditions cannot handle ++ * 6.1) Multiple already set ranges may merge into less ones in a full bad table ++ * +-------------------------------------------------------+ ++ * | S | ++ * +-------------------------------------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+ +-----+ +-----+ ++ * | E1 | | E2 | | E3 | ++ * +-----+ +-----+ +-----+ ++ * In the above example, when the bad blocks table is full, inserting the ++ * first part of setting range S will fail because no more available slot ++ * can be allocated from bad blocks table. In this situation a proper ++ * setting method should be go though all the setting bad blocks range and ++ * look for chance to merge already set ranges into less ones. When there ++ * is available slot from bad blocks table, re-try again to handle more ++ * setting bad blocks ranges as many as possible. ++ * +------------------------+ ++ * | S3 | ++ * +------------------------+ ++ * |<----- BB_MAX_LEN ----->| ++ * +-----+-----+-----+---+-----+--+ ++ * | S1 | S2 | ++ * +-----+-----+-----+---+-----+--+ ++ * The above chart shows although the first part (S3) cannot be inserted due ++ * to no-space in bad blocks table, but the following E1, E2 and E3 ranges ++ * can be merged with rest part of S into less range S1 and S2. Now there is ++ * 1 free slot in bad blocks table. ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * | S3 | S1 | S2 | ++ * +------------------------+-----+-----+-----+---+-----+--+ ++ * Since the bad blocks table is not full anymore, re-try again for the ++ * origin setting range S. Now the setting range S3 can be inserted into the ++ * bad blocks table with previous freed slot from multiple ranges merge. ++ * 6.2) Front merge after overwrite ++ * In the following example, in bad blocks table, E1 is an acked bad blocks ++ * range and E2 is an unacked bad blocks range, therefore they are not able ++ * to merge into a larger range. The setting bad blocks range S is acked, ++ * therefore part of E2 can be overwritten by S. 
++ * +--------+ ++ * | S | acknowledged ++ * +--------+ S: 1 ++ * +-------+-------------+ E1: 1 ++ * | E1 | E2 | E2: 0 ++ * +-------+-------------+ ++ * With previous simplified routines, after overwriting part of E2 with S, ++ * the bad blocks table should be (E3 is remaining part of E2 which is not ++ * overwritten by S), ++ * acknowledged ++ * +-------+--------+----+ S: 1 ++ * | E1 | S | E3 | E1: 1 ++ * +-------+--------+----+ E3: 0 ++ * The above result is correct but not perfect. Range E1 and S in the bad ++ * blocks table are all acked, merging them into a larger one range may ++ * occupy less bad blocks table space and make badblocks_check() faster. ++ * Therefore in such situation, after overwriting range S, the previous range ++ * E1 should be checked for possible front combination. Then the ideal ++ * result can be, ++ * +----------------+----+ acknowledged ++ * | E1 | E3 | E1: 1 ++ * +----------------+----+ E3: 0 ++ * 6.3) Behind merge: If the already set bad blocks range E is behind the setting ++ * range S and they are adjacent. Normally we don't need to care about this ++ * because front merge handles this while going though range S from head to ++ * tail, except for the tail part of range S. When the setting range S are ++ * fully handled, all the above simplified routine doesn't check whether the ++ * tail LBA of range S is adjacent to the next already set range and not able ++ * to them if they are mergeable. ++ * +------+ ++ * | S | ++ * +------+ ++ * +-------+ ++ * | E | ++ * +-------+ ++ * For the above special situation, when the setting range S are all handled ++ * and the loop ends, an extra check is necessary for whether next already ++ * set range E is right after S and mergeable. ++ * 6.2.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge ++ * values are same, the setting range S can behind merges into range E. The ++ * result is, ++ * +--------------+ ++ * | S | ++ * +--------------+ ++ * 6.2.2) Otherwise these two ranges cannot merge, just insert the setting range ++ * S in front of the already set range E in the bad blocks table. The result ++ * is, ++ * +------+-------+ ++ * | S | E | ++ * +------+-------+ ++ * ++ * All the above 5 simplified situations and 3 special cases may cover 99%+ of ++ * the bad block range setting conditions. Maybe there is some rare corner case ++ * is not considered and optimized, it won't hurt if badblocks_set() fails due ++ * to no space, or some ranges are not merged to save bad blocks table space. ++ * ++ * Inside badblocks_set() each loop starts by jumping to re_insert label, every ++ * time for the new loop prev_badblocks() is called to find an already set range ++ * which starts before or at current setting range. Since the setting bad blocks ++ * range is handled from head to tail, most of the cases it is unnecessary to do ++ * the binary search inside prev_badblocks(), it is possible to provide a hint ++ * to prev_badblocks() for a fast path, then the expensive binary search can be ++ * avoided. In my test with the hint to prev_badblocks(), except for the first ++ * loop, all rested calls to prev_badblocks() can go into the fast path and ++ * return correct bad blocks table index immediately. ++ * ++ * ++ * Clearing a bad blocks range from the bad block table has similar idea as ++ * setting does, but much more simpler. 
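As an aside (not part of the patch): the paragraphs above boil down to one structural idea shared by the setting and clearing paths, namely handle only the head piece of the remaining range in each pass, advance the cursor, and loop until nothing is left. The standalone sketch below shows just that loop shape; process_head_piece() and its piece sizing are placeholders chosen for illustration, not the patch's merge/insert/overwrite logic.

#include <stdio.h>

typedef unsigned long long sector_t;

#define BB_MAX_LEN 512

/* Placeholder for the per-piece work (merge, insert, overwrite or clear). */
static sector_t process_head_piece(sector_t start, sector_t len)
{
	sector_t handled = len > BB_MAX_LEN ? BB_MAX_LEN : len;

	printf("handled piece [%llu, %llu)\n", start, start + handled);
	return handled;
}

int main(void)
{
	sector_t s = 1000, sectors = 1300;

	/* Same shape as the re_insert/re_clear loops: consume the head
	 * piece, advance, repeat until the whole range is covered. */
	while (sectors > 0) {
		sector_t len = process_head_piece(s, sectors);

		s += len;
		sectors -= len;
	}
	return 0;
}
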
The only thing needs to be noticed is ++ * when the clearing range hits middle of a bad block range, the existing bad ++ * block range will split into two, and one more item should be added into the ++ * bad block table. The simplified situations to be considered are, (The already ++ * set bad blocks ranges in bad block table are naming with prefix E, and the ++ * clearing bad blocks range is naming with prefix C) ++ * ++ * 1) A clearing range is not overlapped to any already set ranges in bad block ++ * table. ++ * +-----+ | +-----+ | +-----+ ++ * | C | | | C | | | C | ++ * +-----+ or +-----+ or +-----+ ++ * +---+ | +----+ +----+ | +---+ ++ * | E | | | E1 | | E2 | | | E | ++ * +---+ | +----+ +----+ | +---+ ++ * For the above situations, no bad block to be cleared and no failure ++ * happens, simply returns 0. ++ * 2) The clearing range hits middle of an already setting bad blocks range in ++ * the bad block table. ++ * +---+ ++ * | C | ++ * +---+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * In this situation if the bad block table is not full, the range E will be ++ * split into two ranges E1 and E2. The result is, ++ * +------+ +------+ ++ * | E1 | | E2 | ++ * +------+ +------+ ++ * 3) The clearing range starts exactly at same LBA as an already set bad block range ++ * from the bad block table. ++ * 3.1) Partially covered at head part ++ * +------------+ ++ * | C | ++ * +------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation, the overlapped already set range will update the ++ * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). No ++ * item deleted from bad block table. The result is, ++ * +----+ ++ * | E1 | ++ * +----+ ++ * 3.2) Exact fully covered ++ * +-----------------+ ++ * | C | ++ * +-----------------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For this situation the whole bad blocks range E will be cleared and its ++ * corresponded item is deleted from the bad block table. ++ * 4) The clearing range exactly ends at same LBA as an already set bad block ++ * range. ++ * +-------+ ++ * | C | ++ * +-------+ ++ * +-----------------+ ++ * | E | ++ * +-----------------+ ++ * For the above situation, the already set range E is updated to shrink its ++ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C). ++ * The result is, ++ * +---------+ ++ * | E | ++ * +---------+ ++ * 5) The clearing range is partially overlapped with an already set bad block ++ * range from the bad block table. ++ * 5.1) The already set bad block range is front overlapped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. The ++ * first part ends at the start LBA of range E, and the second part starts at ++ * same LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part C1 can be handled as condition 1), and the second part C2 can be ++ * handled as condition 3.1) in next loop. ++ * 5.2) The already set bad block range is behind overlaopped with the clearing ++ * range. ++ * +----------+ ++ * | C | ++ * +----------+ ++ * +------------+ ++ * | E | ++ * +------------+ ++ * For such situation, the clearing range C can be treated as two parts. 
The ++ * first part C1 ends at same end LBA of range E, and the second part starts ++ * at end LBA of range E. ++ * +----+-----+ +----+ +-----+ ++ * | C1 | C2 | | C1 | | C2 | ++ * +----+-----+ ===> +----+ +-----+ ++ * +------------+ +------------+ ++ * | E | | E | ++ * +------------+ +------------+ ++ * Now the first part clearing range C1 can be handled as condition 4), and ++ * the second part clearing range C2 can be handled as condition 1) in next ++ * loop. ++ * ++ * All bad blocks range clearing can be simplified into the above 5 situations ++ * by only handling the head part of the clearing range in each run of the ++ * while-loop. The idea is similar to bad blocks range setting but much ++ * simpler. ++ */ ++ ++/* ++ * Find the range starts at-or-before 's' from bad table. The search ++ * starts from index 'hint' and stops at index 'hint_end' from the bad ++ * table. ++ */ ++static int prev_by_hint(struct badblocks *bb, sector_t s, int hint) ++{ ++ int hint_end = hint + 2; ++ u64 *p = bb->page; ++ int ret = -1; ++ ++ while ((hint < hint_end) && ((hint + 1) <= bb->count) && ++ (BB_OFFSET(p[hint]) <= s)) { ++ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) { ++ ret = hint; ++ break; ++ } ++ hint++; ++ } ++ ++ return ret; ++} ++ ++/* ++ * Find the range starts at-or-before bad->start. If 'hint' is provided ++ * (hint >= 0) then search in the bad table from hint firstly. It is ++ * very probably the wanted bad range can be found from the hint index, ++ * then the unnecessary while-loop iteration can be avoided. ++ */ ++static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad, ++ int hint) ++{ ++ sector_t s = bad->start; ++ int ret = -1; ++ int lo, hi; ++ u64 *p; ++ ++ if (!bb->count) ++ goto out; ++ ++ if (hint >= 0) { ++ ret = prev_by_hint(bb, s, hint); ++ if (ret >= 0) ++ goto out; ++ } ++ ++ lo = 0; ++ hi = bb->count; ++ p = bb->page; ++ ++ while (hi - lo > 1) { ++ int mid = (lo + hi)/2; ++ sector_t a = BB_OFFSET(p[mid]); ++ ++ if (a <= s) ++ lo = mid; ++ else ++ hi = mid; ++ } ++ ++ if (BB_OFFSET(p[lo]) <= s) ++ ret = lo; ++out: ++ return ret; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be backward merged ++ * with the bad range (from the bad table) index by 'behind'. ++ */ ++static bool can_merge_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ ++ if ((s <= BB_OFFSET(p[behind])) && ++ ((s + sectors) >= BB_OFFSET(p[behind])) && ++ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) && ++ BB_ACK(p[behind]) == bad->ack) ++ return true; ++ return false; ++} ++ ++/* ++ * Do backward merge for range indicated by 'bad' and the bad range ++ * (from the bad table) indexed by 'behind'. The return value is merged ++ * sectors from bad->len. 
++ */ ++static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_OFFSET(p[behind])); ++ WARN_ON((s + sectors) < BB_OFFSET(p[behind])); ++ ++ if (s < BB_OFFSET(p[behind])) { ++ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); ++ ++ merged = min_t(sector_t, sectors, BB_OFFSET(p[behind]) - s); ++ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack); ++ } else { ++ merged = min_t(sector_t, sectors, BB_LEN(p[behind])); ++ } ++ ++ WARN_ON(merged == 0); ++ ++ return merged; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can be forward ++ * merged with the bad range (from the bad table) indexed by 'prev'. ++ */ ++static bool can_merge_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ ++ if (BB_ACK(p[prev]) == bad->ack && ++ (s < BB_END(p[prev]) || ++ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN)))) ++ return true; ++ return false; ++} ++ ++/* ++ * Do forward merge for range indicated by 'bad' and the bad range ++ * (from bad table) indexed by 'prev'. The return value is sectors ++ * merged from bad->len. ++ */ ++static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int merged = 0; ++ ++ WARN_ON(s > BB_END(p[prev])); ++ ++ if (s < BB_END(p[prev])) { ++ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s); ++ } else { ++ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev])); ++ if ((prev + 1) < bb->count && ++ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) { ++ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]); ++ } ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + merged, bad->ack); ++ } ++ ++ return merged; ++} ++ ++/* ++ * 'Combine' is a special case which can_merge_front() is not able to ++ * handle: If a bad range (indexed by 'prev' from bad table) exactly ++ * starts as bad->start, and the bad range ahead of 'prev' (indexed by ++ * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and ++ * the sum of their lengths does not exceed BB_MAX_LEN limitation, then ++ * these two bad range (from bad table) can be combined. ++ * ++ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad ++ * table can be combined. ++ */ ++static bool can_combine_front(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if ((prev > 0) && ++ (BB_OFFSET(p[prev]) == bad->start) && ++ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) && ++ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) && ++ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev]))) ++ return true; ++ return false; ++} ++ ++/* ++ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad ++ * table) into one larger bad range, and the new range is indexed by ++ * 'prev - 1'. ++ */ ++static void front_combine(struct badblocks *bb, int prev) ++{ ++ u64 *p = bb->page; ++ ++ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]), ++ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]), ++ BB_ACK(p[prev])); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8); ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly forward ++ * overlapped with the bad range (from bad table) indexed by 'front'. 
++ * Exactly forward overlap means the bad range (from bad table) indexed ++ * by 'prev' does not cover the whole range indicated by 'bad'. ++ */ ++static bool overlap_front(struct badblocks *bb, int front, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start >= BB_OFFSET(p[front]) && ++ bad->start < BB_END(p[front])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' is exactly backward ++ * overlapped with the bad range (from bad table) indexed by 'behind'. ++ */ ++static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad, ++ int behind) ++{ ++ u64 *p = bb->page; ++ ++ if (bad->start < BB_OFFSET(p[behind]) && ++ (bad->start + bad->len) > BB_OFFSET(p[behind])) ++ return true; ++ return false; ++} ++ ++/* ++ * Return 'true' if the range indicated by 'bad' can overwrite the bad ++ * range (from bad table) indexed by 'prev'. ++ * ++ * The range indicated by 'bad' can overwrite the bad range indexed by ++ * 'prev' when, ++ * 1) The whole range indicated by 'bad' can cover partial or whole bad ++ * range (from bad table) indexed by 'prev'. ++ * 2) The ack value of 'bad' is larger or equal to the ack value of bad ++ * range 'prev'. ++ * ++ * If the overwriting doesn't cover the whole bad range (from bad table) ++ * indexed by 'prev', new range might be split from existing bad range, ++ * 1) The overwrite covers head or tail part of existing bad range, 1 ++ * extra bad range will be split and added into the bad table. ++ * 2) The overwrite covers middle of existing bad range, 2 extra bad ++ * ranges will be split (ahead and after the overwritten range) and ++ * added into the bad table. ++ * The number of extra split ranges of the overwriting is stored in ++ * 'extra' and returned for the caller. ++ */ ++static bool can_front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *extra) ++{ ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(!overlap_front(bb, prev, bad)); ++ ++ if (BB_ACK(p[prev]) >= bad->ack) ++ return false; ++ ++ if (BB_END(p[prev]) <= (bad->start + bad->len)) { ++ len = BB_END(p[prev]) - bad->start; ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 0; ++ else ++ *extra = 1; ++ ++ bad->len = len; ++ } else { ++ if (BB_OFFSET(p[prev]) == bad->start) ++ *extra = 1; ++ else ++ /* ++ * prev range will be split into two, beside the overwritten ++ * one, an extra slot needed from bad table. ++ */ ++ *extra = 2; ++ } ++ ++ if ((bb->count + (*extra)) >= MAX_BADBLOCKS) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Do the overwrite from the range indicated by 'bad' to the bad range ++ * (from bad table) indexed by 'prev'. ++ * The previously called can_front_overwrite() will provide how many ++ * extra bad range(s) might be split and added into the bad table. All ++ * the splitting cases in the bad table will be handled here. 
++ */ ++static int front_overwrite(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int extra) ++{ ++ u64 *p = bb->page; ++ sector_t orig_end = BB_END(p[prev]); ++ int orig_ack = BB_ACK(p[prev]); ++ ++ switch (extra) { ++ case 0: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]), ++ bad->ack); ++ break; ++ case 1: ++ if (BB_OFFSET(p[prev]) == bad->start) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->len, bad->ack); ++ memmove(p + prev + 2, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start + bad->len, ++ orig_end - BB_END(p[prev]), ++ orig_ack); ++ } else { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ /* ++ * prev +2 -> prev + 1 + 1, which is for, ++ * 1) prev + 1: the slot index of the previous one ++ * 2) + 1: one more slot for extra being 1. ++ */ ++ memmove(p + prev + 2, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ } ++ break; ++ case 2: ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ bad->start - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ /* ++ * prev + 3 -> prev + 1 + 2, which is for, ++ * 1) prev + 1: the slot index of the previous one ++ * 2) + 2: two more slots for extra being 2. ++ */ ++ memmove(p + prev + 3, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); ++ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]), ++ orig_end - BB_END(p[prev + 1]), ++ BB_ACK(p[prev])); ++ break; ++ default: ++ break; ++ } ++ ++ return bad->len; ++} ++ ++/* ++ * Explicitly insert a range indicated by 'bad' to the bad table, where ++ * the location is indexed by 'at'. ++ */ ++static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ int len; ++ ++ WARN_ON(badblocks_full(bb)); ++ ++ len = min_t(sector_t, bad->len, BB_MAX_LEN); ++ if (at < bb->count) ++ memmove(p + at + 1, p + at, (bb->count - at) * 8); ++ p[at] = BB_MAKE(bad->start, len, bad->ack); ++ ++ return len; ++} ++ ++static void badblocks_update_acked(struct badblocks *bb) ++{ ++ bool unacked = false; ++ u64 *p = bb->page; ++ int i; ++ ++ if (!bb->unacked_exist) ++ return; ++ ++ for (i = 0; i < bb->count ; i++) { ++ if (!BB_ACK(p[i])) { ++ unacked = true; ++ break; ++ } ++ } ++ ++ if (!unacked) ++ bb->unacked_exist = 0; ++} ++ ++/* Do exact work to set bad block range into the bad block table */ ++static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, ++ int acknowledged) ++{ ++ int retried = 0, space_desired = 0; ++ int orig_len, len = 0, added = 0; ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ sector_t orig_start; ++ unsigned long flags; ++ int rv = 0; ++ u64 *p; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ /* round the start down, and the end up */ ++ sector_t next = s + sectors; ++ ++ rounddown(s, bb->shift); ++ roundup(next, bb->shift); ++ sectors = next - s; ++ } ++ ++ write_seqlock_irqsave(&bb->lock, flags); ++ ++ orig_start = s; ++ orig_len = sectors; ++ bad.ack = acknowledged; ++ p = bb->page; ++ ++re_insert: ++ bad.start = s; ++ bad.len = sectors; ++ len = 0; ++ ++ if (badblocks_empty(bb)) { ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start before all badblocks */ ++ if (prev < 0) { ++ if (!badblocks_full(bb)) { 
++ /* insert on the first */ ++ if (bad.len > (BB_OFFSET(p[0]) - bad.start)) ++ bad.len = BB_OFFSET(p[0]) - bad.start; ++ len = insert_at(bb, 0, &bad); ++ bb->count++; ++ added++; ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* No sapce, try to merge */ ++ if (overlap_behind(bb, &bad, 0)) { ++ if (can_merge_behind(bb, &bad, 0)) { ++ len = behind_merge(bb, &bad, 0); ++ added++; ++ } else { ++ len = min_t(sector_t, ++ BB_OFFSET(p[0]) - s, sectors); ++ space_desired = 1; ++ } ++ hint = 0; ++ goto update_sectors; ++ } ++ ++ /* no table space and give up */ ++ goto out; ++ } ++ ++ /* in case p[prev-1] can be merged with p[prev] */ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ } else { ++ int extra = 0; ++ ++ if (!can_front_overwrite(bb, prev, &bad, &extra)) { ++ len = min_t(sector_t, ++ BB_END(p[prev]) - s, sectors); ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ len = front_overwrite(bb, prev, &bad, extra); ++ added++; ++ bb->count += extra; ++ ++ if (can_combine_front(bb, prev, &bad)) { ++ front_combine(bb, prev); ++ bb->count--; ++ } ++ } ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ if (can_merge_front(bb, prev, &bad)) { ++ len = front_merge(bb, prev, &bad); ++ added++; ++ hint = prev; ++ goto update_sectors; ++ } ++ ++ /* if no space in table, still try to merge in the covered range */ ++ if (badblocks_full(bb)) { ++ /* skip the cannot-merge range */ ++ if (((prev + 1) < bb->count) && ++ overlap_behind(bb, &bad, prev + 1) && ++ ((s + sectors) >= BB_END(p[prev + 1]))) { ++ len = BB_END(p[prev + 1]) - s; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* no retry any more */ ++ len = sectors; ++ space_desired = 1; ++ hint = -1; ++ goto update_sectors; ++ } ++ ++ /* cannot merge and there is space in bad table */ ++ if ((prev + 1) < bb->count && ++ overlap_behind(bb, &bad, prev + 1)) ++ bad.len = min_t(sector_t, ++ bad.len, BB_OFFSET(p[prev + 1]) - bad.start); ++ ++ len = insert_at(bb, prev + 1, &bad); ++ bb->count++; ++ added++; ++ hint = prev + 1; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_insert; ++ ++ WARN_ON(sectors < 0); ++ ++ /* Check whether the following already set range can be merged */ ++ if ((prev + 1) < bb->count && ++ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && ++ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && ++ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), ++ BB_ACK(p[prev])); ++ ++ if ((prev + 2) < bb->count) ++ memmove(p + prev + 1, p + prev + 2, ++ (bb->count - (prev + 2)) * 8); ++ bb->count--; ++ } ++ ++ if (space_desired && !badblocks_full(bb)) { ++ s = orig_start; ++ sectors = orig_len; ++ space_desired = 0; ++ if (retried++ < 3) ++ goto re_insert; ++ } ++ ++out: ++ if (added) { ++ set_changed(bb); ++ ++ if (!acknowledged) ++ bb->unacked_exist = 1; ++ else ++ badblocks_update_acked(bb); ++ } ++ ++ write_sequnlock_irqrestore(&bb->lock, flags); ++ ++ if (!added) ++ rv = 1; ++ ++ return rv; ++} ++ ++/* ++ * Clear the bad block range from bad block table which is front overlapped ++ * with the clearing range. The return value is how many sectors from an ++ * already set bad block range are cleared. 
If the whole bad block range is ++ * covered by the clearing range and fully cleared, 'delete' is set as 1 for ++ * the caller to reduce bb->count. ++ */ ++static int front_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad, int *deleted) ++{ ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ u64 *p = bb->page; ++ int cleared = 0; ++ ++ *deleted = 0; ++ if (s == BB_OFFSET(p[prev])) { ++ if (BB_LEN(p[prev]) > sectors) { ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors, ++ BB_LEN(p[prev]) - sectors, ++ BB_ACK(p[prev])); ++ cleared = sectors; ++ } else { ++ /* BB_LEN(p[prev]) <= sectors */ ++ cleared = BB_LEN(p[prev]); ++ if ((prev + 1) < bb->count) ++ memmove(p + prev, p + prev + 1, ++ (bb->count - prev - 1) * 8); ++ *deleted = 1; ++ } ++ } else if (s > BB_OFFSET(p[prev])) { ++ if (BB_END(p[prev]) <= (s + sectors)) { ++ cleared = BB_END(p[prev]) - s; ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ BB_ACK(p[prev])); ++ } else { ++ /* Splitting is handled in front_splitting_clear() */ ++ BUG(); ++ } ++ } ++ ++ return cleared; ++} ++ ++/* ++ * Handle the condition that the clearing range hits middle of an already set ++ * bad block range from bad block table. In this condition the existing bad ++ * block range is split into two after the middle part is cleared. ++ */ ++static int front_splitting_clear(struct badblocks *bb, int prev, ++ struct badblocks_context *bad) ++{ ++ u64 *p = bb->page; ++ u64 end = BB_END(p[prev]); ++ int ack = BB_ACK(p[prev]); ++ sector_t sectors = bad->len; ++ sector_t s = bad->start; ++ ++ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), ++ s - BB_OFFSET(p[prev]), ++ ack); ++ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8); ++ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack); ++ return sectors; ++} ++ ++/* Do the exact work to clear bad block range from the bad block table */ ++static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) ++{ ++ struct badblocks_context bad; ++ int prev = -1, hint = -1; ++ int len = 0, cleared = 0; ++ int rv = 0; ++ u64 *p; ++ ++ if (bb->shift < 0) ++ /* badblocks are disabled */ ++ return 1; ++ ++ if (sectors == 0) ++ /* Invalid sectors number */ ++ return 1; ++ ++ if (bb->shift) { ++ sector_t target; ++ ++ /* When clearing we round the start up and the end down. ++ * This should not matter as the shift should align with ++ * the block size and no rounding should ever be needed. ++ * However it is better the think a block is bad when it ++ * isn't than to think a block is not bad when it is. 
++ */ ++ target = s + sectors; ++ roundup(s, bb->shift); ++ rounddown(target, bb->shift); ++ sectors = target - s; ++ } ++ ++ write_seqlock_irq(&bb->lock); ++ ++ bad.ack = true; ++ p = bb->page; ++ ++re_clear: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* Start before all badblocks */ ++ if (prev < 0) { ++ if (overlap_behind(bb, &bad, 0)) { ++ len = BB_OFFSET(p[0]) - s; ++ hint = prev; ++ } else { ++ len = sectors; ++ } ++ /* ++ * Both situations are to clear non-bad range, ++ * should be treated as successful ++ */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Clear will split a bad record but the table is full */ ++ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + sectors))) { ++ len = sectors; ++ printf("Warn: no space to split for clear\n"); ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, &bad)) { ++ if ((BB_OFFSET(p[prev]) < bad.start) && ++ (BB_END(p[prev]) > (bad.start + bad.len))) { ++ /* Splitting */ ++ if ((bb->count + 1) < MAX_BADBLOCKS) { ++ len = front_splitting_clear(bb, prev, &bad); ++ bb->count += 1; ++ cleared++; ++ } else { ++ /* No space to split, give up */ ++ printf("Warn: no space to split for clear\n"); ++ len = sectors; ++ } ++ } else { ++ int deleted = 0; ++ ++ len = front_clear(bb, prev, &bad, &deleted); ++ bb->count -= deleted; ++ cleared++; ++ hint = prev; ++ } ++ ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ goto update_sectors; ++ } ++ ++ /* Not cover any badblocks range in the table */ ++ len = sectors; ++ /* Clear non-bad range should be treated as successful */ ++ cleared++; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_clear; ++ ++ WARN_ON(sectors < 0); ++ ++ if (cleared) { ++ badblocks_update_acked(bb); ++ set_changed(bb); ++ } ++ ++ write_sequnlock_irq(&bb->lock); ++ ++ if (!cleared) ++ rv = 1; ++ ++ return rv; ++} ++ ++/* Do the exact work to check bad blocks range from the bad block table */ ++static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, ++ sector_t *first_bad, int *bad_sectors) ++{ ++ int unacked_badblocks, acked_badblocks; ++ int prev = -1, hint = -1, set = 0; ++ struct badblocks_context bad; ++ unsigned int seq; ++ int len, rv; ++ u64 *p; ++ ++ WARN_ON(bb->shift < 0 || sectors == 0); ++ ++ if (bb->shift > 0) { ++ sector_t target; ++ ++ /* round the start down, and the end up */ ++ target = s + sectors; ++ rounddown(s, bb->shift); ++ roundup(target, bb->shift); ++ sectors = target - s; ++ } ++ ++retry: ++ seq = read_seqbegin(&bb->lock); ++ ++ p = bb->page; ++ unacked_badblocks = 0; ++ acked_badblocks = 0; ++ ++re_check: ++ bad.start = s; ++ bad.len = sectors; ++ ++ if (badblocks_empty(bb)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ prev = prev_badblocks(bb, &bad, hint); ++ ++ /* start after all badblocks */ ++ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { ++ len = sectors; ++ goto update_sectors; ++ } ++ ++ if (overlap_front(bb, prev, 
&bad)) { ++ if (BB_ACK(p[prev])) ++ acked_badblocks++; ++ else ++ unacked_badblocks++; ++ ++ if (BB_END(p[prev]) >= (s + sectors)) ++ len = sectors; ++ else ++ len = BB_END(p[prev]) - s; ++ ++ if (set == 0) { ++ *first_bad = BB_OFFSET(p[prev]); ++ *bad_sectors = BB_LEN(p[prev]); ++ set = 1; ++ } ++ goto update_sectors; ++ } ++ ++ /* Not front overlap, but behind overlap */ ++ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { ++ len = BB_OFFSET(p[prev + 1]) - bad.start; ++ hint = prev + 1; ++ goto update_sectors; ++ } ++ ++ /* not cover any badblocks range in the table */ ++ len = sectors; ++ ++update_sectors: ++ s += len; ++ sectors -= len; ++ ++ if (sectors > 0) ++ goto re_check; ++ ++ WARN_ON(sectors < 0); ++ ++ if (unacked_badblocks > 0) ++ rv = -1; ++ else if (acked_badblocks > 0) ++ rv = 1; ++ else ++ rv = 0; ++ ++ if (read_seqretry(&bb->lock, seq)) ++ goto retry; ++ ++ return rv; ++} ++ ++/** ++ * badblocks_check() - check a given range for bad sectors ++ * @bb: the badblocks structure that holds all badblock information ++ * @s: sector (start) at which to check for badblocks ++ * @sectors: number of sectors to check for badblocks ++ * @first_bad: pointer to store location of the first badblock ++ * @bad_sectors: pointer to store number of badblocks after @first_bad ++ * ++ * We can record which blocks on each device are 'bad' and so just ++ * fail those blocks, or that stripe, rather than the whole device. ++ * Entries in the bad-block table are 64bits wide. This comprises: ++ * Length of bad-range, in sectors: 0-511 for lengths 1-512 ++ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) ++ * A 'shift' can be set so that larger blocks are tracked and ++ * consequently larger devices can be covered. ++ * 'Acknowledged' flag - 1 bit. - the most significant bit. ++ * ++ * Locking of the bad-block table uses a seqlock so badblocks_check ++ * might need to retry if it is very unlucky. ++ * We will sometimes want to check for bad blocks in a bi_end_io function, ++ * so we use the write_seqlock_irq variant. ++ * ++ * When looking for a bad block we specify a range and want to ++ * know if any block in the range is bad. So we binary-search ++ * to the last range that starts at-or-before the given endpoint, ++ * (or "before the sector after the target range") ++ * then see if it ends after the given start. ++ * ++ * Return: ++ * 0: there are no known bad blocks in the range ++ * 1: there are known bad block which are all acknowledged ++ * -1: there are bad blocks which have not yet been acknowledged in metadata. ++ * plus the start/length of the first bad section we overlap. ++ */ ++int badblocks_check(struct badblocks *bb, sector_t s, int sectors, ++ sector_t *first_bad, int *bad_sectors) ++{ ++ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); ++} ++EXPORT_SYMBOL_GPL(badblocks_check); ++ ++/** ++ * badblocks_set() - Add a range of bad blocks to the table. ++ * @bb: the badblocks structure that holds all badblock information ++ * @s: first sector to mark as bad ++ * @sectors: number of sectors to mark as bad ++ * @acknowledged: weather to mark the bad sectors as acknowledged ++ * ++ * This might extend the table, or might contract it if two adjacent ranges ++ * can be merged. We binary-search to find the 'insertion' point, then ++ * decide how best to handle it. 
++ * ++ * Return: ++ * 0: success ++ * 1: failed to set badblocks (out of space) ++ */ ++int badblocks_set(struct badblocks *bb, sector_t s, int sectors, ++ int acknowledged) ++{ ++ return _badblocks_set(bb, s, sectors, acknowledged); ++} ++EXPORT_SYMBOL_GPL(badblocks_set); ++ ++/** ++ * badblocks_clear() - Remove a range of bad blocks to the table. ++ * @bb: the badblocks structure that holds all badblock information ++ * @s: first sector to mark as bad ++ * @sectors: number of sectors to mark as bad ++ * ++ * This may involve extending the table if we spilt a region, ++ * but it must not fail. So if the table becomes full, we just ++ * drop the remove request. ++ * ++ * Return: ++ * 0: success ++ * 1: failed to clear badblocks ++ */ ++int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) ++{ ++ return _badblocks_clear(bb, s, sectors); ++} ++EXPORT_SYMBOL_GPL(badblocks_clear); ++ ++/** ++ * ack_all_badblocks() - Acknowledge all bad blocks in a list. ++ * @bb: the badblocks structure that holds all badblock information ++ * ++ * This only succeeds if ->changed is clear. It is used by ++ * in-kernel metadata updates ++ */ ++void ack_all_badblocks(struct badblocks *bb) ++{ ++ if (bb->page == NULL || bb->changed) ++ /* no point even trying */ ++ return; ++ write_seqlock_irq(&bb->lock); ++ ++ if (bb->changed == 0 && bb->unacked_exist) { ++ u64 *p = bb->page; ++ int i; ++ ++ for (i = 0; i < bb->count ; i++) { ++ if (!BB_ACK(p[i])) { ++ sector_t start = BB_OFFSET(p[i]); ++ int len = BB_LEN(p[i]); ++ ++ p[i] = BB_MAKE(start, len, 1); ++ } ++ } ++ bb->unacked_exist = 0; ++ } ++ write_sequnlock_irq(&bb->lock); ++} ++EXPORT_SYMBOL_GPL(ack_all_badblocks); ++ ++/** ++ * badblocks_show() - sysfs access to bad-blocks list ++ * @bb: the badblocks structure that holds all badblock information ++ * @page: buffer received from sysfs ++ * @unack: weather to show unacknowledged badblocks ++ * ++ * Return: ++ * Length of returned data ++ */ ++ssize_t badblocks_show(struct badblocks *bb, int unack) ++{ ++ size_t len; ++ int i; ++ u64 *p = bb->page; ++ char * _page; ++ int size = 64*4096; ++ unsigned seq; ++ ++ if (bb->shift < 0) ++ return 0; ++ ++ _page = malloc(size); ++ if (!_page) { ++ printf("alloc _page failed\n"); ++ return 0; ++ } ++ memset(_page, 0, size); ++retry: ++ seq = read_seqbegin(&bb->lock); ++ ++ len = 0; ++ i = 0; ++ ++ while (len < size&& i < bb->count) { ++ sector_t s = BB_OFFSET(p[i]); ++ unsigned int length = BB_LEN(p[i]); ++ int ack = BB_ACK(p[i]); ++ ++ i++; ++ ++ if (unack && ack) ++ continue; ++ ++ len += snprintf(_page+len, size - len, "%llu %u\n", ++ (unsigned long long)s << bb->shift, ++ length << bb->shift); ++ } ++ if (unack && len == 0) ++ bb->unacked_exist = 0; ++ ++ printf("%s\n", _page); ++ free(_page); ++ ++ if (read_seqretry(&bb->lock, seq)) ++ goto retry; ++ ++ return len; ++} ++EXPORT_SYMBOL_GPL(badblocks_show); ++ ++/** ++ * badblocks_store() - sysfs access to bad-blocks list ++ * @bb: the badblocks structure that holds all badblock information ++ * @page: buffer received from sysfs ++ * @len: length of data received from sysfs ++ * @unack: weather to show unacknowledged badblocks ++ * ++ * Return: ++ * Length of the buffer processed or -ve error. 
++ */ ++ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, ++ int unack) ++{ ++ unsigned long long sector; ++ int length; ++ char newline; ++ ++ switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { ++ case 3: ++ if (newline != '\n') ++ return -EINVAL; ++ fallthrough; ++ case 2: ++ if (length <= 0) ++ return -EINVAL; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ if (badblocks_set(bb, sector, length, !unack)) ++ return -ENOSPC; ++ else ++ return len; ++} ++EXPORT_SYMBOL_GPL(badblocks_store); ++ ++static int __badblocks_init(struct device *dev, struct badblocks *bb, ++ int enable) ++{ ++ bb->dev = dev; ++ bb->count = 0; ++ if (enable) ++ bb->shift = 0; ++ else ++ bb->shift = -1; ++ if (dev) ++ bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL); ++ else ++ bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL); ++ if (!bb->page) { ++ bb->shift = -1; ++ return -ENOMEM; ++ } ++ seqlock_init(&bb->lock); ++ ++ return 0; ++} ++ ++/** ++ * badblocks_init() - initialize the badblocks structure ++ * @bb: the badblocks structure that holds all badblock information ++ * @enable: weather to enable badblocks accounting ++ * ++ * Return: ++ * 0: success ++ * -ve errno: on error ++ */ ++int badblocks_init(struct badblocks *bb, int enable) ++{ ++ return __badblocks_init(NULL, bb, enable); ++} ++EXPORT_SYMBOL_GPL(badblocks_init); ++ ++int devm_init_badblocks(struct device *dev, struct badblocks *bb) ++{ ++ if (!bb) ++ return -EINVAL; ++ return __badblocks_init(dev, bb, 1); ++} ++EXPORT_SYMBOL_GPL(devm_init_badblocks); ++ ++/** ++ * badblocks_exit() - free the badblocks structure ++ * @bb: the badblocks structure that holds all badblock information ++ */ ++void badblocks_exit(struct badblocks *bb) ++{ ++ if (!bb) ++ return; ++ if (bb->dev) ++ devm_kfree(bb->dev, bb->page); ++ else ++ kfree(bb->page); ++ bb->page = NULL; ++} ++EXPORT_SYMBOL_GPL(badblocks_exit); ++ ++ ++/* ++ * Test case related ++ */ ++char good_sector[512]; ++char bad_unack_sector[512]; ++char bad_acked_sector[512]; ++ ++#define BB_SET 0 ++#define BB_CLN 1 ++ ++unsigned rand_seed = 2; ++ ++char bb_ops[] = {0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1}; ++char bb_ack[] = {1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0}; ++ ++/* disk file lengh is 256MB */ ++#define DISKFILE_SECTORS ((256 << 20) >> 9) ++#define MAX_SET_SIZE (DISKFILE_SECTORS/256) ++#define MAX_CLN_SIZE (DISKFILE_SECTORS/1024) ++ ++#define BUF_LEN (8<<10) ++ ++void write_badblocks_log(struct badblocks *bb, char *dir, unsigned long seq, ++ sector_t bb_start, sector_t bb_len, ++ int ops, int ack) ++{ ++ char path[512]; ++ char buf[8192]; ++ u64 *p = bb->page; ++ int len, size, i; ++ int fd; ++ ++ ++ size = sizeof(buf); ++ memset(buf, 0, sizeof(buf)); ++ len = 0; ++ ++ len += snprintf(buf + len, size - len, "============ %lu ============\n\n", seq); ++ if (ops == BB_SET) ++ len += snprintf(buf + len, size - len, "set: start %llu, len %llu, ack %d\n", ++ bb_start, bb_len, ack); ++ else ++ len += snprintf(buf + len, size - len, "clear: start %llu, len %llu\n", ++ bb_start, bb_len); ++ ++ len += snprintf(buf + len, size - len, "=============================\n\n"); ++ ++ i = 0; ++ while (len < size && i < bb->count) { ++ sector_t s = BB_OFFSET(p[i]); ++ unsigned int length = BB_LEN(p[i]); ++ int ack = BB_ACK(p[i]); ++ ++ i++; ++ ++ len += snprintf(buf + len, size - len, "%llu %u [%u]\n", ++ (unsigned long long)s << bb->shift, ++ length << bb->shift, ++ ack); ++ } ++ ++ snprintf(path, 512, "%s/seq-%.8lu", dir ? 
dir : ".", seq); ++ unlink(path); ++ fd = open(path, O_CREAT|O_RDWR, 0644); ++ if (fd < 0) { ++ printf("fail to create file %s\n", path); ++ return; ++ } ++ write(fd, buf, len); ++ fsync(fd); ++ close(fd); ++} ++ ++ ++int verify_bad_sectors(sector_t start, sector_t len, int expected, int fd) ++{ ++ int ret = 0; ++ char buf[BUF_LEN]; ++ unsigned long offset = start << 9; ++ unsigned long unread = len << 9; ++ ++ if ((start + len) > DISKFILE_SECTORS) ++ printf("Error: invalid verify range: s %llu, l %llu\n, limit %u\n", ++ start, len, DISKFILE_SECTORS); ++ ++ while (unread > 0) { ++ unsigned long read_bytes = min(unread, BUF_LEN); ++ unsigned long i; ++ ssize_t _ret; ++ ++ memset(buf, 0, sizeof(buf)); ++ _ret = pread(fd, buf, read_bytes, offset); ++ if (_ret != read_bytes) { ++ printf("Error: to read %lu bytes, return %lu bytes\n", ++ read_bytes, _ret); ++ } ++ ++ for (i = 0; i < read_bytes; i++) { ++ if (buf[i] != expected) { ++ printf("Unexpected sector value %u (should be %u) at sector %lu" ++ " offset byte %lu\n", ++ buf[i], expected, (offset+i) >> 9, ++ (offset + i) % 512); ++ exit(1); ++ if (ret == 0) ++ ret = -EIO; ++ } ++ } ++ ++ if (ret) ++ goto out; ++ ++ unread -= read_bytes; ++ offset += read_bytes; ++ } ++ ++out: ++ return ret; ++} ++ ++int verify_badblocks_file(struct badblocks *bb, int fd, unsigned long seq) ++{ ++ int ret = 0; ++ sector_t size = DISKFILE_SECTORS; ++ u64 *p = bb->page; ++ int i = 0; ++ unsigned long prev_pos, pos; ++ ++ prev_pos = pos = 0; ++ while ((size > 0) && (i < bb->count)) { ++ sector_t s = BB_OFFSET(p[i]); ++ unsigned int length = BB_LEN(p[i]); ++ int ack = BB_ACK(p[i]); ++ ++ pos = s; ++ ++ /* verify non-bad area */ ++ if (pos > prev_pos) { ++ ret = verify_bad_sectors(prev_pos, pos - prev_pos, 0, fd); ++ if (ret < 0) { ++ printf("%s:%d fail to verify good sectors [%lu, %lu), error: %s\n", ++ __func__, __LINE__, prev_pos, pos, strerror(-ret)); ++ goto out; ++ } ++ ++ size -= (pos - prev_pos); ++ } ++ ++ /* verify bad area */ ++ ret = verify_bad_sectors(pos, length, ack ? 
2 : 1, fd); ++ if (ret < 0) { ++ printf("%s:%d fail to verify bad sectors [%lu, %u) ack %d, error: %s\n", ++ __func__, __LINE__, pos, length, ack, strerror(ret)); ++ goto out; ++ } ++ ++ size -= length; ++ i++; ++ prev_pos = pos + length; ++ } ++ ++ if (i < bb->count) { ++ printf("Error: total %d bad records, verified %d, left %d\n", ++ bb->count, i, bb->count - i); ++ if (size) ++ printf("Error: still have %llu sectors not verified\n", ++ size); ++ ret = -EIO; ++ goto out; ++ } ++ ++ /* verify rested non-bad area */ ++ if (size) { ++ pos = DISKFILE_SECTORS; ++ ret = verify_bad_sectors(prev_pos, pos - prev_pos, 0, fd); ++ if (ret < 0) { ++ printf("%s:%d fail to verify good sectors [%lu, %lu), error: %s\n", ++ __func__, __LINE__, prev_pos, pos, strerror(-ret)); ++ goto out; ++ } ++ } ++ ++ printf("verify badblocks file successfully (seq %lu)\n", seq); ++out: ++ return ret; ++} ++ ++ ++int _write_diskfile(int fd, int ops, ++ sector_t start, sector_t len, int ack) ++{ ++ off_t pos = start << 9; ++ char sector[512]; ++ ++ if ((start + len) > DISKFILE_SECTORS) ++ len = DISKFILE_SECTORS - start; ++ ++ if (len == 0) { ++ printf("Error: write diskfile zero-length at %llu len %llu\n", ++ start, len); ++ return -EINVAL; ++ } ++ ++ if (ops == BB_CLN) { ++ while (len > 0) { ++ pwrite(fd, good_sector, 512, pos); ++ pos += 512; ++ len--; ++ } ++ fsync(fd); ++ return 0; ++ } ++ ++ /* badblocks set */ ++ while (len > 0) { ++ pread(fd, sector, 512, pos); ++ if (!memcmp(sector, good_sector, 512)) { ++ if (ack) ++ pwrite(fd, bad_acked_sector, 512, pos); ++ else ++ pwrite(fd, bad_unack_sector, 512, pos); ++ ++// printf("write %d at sector %lu\n", ack ? 2 : 1, pos >> 9); ++ } else if (!memcmp(sector, bad_unack_sector, 512)) { ++ if (ack) { ++ pwrite(fd, bad_acked_sector, 512, pos); ++// printf("overwrite 2 at unack sector %lu\n", pos >> 9); ++ } else { ++// printf("avoid overwrite already unacked sector %lu\n", pos >> 9); ++ } ++ } else if (!memcmp(sector, bad_acked_sector, 512)) { ++// if (ack) ++// printf("avoid overwrite already acked sector %lu\n", pos >> 9); ++// else ++// printf("cannot overwrite acked sector %lu\n", pos >> 9); ++ } else { ++ printf("Error: unexpected sector at %lu\n", pos >> 9); ++ } ++ ++ pos += 512; ++ len--; ++ } ++ ++ fsync(fd); ++ return 0; ++} ++ ++sector_t fix_writing_length(struct badblocks*bb, int ops, sector_t bb_start, ++ sector_t bb_len, int ack) ++{ ++ sector_t orig_len = bb_len; ++ sector_t ret_len = 0; ++ int prev; ++ struct badblocks_context bad; ++ u64 *p = bb->page; ++ ++ bad.orig_start = bb_start; ++ bad.orig_len = bb_len; ++ bad.start = bb_start; ++ bad.len = bb_len; ++ bad.ack = ack; ++ ++ ++ if (ops == BB_SET) { ++ prev = prev_badblocks(bb, &bad, -1); ++ if (prev < 0) { ++ printf("Unexpected: the set range is not in badblocks table\n"); ++ exit(1); ++ } ++ ++ if (BB_OFFSET(p[prev]) > bb_start || ++ BB_END(p[prev]) <= bb_start || ++ BB_ACK(p[prev]) != ack) { ++ printf("Unexpected: fixing range is not in badblocks table\n"); ++ exit(1); ++ } ++ ++ while (bb_len > 0) { ++ int seg; ++ ++ if (BB_END(p[prev]) >= (bb_start + bb_len)) ++ seg = bb_len; ++ else ++ seg = BB_END(p[prev]) - bb_start; ++ ++ ret_len += seg; ++ bb_start += seg; ++ bb_len -= seg; ++ ++ if (bb_len == 0) ++ break; ++ ++ if ((prev + 1) >= bb->count || ++ BB_END(p[prev]) != BB_OFFSET(p[prev + 1]) || ++ BB_ACK(p[prev]) != BB_ACK(p[prev + 1])) ++ break; ++ prev++; ++ } ++ } else if (ops == BB_CLN) { ++ ret_len = bb_len; ++ ++ } ++ ++ printf("Fix writing bb_len from %llu to %llu\n", orig_len, ret_len); ++ 
return ret_len; ++} ++ ++int write_badblocks_file(struct badblocks *bb, unsigned long seq, int fd) ++{ ++ int ret; ++ sector_t bb_start, bb_len; ++ int ops, random; ++ ++retry: ++ random = rand_r(&rand_seed); ++ ops = bb_ops[random % sizeof(bb_ops)]; ++ random = rand_r(&rand_seed); ++ if (ops == BB_SET) ++ bb_len = random % MAX_SET_SIZE; ++ else ++ bb_len= random % MAX_CLN_SIZE; ++ random = rand_r(&rand_seed); ++ bb_start = random % DISKFILE_SECTORS; ++ if ((bb_start + bb_len) > DISKFILE_SECTORS) ++ bb_len = DISKFILE_SECTORS - bb_start; ++ if (bb_len == 0) { ++ printf("random bb_len is 0, re-generate\n"); ++ goto retry; ++ } ++ ++ ++ if (ops == BB_SET) { ++ int ack; ++ ++ random = rand_r(&rand_seed); ++ ack = bb_ack[random % sizeof(bb_ack)]; ++ ++ bb->changed = 0; ++ ret = badblocks_set(bb, bb_start, bb_len, ack); ++ write_badblocks_log(bb, NULL, seq, bb_start, bb_len, BB_SET, ack); ++ if (ret > 0) { ++ printf("NOTICE: no space or cannot overwwrite badblocks" ++ " for badblocks_set(s: %llu, l: %llu, a: %d).\n" ++ " Manual check might be necessary if\n" ++ " following verification failed.\n", ++ bb_start, bb_len, ack); ++ return 1; ++ } ++ ++ if (badblocks_full(bb) && bb->changed) ++ bb_len = fix_writing_length(bb, ops, bb_start, bb_len, ack); ++ ret = _write_diskfile(fd, ops, bb_start, bb_len, ack); ++ } else { ++ bb->changed = 0; ++ ret = badblocks_clear(bb, bb_start, bb_len); ++ write_badblocks_log(bb, NULL, seq, bb_start, bb_len, BB_CLN, -1); ++ if (ret > 0) { ++ printf("NOTICE: no space for badblocks_clear(s: %llu, l: %llu)\n" ++ " Manual check might be necessary if\n" ++ " following verification failed.\n", ++ bb_start, bb_len); ++ return 1; ++ } ++ ++ ret = _write_diskfile(fd, ops, bb_start, bb_len, -1); ++ } ++ ++ return ret; ++} ++ ++#define MAX_BB_TEST_TRIES (1<<20) ++int do_test(struct badblocks *bb) ++{ ++ int ret = 0; ++ unsigned long seq; ++ char diskfile_name[] = "./dummy_disk_file"; ++ int diskfile_fd = -1; ++ ++ srand(rand_seed); ++ ++ unlink(diskfile_name); ++ diskfile_fd = open(diskfile_name, O_CREAT|O_RDWR, 0644); ++ if (diskfile_fd < 0) { ++ printf("fail to create %s, error %s\n", ++ diskfile_name, strerror(errno)); ++ goto out; ++ } ++ ret = fallocate(diskfile_fd, FALLOC_FL_ZERO_RANGE, 0, DISKFILE_SECTORS << 9); ++ if (ret < 0) { ++ printf("fail to allocate zero-filled file, error %s\n", ++ strerror(errno)); ++ goto out; ++ } ++ ++ for (seq = 1; seq <= MAX_BB_TEST_TRIES; seq++) { ++ ret = write_badblocks_file(bb, seq, diskfile_fd); ++ if (ret < 0) { ++ printf("fail to generate bad blocks for seq %lu, error %s\n", ++ seq, strerror(-ret)); ++ goto out; ++ } ++ ret = verify_badblocks_file(bb, diskfile_fd, seq); ++ if (ret < 0) { ++ printf("fail to verify bad blocks for seq %lu, error %s\n", ++ seq, strerror(-ret)); ++ } ++ } ++ ++out: ++ if (diskfile_fd >= 0) ++ close(diskfile_fd); ++ return ret; ++} ++ ++int main(int argc, char *argv[]) ++{ ++ struct badblocks bblocks; ++ struct badblocks *bb = &bblocks; ++ int i; ++ ++ for (i = 0; i < 512; i++) { ++ good_sector[i] = 0; ++ bad_unack_sector[i] = 1; ++ bad_acked_sector[i] = 2; ++ } ++ ++ memset(bb, 0, sizeof(struct badblocks)); ++ badblocks_init(bb, 1); ++ ++ do_test(bb); ++ ++ badblocks_exit(bb); ++ return 0; ++} +-- +2.31.1 + diff --git a/for-test/jouranl-deadlock/0001-reserve-journal-space.patch b/for-test/jouranl-deadlock/0001-reserve-journal-space.patch new file mode 100644 index 0000000..81af639 --- /dev/null +++ b/for-test/jouranl-deadlock/0001-reserve-journal-space.patch @@ -0,0 +1,369 @@ +From 
120572550c913abcc1054912c8deb29c690ffe93 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 18 Apr 2022 21:55:37 +0800 +Subject: [PATCH 1/2] reserve journal space + +--- + drivers/md/bcache/journal.c | 220 +++++++++++++++++++++++++++++++++--- + drivers/md/bcache/journal.h | 10 ++ + 2 files changed, 214 insertions(+), 16 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 7c2ca52ca3e4..5aac20c71b80 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -166,6 +166,85 @@ reread: left = ca->sb.bucket_size - offset; + return ret; + } + ++static int bch_journal_key_reload(struct cache_set *c) ++{ ++ struct cache *ca = c->cache; ++ struct bkey *k = &c->journal.key; ++ struct journal_device *ja = &ca->journal; ++ struct bio *bio = &ja->bio; ++ struct jset *j, *data = c->journal.w[0].data; ++ unsigned int n = 0, offset = 0, used_blocks = 0; ++ unsigned int len, left; ++ sector_t bucket; ++ struct closure cl; ++ int ret = 0; ++ ++ /* load from the latest journal bucket */ ++ bucket = bucket_to_sector(c, ca->sb.d[ja->cur_idx]); ++ ++ closure_init_stack(&cl); ++ ++ while (offset < ca->sb.bucket_size) { ++reread: ++ left = ca->sb.bucket_size - offset; ++ len = min_t(unsigned int, ++ left, PAGE_SECTORS << JSET_BITS); ++ ++ bio_reset(bio, ca->bdev, REQ_OP_READ); ++ bio->bi_iter.bi_sector = bucket + offset; ++ bio->bi_iter.bi_size = len << 9; ++ ++ bio->bi_end_io = journal_read_endio; ++ bio->bi_private = &cl; ++ bch_bio_map(bio, data); ++ ++ closure_bio_submit(c, bio, &cl); ++ closure_sync(&cl); ++ ++ j = data; ++ while (len) { ++ size_t blocks, bytes = set_bytes(j); ++ ++ if (j->magic != jset_magic(&ca->sb)) ++ goto out; ++ ++ if (bytes > left << 9 || ++ bytes > PAGE_SIZE << JSET_BITS) { ++ pr_err("jset may be correpted: too big"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (bytes > len << 9) ++ goto reread; ++ ++ if (j->csum != csum_set(j)) { ++ pr_err("jset may be corrupted: bad csum"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ blocks = set_blocks(j, block_bytes(ca)); ++ used_blocks += blocks; ++ ++ offset += blocks * ca->sb.block_size; ++ len -= blocks * ca->sb.block_size; ++ j = ((void *) j) + blocks * block_bytes(ca); ++ } ++ } ++out: ++ c->journal.blocks_free = ++ (ca->sb.bucket_size >> c->block_bits) - used_blocks; ++ ++ k->ptr[n++] = MAKE_PTR(0, bucket, ca->sb.nr_this_dev); ++ ++ bkey_init(k); ++ SET_KEY_PTRS(k, n); ++ ++err: ++ return ret; ++} ++ + int bch_journal_read(struct cache_set *c, struct list_head *list) + { + #define read_bucket(b) \ +@@ -279,13 +358,23 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) + + } + ++ if (c->journal.blocks_free != 0) ++ pr_warn("Unexpected blocks_free %u before reload journal key.\n", ++ c->journal.blocks_free); ++ ++ ret = bch_journal_key_reload(c); ++ + out: + if (!list_empty(list)) + c->journal.seq = list_entry(list->prev, + struct journal_replay, + list)->j.seq; + +- return 0; ++ /* Initial value of c->journal.blocks_free should be 0 */ ++ BUG_ON(c->journal.blocks_free != 0); ++ ret = bch_journal_key_reload(c); ++ ++ return ret; + #undef read_bucket + } + +@@ -355,6 +444,9 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) + uint64_t start = i->j.last_seq, end = i->j.seq, n = start; + struct keylist keylist; + ++ /* Mark journal replay started */ ++ s->journal.in_replay = true; ++ + list_for_each_entry(i, list, list) { + BUG_ON(i->pin && atomic_read(i->pin) != 1); + +@@ -396,6 +488,9 @@ int bch_journal_replay(struct cache_set *s, 
struct list_head *list) + pr_info("journal replay done, %i keys in %i entries, seq %llu\n", + keys, entries, end); + err: ++ /* Mark journal replay finished */ ++ s->journal.in_replay = false; ++ + while (!list_empty(list)) { + i = list_first_entry(list, struct journal_replay, list); + list_del(&i->list); +@@ -621,6 +716,18 @@ static void do_journal_discard(struct cache *ca) + } + } + ++static inline bool last_writable_journal_bucket(struct cache_set *c) ++{ ++ struct cache *ca = c->cache; ++ struct journal_device *ja = &ca->journal; ++ ++ if (((ja->cur_idx + 1) % ca->sb.njournal_buckets) != ++ ja->last_idx) ++ return false; ++ ++ return true; ++} ++ + static void journal_reclaim(struct cache_set *c) + { + struct bkey *k = &c->journal.key; +@@ -629,6 +736,8 @@ static void journal_reclaim(struct cache_set *c) + unsigned int next; + struct journal_device *ja = &ca->journal; + atomic_t p __maybe_unused; ++ bool is_last_valid; ++ bool journal_wakeup = true; + + atomic_long_inc(&c->reclaim); + +@@ -646,13 +755,33 @@ static void journal_reclaim(struct cache_set *c) + + do_journal_discard(ca); + +- if (c->journal.blocks_free) ++ is_last_valid = last_writable_journal_bucket(c); ++ ++ /* ++ * This is not the last valid journal bucket, no need to worry ++ * about the reserved journal space. ++ */ ++ if (!is_last_valid && c->journal.blocks_free) ++ goto out; ++ ++ /* ++ * this is the last valid journal bucket, if the free space is ++ * larger than reserved sectors, no need to reclaim more journal ++ * space. Otherwise must try to reclaim one more journal bucket, ++ * to make sure there always are c->journal.reserved sectors ++ * reserved for initialization time usage. ++ */ ++ if (is_last_valid && ++ (c->journal.blocks_free * c->cache->sb.block_size) > ++ c->journal.reserved) + goto out; + + next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + /* No space available on this device */ +- if (next == ja->discard_idx) ++ if (next == ja->discard_idx) { ++ journal_wakeup = false; + goto out; ++ } + + ja->cur_idx = next; + k->ptr[0] = MAKE_PTR(0, +@@ -665,7 +794,7 @@ static void journal_reclaim(struct cache_set *c) + c->journal.blocks_free = ca->sb.bucket_size >> c->block_bits; + + out: +- if (!journal_full(&c->journal)) ++ if (journal_wakeup) + __closure_wake_up(&c->journal.wait); + } + +@@ -825,6 +954,60 @@ static void journal_try_write(struct cache_set *c) + } + } + ++static bool jset_space_available(struct cache_set *c, size_t sectors) ++{ ++ size_t n, reserved; ++ bool last_writable_bucket; ++ ++ n = min_t(size_t, ++ c->journal.blocks_free * c->cache->sb.block_size, ++ PAGE_SECTORS << JSET_BITS); ++ ++ last_writable_bucket = last_writable_journal_bucket(c); ++ ++ if (!last_writable_bucket || c->journal.in_replay) ++ reserved = 0; ++ else ++ reserved = c->journal.reserved; ++ ++ if (sectors <= (n - reserved)) ++ return true; ++ ++ return false; ++} ++ ++static bool journal_space_available(struct cache_set *c, ++ unsigned int nkeys) ++{ ++ /* ++ * XXX: If we were inserting so many keys that they ++ * won't fit in an _empty_ journal write, we'll ++ * deadlock. For now, handle this in ++ * bch_keylist_realloc() - but something to think about. 
++ */ ++ if ((nkeys * sizeof(uint64_t)) > ++ (block_bytes(c->cache) - sizeof(struct jset))) { ++ pr_err("The keys to insert is bigger than an empty journal write.\n"); ++ pr_err("keys in current journal write: %u, keys to insert: %u\n", ++ c->journal.cur->data->keys, nkeys); ++ BUG(); ++ } ++ ++ if (journal_full(&c->journal)) ++ return false; ++ ++ /* ++ * Before flushing current write (without the inserting keys) ++ * to get next empty write, it is still necessary to check ++ * whether there is enough free blocks in current journal bucket ++ * except for the reserved journal space. ++ */ ++ if (jset_space_available(c, 0)) ++ return true; ++ ++ return false; ++} ++ + static struct journal_write *journal_wait_for_write(struct cache_set *c, + unsigned int nkeys) + __acquires(&c->journal.lock) +@@ -844,28 +1027,27 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, + sectors = __set_blocks(w->data, w->data->keys + nkeys, + block_bytes(ca)) * ca->sb.block_size; + +- if (sectors <= min_t(size_t, +- c->journal.blocks_free * ca->sb.block_size, +- PAGE_SECTORS << JSET_BITS)) ++ if (jset_space_available(c, sectors)) + return w; + + if (wait) + closure_wait(&c->journal.wait, &cl); + +- if (!journal_full(&c->journal)) { +- if (wait) +- trace_bcache_journal_entry_full(c); +- ++ if (journal_space_available(c, nkeys)) { + /* +- * XXX: If we were inserting so many keys that they +- * won't fit in an _empty_ journal write, we'll +- * deadlock. For now, handle this in +- * bch_keylist_realloc() - but something to think about. ++ * Flush current non-empty write and try next ++ * empty one updated by journal_write_unlocked(). + */ +- BUG_ON(!w->data->keys); ++ if (wait) ++ trace_bcache_journal_entry_full(c); + + journal_try_write(c); /* unlocks */ + } else { ++ /* ++ * No space to flush current write, try to reclaim ++ * an empty journal bucket and do all things again ++ * in next loop. ++ */ + if (wait) + trace_bcache_journal_full(c); + +@@ -974,5 +1156,11 @@ int bch_journal_alloc(struct cache_set *c) + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS))) + return -ENOMEM; + ++ /* deside how many sectors reserved for jouranl replay */ ++ if (JOURANL_RESERVE < c->cache->sb.bucket_size) ++ j->reserved = JOURANL_RESERVE; ++ else ++ j->reserved = c->cache->sb.bucket_size; ++ + return 0; + } +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index f2ea34d5f431..bcaa4ce458ae 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -105,6 +105,7 @@ struct journal { + spinlock_t lock; + spinlock_t flush_write_lock; + bool btree_flushing; ++ + /* used when waiting because the journal was full */ + struct closure_waitlist wait; + struct closure io; +@@ -119,6 +120,8 @@ struct journal { + BKEY_PADDED(key); + + struct journal_write w[2], *cur; ++ bool in_replay; ++ int reserved; + }; + + /* +@@ -161,6 +164,13 @@ struct journal_device { + #define journal_pin_cmp(c, l, r) \ + (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) + ++/* ++ * Reserve 2 pages space in case journal space is full during ++ * initialization and btree node split happens in journal reply. ++ * If JOURANL_RESERVE > bucket_size, then only reserve 1 bucket. 
++ */ ++#define JOURANL_RESERVE (PAGE_SECTORS * 2) ++ + #define JOURNAL_PIN 20000 + + #define journal_full(j) \ +-- +2.34.1 + diff --git a/for-test/jouranl-deadlock/0002-more-fixes.patch b/for-test/jouranl-deadlock/0002-more-fixes.patch new file mode 100644 index 0000000..c51e16b --- /dev/null +++ b/for-test/jouranl-deadlock/0002-more-fixes.patch @@ -0,0 +1,131 @@ +From df1c455f2b0877ca7dbcec7fa06a0aca8ed825d8 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 21 Apr 2022 16:12:53 +0800 +Subject: [PATCH 2/2] more fixes + +--- + Makefile | 2 +- + drivers/md/bcache/journal.c | 15 ++++++++++----- + drivers/md/bcache/request.c | 2 +- + drivers/md/bcache/super.c | 2 ++ + drivers/md/bcache/util.c | 10 ++++++++-- + 5 files changed, 22 insertions(+), 9 deletions(-) + +diff --git a/Makefile b/Makefile +index 29e273d3f8cc..3abbd83b337c 100644 +--- a/Makefile ++++ b/Makefile +@@ -2,7 +2,7 @@ + VERSION = 5 + PATCHLEVEL = 18 + SUBLEVEL = 0 +-EXTRAVERSION = -rc2 ++EXTRAVERSION = -rc2-bcache-journal + NAME = Superb Owl + + # *DOCUMENTATION* +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 5aac20c71b80..916141c69ec8 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -370,9 +370,10 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) + struct journal_replay, + list)->j.seq; + +- /* Initial value of c->journal.blocks_free should be 0 */ +- BUG_ON(c->journal.blocks_free != 0); +- ret = bch_journal_key_reload(c); ++ if (c->journal.blocks_free == 0) { ++ pr_info("c->journal.blocks_free is 0, reload journal_key\n"); ++ ret = bch_journal_key_reload(c); ++ } + + return ret; + #undef read_bucket +@@ -900,12 +901,12 @@ static void journal_write_unlocked(struct closure *cl) + + bio_reset(bio, ca->bdev, REQ_OP_WRITE | + REQ_SYNC | REQ_META | REQ_PREFLUSH | REQ_FUA); +- bch_bio_map(bio, w->data); + bio->bi_iter.bi_sector = PTR_OFFSET(k, i); + bio->bi_iter.bi_size = sectors << 9; + + bio->bi_end_io = journal_write_endio; + bio->bi_private = w; ++ bch_bio_map(bio, w->data); + + trace_bcache_journal_write(bio, w->data->keys); + bio_list_add(&list, bio); +@@ -1002,9 +1003,12 @@ static bool journal_space_available(struct cache_set *c, + * whether there is enough free blocks in current journal bucket + * except for the reserved journal space. + */ +- if (jset_space_available(c, 0)) ++ if (jset_space_available(c, 0)) { ++ pr_info("there is available jset space\n"); + return true; ++ } + ++ pr_info("NO available jset space\n"); + return false; + } + +@@ -1027,6 +1031,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, + sectors = __set_blocks(w->data, w->data->keys + nkeys, + block_bytes(ca)) * ca->sb.block_size; + ++ pr_info("sectors from __set_blocks(): %lu\n", sectors); + if (jset_space_available(c, sectors)) + return w; + +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index fdd0194f84dd..320fcdfef48e 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -685,7 +685,7 @@ static void do_bio_hook(struct search *s, + { + struct bio *bio = &s->bio.bio; + +- bio_init_clone(bio->bi_bdev, bio, orig_bio, GFP_NOIO); ++ bio_init_clone(orig_bio->bi_bdev, bio, orig_bio, GFP_NOIO); + /* + * bi_end_io can be set separately somewhere else, e.g. 
the + * variants in, +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index bf3de149d3c9..efb9fae4354f 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1077,7 +1077,9 @@ int bch_cached_dev_run(struct cached_dev *dc) + closure_sync(&cl); + } + ++ pr_info("call add_disk(), d->disk: 0x%pK\n", d->disk); + ret = add_disk(d->disk); ++ pr_info("return from add_disk(): %d\n", ret); + if (ret) + goto out; + bd_link_disk_holder(dc->bdev, dc->disk.disk); +diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c +index ae380bc3992e..f3c8b7db43ef 100644 +--- a/drivers/md/bcache/util.c ++++ b/drivers/md/bcache/util.c +@@ -233,8 +233,14 @@ void bch_bio_map(struct bio *bio, void *base) + size_t size = bio->bi_iter.bi_size; + struct bio_vec *bv = bio->bi_io_vec; + +- BUG_ON(!bio->bi_iter.bi_size); +- BUG_ON(bio->bi_vcnt); ++ if (!bio->bi_iter.bi_size) { ++ pr_err("BUG: bio->bi_iter.bi_size is 0\n"); ++ BUG_ON(!bio->bi_iter.bi_size); ++ } ++ if (bio->bi_vcnt) { ++ pr_err("BUG: bio->bi_vcnt: %u\n", bio->bi_vcnt); ++ BUG_ON(bio->bi_vcnt); ++ } + + bv->bv_offset = base ? offset_in_page(base) : 0; + goto start; +-- +2.34.1 + diff --git a/for-test/jouranl-deadlock/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch b/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch index cfe5323..cfe5323 100644 --- a/for-test/jouranl-deadlock/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch +++ b/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch diff --git a/for-test/jouranl-deadlock/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch b/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch index 39b9873..39b9873 100644 --- a/for-test/jouranl-deadlock/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch +++ b/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch diff --git a/for-test/jouranl-deadlock/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch b/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch index 07050e9..07050e9 100644 --- a/for-test/jouranl-deadlock/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch +++ b/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch |
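The sscanf()-based parser in badblocks_store() above accepts "<sector> <length>" with an optional trailing newline and rejects everything else. A minimal user-space sketch of the same acceptance rules; parse_badblocks_input() is a name invented for this sketch, not part of the patch:

#include <stdio.h>

/* Mirrors the sscanf("%llu %d%c") switch in badblocks_store() above. */
static int parse_badblocks_input(const char *page)
{
	unsigned long long sector;
	int length;
	char newline;

	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
	case 3:
		if (newline != '\n')
			return -1;	/* anything but a trailing '\n' is rejected */
		/* fall through */
	case 2:
		if (length <= 0)
			return -1;	/* length must be a positive sector count */
		break;
	default:
		return -1;		/* need at least "<sector> <length>" */
	}
	printf("accepted: sector %llu, %d sectors\n", sector, length);
	return 0;
}

int main(void)
{
	parse_badblocks_input("4096 8\n");	/* accepted */
	parse_badblocks_input("4096 8");	/* accepted, newline is optional */
	parse_badblocks_input("4096 0\n");	/* rejected: non-positive length */
	parse_badblocks_input("4096 8 x");	/* rejected: trailing characters */
	return 0;
}

In the kernel path the accepted case then calls badblocks_set(bb, sector, length, !unack) and maps a full table to -ENOSPC, as the function above shows.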
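ack_all_badblocks() and badblocks_show() above walk bb->page as an array of u64 entries, each packing start sector, length and ack bit into one word. The sketch below reproduces that layout as defined in include/linux/badblocks.h (bit 63 = ack, bits 62..9 = start sector, bits 8..0 = length - 1, so one entry covers at most 512 sectors); the masks are quoted from memory, so treat them as an assumption of the sketch and check the header before relying on them. The demo shows what ack_all_badblocks() effectively does to a single unacknowledged entry:

#include <stdio.h>
#include <stdint.h>

#define BB_LEN_MASK	0x00000000000001FFULL
#define BB_OFFSET_MASK	0x7FFFFFFFFFFFFE00ULL
#define BB_ACK_MASK	0x8000000000000000ULL
#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) \
	(((uint64_t)(a) << 9) | ((uint64_t)(l) - 1) | ((uint64_t)(!!(ack)) << 63))

int main(void)
{
	/* an unacknowledged 8-sector bad range starting at sector 4096 */
	uint64_t e = BB_MAKE(4096, 8, 0);

	printf("start %llu, len %u, ack %d\n",
	       (unsigned long long)BB_OFFSET(e),
	       (unsigned int)BB_LEN(e), BB_ACK(e));

	/* ack_all_badblocks(): rewrite the entry with the same range, ack set */
	if (!BB_ACK(e))
		e = BB_MAKE(BB_OFFSET(e), BB_LEN(e), 1);

	printf("start %llu, len %u, ack %d\n",
	       (unsigned long long)BB_OFFSET(e),
	       (unsigned int)BB_LEN(e), BB_ACK(e));
	return 0;
}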
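The test harness above cross-checks every badblocks_set()/badblocks_clear() call against a 256 MB dummy disk file in which each sector is filled with one repeated byte: 0 for good sectors, 1 for unacknowledged bad sectors, 2 for acknowledged ones (see good_sector[], bad_unack_sector[] and bad_acked_sector[] in main()). A tiny restatement of that convention; expected_byte() is a helper invented for this sketch, while the patch open-codes the mapping in _write_diskfile() and in the verify_bad_sectors(pos, length, ack ? 2 : 1, fd) calls:

#include <assert.h>

/* byte value every sector of a region should hold, per the harness's rules */
static int expected_byte(int is_bad, int acked)
{
	if (!is_bad)
		return 0;		/* good_sector[] pattern */
	return acked ? 2 : 1;		/* bad_acked_sector[] : bad_unack_sector[] */
}

int main(void)
{
	assert(expected_byte(0, 0) == 0);
	assert(expected_byte(1, 0) == 1);
	assert(expected_byte(1, 1) == 2);
	return 0;
}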
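In 0001-reserve-journal-space.patch above, jset_space_available() carries the core idea: the usable space of the current journal bucket is capped at one jset (PAGE_SECTORS << JSET_BITS sectors), and the reserve is subtracted only when the write would land in the last still-writable journal bucket outside of journal replay, because replay is exactly what the reserve is kept for. Below is a standalone model of that decision with the cache-set fields turned into plain parameters; PAGE_SECTORS = 8 and JSET_BITS = 3 are assumptions of the sketch (4 KiB pages, the value used in drivers/md/bcache/journal.h), and the underflow guard is added here, not by the patch:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define PAGE_SECTORS	8	/* assumed: 4 KiB page, 512-byte sectors */
#define JSET_BITS	3	/* assumed: value from bcache journal.h */

/* Would a jset of 'sectors' sectors fit once the reserve is honoured? */
static bool jset_fits(size_t sectors, size_t blocks_free, size_t block_size,
		      size_t reserved, bool last_writable_bucket, bool in_replay)
{
	size_t n = blocks_free * block_size;

	/* a single journal write never exceeds one jset */
	if (n > (size_t)(PAGE_SECTORS << JSET_BITS))
		n = PAGE_SECTORS << JSET_BITS;

	/* the reserve only binds on the last writable bucket, and replay
	 * is allowed to consume it */
	if (!last_writable_bucket || in_replay)
		reserved = 0;

	if (n < reserved)	/* guard added in this sketch only */
		return false;

	return sectors <= n - reserved;
}

int main(void)
{
	/* 8 free blocks of 8 sectors = 64 sectors, 16-sector reserve,
	 * writing into the last writable journal bucket */
	printf("%d\n", jset_fits(40, 8, 8, 16, true, false));	/* 1: fits beside the reserve */
	printf("%d\n", jset_fits(56, 8, 8, 16, true, false));	/* 0: would eat into the reserve */
	printf("%d\n", jset_fits(56, 8, 8, 16, true, true));	/* 1: replay may use the reserve */
	return 0;
}

The same patch sizes the reserve in bch_journal_alloc(): JOURANL_RESERVE, i.e. two pages' worth of sectors, clamped down to the bucket size when buckets are smaller than that.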