aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@linux.dev>2023-11-12 20:53:57 -0500
committerKent Overstreet <kent.overstreet@linux.dev>2023-11-12 20:57:28 -0500
commit7fd6c3ffe45b3b42c0bc8a8c5d1387a5e3316a54 (patch)
treed596299da8c34dff74cb13caf9dd47d9154c25b4
parenta613340b26ad88801666362d2824118396f34c38 (diff)
downloadbcachefs-tools-7fd6c3ffe45b3b42c0bc8a8c5d1387a5e3316a54.tar.gz
Update bcachefs sources to 3ca08ab51ec9 bcachefs: six locks: Simplify optimistic spinning
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--.bcachefs_revision2
-rw-r--r--include/linux/kernel.h3
-rw-r--r--include/linux/list.h11
-rw-r--r--libbcachefs/alloc_background.c30
-rw-r--r--libbcachefs/backpointers.c110
-rw-r--r--libbcachefs/bcachefs.h6
-rw-r--r--libbcachefs/bcachefs_format.h7
-rw-r--r--libbcachefs/bkey_methods.c23
-rw-r--r--libbcachefs/bkey_methods.h2
-rw-r--r--libbcachefs/btree_gc.c10
-rw-r--r--libbcachefs/btree_io.c6
-rw-r--r--libbcachefs/btree_iter.c13
-rw-r--r--libbcachefs/btree_iter.h2
-rw-r--r--libbcachefs/btree_key_cache.c43
-rw-r--r--libbcachefs/btree_key_cache_types.h34
-rw-r--r--libbcachefs/btree_trans_commit.c242
-rw-r--r--libbcachefs/btree_types.h43
-rw-r--r--libbcachefs/btree_update.c27
-rw-r--r--libbcachefs/btree_update.h47
-rw-r--r--libbcachefs/btree_update_interior.c88
-rw-r--r--libbcachefs/btree_update_interior.h3
-rw-r--r--libbcachefs/btree_write_buffer.c63
-rw-r--r--libbcachefs/darray.c21
-rw-r--r--libbcachefs/darray.h28
-rw-r--r--libbcachefs/data_update.c32
-rw-r--r--libbcachefs/dirent.c5
-rw-r--r--libbcachefs/dirent.h3
-rw-r--r--libbcachefs/disk_groups.c4
-rw-r--r--libbcachefs/ec.c58
-rw-r--r--libbcachefs/ec.h2
-rw-r--r--libbcachefs/errcode.h1
-rw-r--r--libbcachefs/fs-io-pagecache.c2
-rw-r--r--libbcachefs/fs-io-pagecache.h2
-rw-r--r--libbcachefs/fs.c14
-rw-r--r--libbcachefs/fsck.c40
-rw-r--r--libbcachefs/inode.c33
-rw-r--r--libbcachefs/io_misc.c12
-rw-r--r--libbcachefs/io_read.c4
-rw-r--r--libbcachefs/io_write.c17
-rw-r--r--libbcachefs/journal.c69
-rw-r--r--libbcachefs/journal.h98
-rw-r--r--libbcachefs/journal_io.c224
-rw-r--r--libbcachefs/journal_reclaim.c110
-rw-r--r--libbcachefs/journal_reclaim.h15
-rw-r--r--libbcachefs/journal_types.h33
-rw-r--r--libbcachefs/logged_ops.c4
-rw-r--r--libbcachefs/lru.c2
-rw-r--r--libbcachefs/migrate.c2
-rw-r--r--libbcachefs/move.c2
-rw-r--r--libbcachefs/movinggc.c1
-rw-r--r--libbcachefs/rebalance.c6
-rw-r--r--libbcachefs/recovery.c104
-rw-r--r--libbcachefs/reflink.c2
-rw-r--r--libbcachefs/sb-clean.c2
-rw-r--r--libbcachefs/sb-errors.c2
-rw-r--r--libbcachefs/sb-members.c2
-rw-r--r--libbcachefs/snapshot.c12
-rw-r--r--libbcachefs/str_hash.h25
-rw-r--r--libbcachefs/subvolume.c6
-rw-r--r--libbcachefs/subvolume_types.h2
-rw-r--r--libbcachefs/super-io.c2
-rw-r--r--libbcachefs/super.c3
-rw-r--r--libbcachefs/trace.h11
-rw-r--r--libbcachefs/util.c140
-rw-r--r--libbcachefs/util.h35
65 files changed, 1034 insertions, 973 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 8abb4edd..bf2104d1 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-d464ec667b2b9de097e39d1505b45aafd87a9552
+3ca08ab51ec996180c20105489176b8c4327240c
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 35a7207e..f9a57129 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -278,4 +278,7 @@ static inline void dump_stack(void) {}
#define unsafe_memcpy(dst, src, bytes, justification) \
memcpy(dst, src, bytes)
+#define DECLARE_FLEX_ARRAY(TYPE, NAME) \
+ __DECLARE_FLEX_ARRAY(TYPE, NAME)
+
#endif
diff --git a/include/linux/list.h b/include/linux/list.h
index bdd09efa..d176d0d3 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -98,4 +98,15 @@ static inline void hlist_del_init(struct hlist_node *n)
pos; \
pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
+static inline size_t list_count_nodes(struct list_head *head)
+{
+ struct list_head *pos;
+ size_t count = 0;
+
+ list_for_each(pos, head)
+ count++;
+
+ return count;
+}
+
#endif /* _LIST_LIST_H */
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 1fec0e67..113273b2 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -561,8 +561,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
if (ret)
break;
@@ -581,8 +581,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
if (have_bucket_gens_key && !ret)
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
bch2_trans_put(trans);
@@ -1267,7 +1267,7 @@ delete:
ret = bch2_btree_delete_extent_at(trans, iter,
iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
+ BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw);
goto out;
}
@@ -1422,8 +1422,8 @@ int bch2_check_alloc_info(struct bch_fs *c)
}
ret = bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw);
if (ret)
goto bkey_err;
@@ -1453,7 +1453,7 @@ bkey_err:
for_each_btree_key_commit(trans, iter,
BTREE_ID_bucket_gens, POS_MIN,
BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
bch2_check_bucket_gens_key(trans, &iter, k));
err:
bch2_trans_put(trans);
@@ -1546,7 +1546,7 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
bch2_check_alloc_to_lru_ref(trans, &iter)));
if (ret)
bch_err_fn(c, ret);
@@ -1655,7 +1655,7 @@ write:
ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
@@ -1760,7 +1760,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
@@ -1884,8 +1884,8 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
ret = bch2_bucket_do_index(trans, k, a, true) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_lazy_rw|
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;
@@ -1905,8 +1905,8 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_lazy_rw|
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index 5ed96ddd..5025a71a 100644
--- a/libbcachefs/backpointers.c
+++ b/libbcachefs/backpointers.c
@@ -5,6 +5,7 @@
#include "backpointers.h"
#include "btree_cache.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "error.h"
@@ -220,18 +221,22 @@ out:
static void backpointer_not_found(struct btree_trans *trans,
struct bpos bp_pos,
struct bch_backpointer bp,
- struct bkey_s_c k,
- const char *thing_it_points_to)
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+ /*
+ * If we're using the btree write buffer, the backpointer we were
+ * looking at may have already been deleted - failure to find what it
+ * pointed to is not an error:
+ */
if (likely(!bch2_backpointers_no_use_write_buffer))
return;
prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
- thing_it_points_to);
+ bp.level ? "btree node" : "extent");
prt_printf(&buf, "bucket: ");
bch2_bpos_to_text(&buf, bucket);
prt_printf(&buf, "\n ");
@@ -257,56 +262,37 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
struct bch_backpointer bp,
unsigned iter_flags)
{
- struct bch_fs *c = trans->c;
- struct btree_root *r = bch2_btree_id_root(c, bp.btree_id);
- struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
- struct bkey_s_c k;
-
- bch2_trans_node_iter_init(trans, iter,
- bp.btree_id,
- bp.pos,
- 0,
- min(bp.level, r->level),
- iter_flags);
- k = bch2_btree_iter_peek_slot(iter);
- if (bkey_err(k)) {
- bch2_trans_iter_exit(trans, iter);
- return k;
- }
-
- if (bp.level == r->level + 1)
- k = bkey_i_to_s_c(&r->key);
-
- if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
- return k;
-
- bch2_trans_iter_exit(trans, iter);
+ if (likely(!bp.level)) {
+ struct bch_fs *c = trans->c;
+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+ struct bkey_s_c k;
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.btree_id,
+ bp.pos,
+ 0, 0,
+ iter_flags);
+ k = bch2_btree_iter_peek_slot(iter);
+ if (bkey_err(k)) {
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+ }
- if (unlikely(bch2_backpointers_no_use_write_buffer)) {
- if (bp.level) {
- struct btree *b;
+ if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+ return k;
- /*
- * If a backpointer for a btree node wasn't found, it may be
- * because it was overwritten by a new btree node that hasn't
- * been written out yet - backpointer_get_node() checks for
- * this:
- */
- b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
- if (!IS_ERR_OR_NULL(b))
- return bkey_i_to_s_c(&b->key);
+ bch2_trans_iter_exit(trans, iter);
+ backpointer_not_found(trans, bp_pos, bp, k);
+ return bkey_s_c_null;
+ } else {
+ struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
+ if (IS_ERR_OR_NULL(b)) {
bch2_trans_iter_exit(trans, iter);
-
- if (IS_ERR(b))
- return bkey_s_c_err(PTR_ERR(b));
- return bkey_s_c_null;
+ return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null;
}
-
- backpointer_not_found(trans, bp_pos, bp, k, "extent");
+ return bkey_i_to_s_c(&b->key);
}
-
- return bkey_s_c_null;
}
struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
@@ -327,19 +313,20 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
bp.level - 1,
0);
b = bch2_btree_iter_peek_node(iter);
- if (IS_ERR(b))
+ if (IS_ERR_OR_NULL(b))
goto err;
- if (b && extent_matches_bp(c, bp.btree_id, bp.level,
- bkey_i_to_s_c(&b->key),
- bucket, bp))
+ BUG_ON(b->c.level != bp.level - 1);
+
+ if (extent_matches_bp(c, bp.btree_id, bp.level,
+ bkey_i_to_s_c(&b->key),
+ bucket, bp))
return b;
- if (b && btree_node_will_make_reachable(b)) {
+ if (btree_node_will_make_reachable(b)) {
b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
} else {
- backpointer_not_found(trans, bp_pos, bp,
- bkey_i_to_s_c(&b->key), "btree node");
+ backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
b = NULL;
}
err:
@@ -395,7 +382,7 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_backpointers, POS_MIN, 0, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
bch2_check_btree_backpointer(trans, &iter, k)));
if (ret)
bch_err_fn(c, ret);
@@ -642,8 +629,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
do {
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_lazy_rw|
+ BCH_TRANS_COMMIT_no_enospc,
check_extent_to_backpointers(trans, &iter,
bucket_start, bucket_end,
&last_flushed));
@@ -657,8 +644,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
break;
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_lazy_rw|
+ BCH_TRANS_COMMIT_no_enospc,
check_btree_root_to_backpointers(trans, btree_id,
bucket_start, bucket_end,
&last_flushed));
@@ -797,7 +784,8 @@ static int check_one_backpointer(struct btree_trans *trans,
if (fsck_err_on(!k.k, c,
backpointer_to_missing_ptr,
- "backpointer for missing extent\n %s",
+ "backpointer for missing %s\n %s",
+ bp.v->level ? "btree node" : "extent",
(bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
goto out;
@@ -819,7 +807,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_one_backpointer(trans, start, end,
bkey_s_c_to_backpointer(k),
&last_flushed_pos));
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 9cb86849..3117ab44 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -401,7 +401,9 @@ BCH_DEBUG_PARAMS_DEBUG()
x(journal_flush_write) \
x(journal_noflush_write) \
x(journal_flush_seq) \
- x(blocked_journal) \
+ x(blocked_journal_low_on_space) \
+ x(blocked_journal_low_on_pin) \
+ x(blocked_journal_max_in_flight) \
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
x(nocow_lock_contended)
@@ -617,7 +619,7 @@ struct journal_seq_blacklist_table {
u64 start;
u64 end;
bool dirty;
- } entries[0];
+ } entries[];
};
struct journal_keys {
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 7a1c2440..0a750953 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -2256,7 +2256,8 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
enum btree_id_flags {
BTREE_ID_EXTENTS = BIT(0),
BTREE_ID_SNAPSHOTS = BIT(1),
- BTREE_ID_DATA = BIT(2),
+ BTREE_ID_SNAPSHOT_FIELD = BIT(2),
+ BTREE_ID_DATA = BIT(3),
};
#define BCH_BTREE_IDS() \
@@ -2311,12 +2312,12 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_bucket_gens)) \
x(snapshot_trees, 15, 0, \
BIT_ULL(KEY_TYPE_snapshot_tree)) \
- x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \
+ x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_set)) \
x(logged_ops, 17, 0, \
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
BIT_ULL(KEY_TYPE_logged_op_finsert)) \
- x(rebalance_work, 18, BTREE_ID_SNAPSHOTS, \
+ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
enum btree_id {
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 2f518d7e..761f5e33 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -186,15 +186,20 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
if (type != BKEY_TYPE_btree) {
enum btree_id btree = type - 1;
- bkey_fsck_err_on(!btree_type_has_snapshots(btree) &&
- k.k->p.snapshot, c, err,
- bkey_snapshot_nonzero,
- "nonzero snapshot");
-
- bkey_fsck_err_on(btree_type_has_snapshots(btree) &&
- !k.k->p.snapshot, c, err,
- bkey_snapshot_zero,
- "snapshot == 0");
+ if (btree_type_has_snapshots(btree)) {
+ bkey_fsck_err_on(!k.k->p.snapshot, c, err,
+ bkey_snapshot_zero,
+ "snapshot == 0");
+ } else if (!btree_type_has_snapshot_field(btree)) {
+ bkey_fsck_err_on(k.k->p.snapshot, c, err,
+ bkey_snapshot_nonzero,
+ "nonzero snapshot");
+ } else {
+ /*
+ * btree uses snapshot field but it's not required to be
+ * nonzero
+ */
+ }
bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err,
bkey_at_pos_max,
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index 3a370b70..912adadf 100644
--- a/libbcachefs/bkey_methods.h
+++ b/libbcachefs/bkey_methods.h
@@ -93,7 +93,6 @@ static inline int bch2_mark_key(struct btree_trans *trans,
enum btree_update_flags {
__BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
__BTREE_UPDATE_NOJOURNAL,
- __BTREE_UPDATE_PREJOURNAL,
__BTREE_UPDATE_KEY_CACHE_RECLAIM,
__BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
@@ -108,7 +107,6 @@ enum btree_update_flags {
#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL)
-#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL)
#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 7cd517ae..c4922bd3 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -1502,7 +1502,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS(ca->dev_idx, ca->mi.first_bucket),
BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
bch2_alloc_write_key(trans, &iter, k, metadata_only));
if (ret < 0) {
@@ -1659,7 +1659,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_reflink, POS_MIN,
BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_gc_write_reflink_key(trans, &iter, k, &idx));
c->reflink_gc_nr = 0;
@@ -1783,7 +1783,7 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_stripes, POS_MIN,
BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_gc_write_stripes_key(trans, &iter, k));
bch2_trans_put(trans);
@@ -2019,7 +2019,7 @@ int bch2_gc_gens(struct bch_fs *c)
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
k,
NULL, NULL,
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_enospc,
gc_btree_gens_key(trans, &iter, k));
if (ret && !bch2_err_matches(ret, EROFS))
bch_err_fn(c, ret);
@@ -2032,7 +2032,7 @@ int bch2_gc_gens(struct bch_fs *c)
BTREE_ITER_PREFETCH,
k,
NULL, NULL,
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_enospc,
bch2_alloc_write_oldest_gen(trans, &iter, k));
if (ret && !bch2_err_matches(ret, EROFS))
bch_err_fn(c, ret);
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 37d896ed..1f73ee0e 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1801,9 +1801,9 @@ static void btree_node_write_work(struct work_struct *work)
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
BCH_WATERMARK_reclaim|
- BTREE_INSERT_JOURNAL_RECLAIM|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOCHECK_RW,
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw,
!wbio->wbio.failed.nr));
if (ret)
goto err;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index ba392eb0..104172f6 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -257,7 +257,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
- !btree_type_has_snapshots(iter->btree_id));
+ !btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
bch2_btree_path_verify(trans, iter->update_path);
@@ -1214,8 +1214,6 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
struct btree_path *path, struct bpos new_pos,
bool intent, unsigned long ip, int cmp)
{
- unsigned level = path->level;
-
bch2_trans_verify_not_in_restart(trans);
EBUG_ON(!path->ref);
@@ -1231,7 +1229,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
goto out;
}
- level = btree_path_up_until_good_node(trans, path, cmp);
+ unsigned level = btree_path_up_until_good_node(trans, path, cmp);
if (btree_path_node(path, level)) {
struct btree_path_level *l = &path->l[level];
@@ -2835,8 +2833,9 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
static inline void check_srcu_held_too_long(struct btree_trans *trans)
{
- WARN(time_after(jiffies, trans->srcu_lock_time + HZ * 10),
- "btree trans held srcu lock (delaying memory reclaim) by more than 10 seconds");
+ WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
+ "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
+ (jiffies - trans->srcu_lock_time) / HZ);
}
void bch2_trans_srcu_unlock(struct btree_trans *trans)
@@ -3088,8 +3087,6 @@ void bch2_trans_put(struct btree_trans *trans)
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
}
- bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-
kfree(trans->extra_journal_entries.data);
if (trans->fs_usage_deltas) {
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 5e103f51..85e7cb52 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -416,7 +416,7 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
flags |= BTREE_ITER_IS_EXTENTS;
if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
- !btree_type_has_snapshots(btree_id))
+ !btree_type_has_snapshot_field(btree_id))
flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 98aeedb7..8e80a5b6 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -90,10 +90,13 @@ static void bkey_cached_free(struct btree_key_cache *bc,
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
- if (ck->c.lock.readers)
+ if (ck->c.lock.readers) {
list_move_tail(&ck->list, &bc->freed_pcpu);
- else
+ bc->nr_freed_pcpu++;
+ } else {
list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ bc->nr_freed_nonpcpu++;
+ }
atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
@@ -110,6 +113,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
{
struct bkey_cached *pos;
+ bc->nr_freed_nonpcpu++;
+
list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
pos->btree_trans_barrier_seq)) {
@@ -159,6 +164,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
#else
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ bc->nr_freed_nonpcpu++;
mutex_unlock(&bc->lock);
#endif
} else {
@@ -218,6 +224,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
f->nr < ARRAY_SIZE(f->objs) / 2) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
+ bc->nr_freed_nonpcpu--;
f->objs[f->nr++] = ck;
}
@@ -230,6 +237,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
if (!list_empty(&bc->freed_nonpcpu)) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
+ bc->nr_freed_nonpcpu--;
}
mutex_unlock(&bc->lock);
#endif
@@ -649,8 +657,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
(ck->journal.seq == journal_last_seq(j)
? BCH_WATERMARK_reclaim
: 0)|
@@ -665,7 +673,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
goto out;
bch2_journal_pin_drop(j, &ck->journal);
- bch2_journal_preres_put(j, &ck->res);
BUG_ON(!btree_node_locked(c_iter.path, 0));
@@ -728,7 +735,7 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
ret = commit_do(trans, NULL, NULL, 0,
btree_key_cache_flush_pos(trans, key, seq,
- BTREE_INSERT_JOURNAL_RECLAIM, false));
+ BCH_TRANS_COMMIT_journal_reclaim, false));
unlock:
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
@@ -763,18 +770,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
BUG_ON(insert->k.u64s > ck->u64s);
- if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- int difference;
-
- BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);
-
- difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
- if (difference > 0) {
- trans->journal_preres.u64s -= difference;
- ck->res.u64s += difference;
- }
- }
-
bkey_copy(ck->k, insert);
ck->valid = true;
@@ -852,6 +847,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
+ scanned += bc->nr_freed_nonpcpu;
+
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@@ -861,13 +858,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
- scanned++;
freed++;
+ bc->nr_freed_nonpcpu--;
}
if (scanned >= nr)
goto out;
+ scanned += bc->nr_freed_pcpu;
+
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@@ -877,8 +876,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
- scanned++;
freed++;
+ bc->nr_freed_pcpu--;
}
if (scanned >= nr)
@@ -985,6 +984,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
}
#endif
+ BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
+ BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
+
list_splice(&bc->freed_pcpu, &items);
list_splice(&bc->freed_nonpcpu, &items);
@@ -994,7 +996,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
cond_resched();
bch2_journal_pin_drop(&c->journal, &ck->journal);
- bch2_journal_preres_put(&c->journal, &ck->res);
list_del(&ck->list);
kfree(ck->k);
diff --git a/libbcachefs/btree_key_cache_types.h b/libbcachefs/btree_key_cache_types.h
new file mode 100644
index 00000000..cfd09f51
--- /dev/null
+++ b/libbcachefs/btree_key_cache_types.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+
+struct btree_key_cache_freelist {
+ struct bkey_cached *objs[16];
+ unsigned nr;
+};
+
+struct btree_key_cache {
+ struct mutex lock;
+ struct rhashtable table;
+ bool table_init_done;
+
+ struct list_head freed_pcpu;
+ size_t nr_freed_pcpu;
+ struct list_head freed_nonpcpu;
+ size_t nr_freed_nonpcpu;
+
+ struct shrinker shrink;
+ unsigned shrink_iter;
+ struct btree_key_cache_freelist __percpu *pcpu_freed;
+
+ atomic_long_t nr_freed;
+ atomic_long_t nr_keys;
+ atomic_long_t nr_dirty;
+};
+
+struct bkey_cached_key {
+ u32 btree_id;
+ struct bpos pos;
+} __packed __aligned(4);
+
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */
diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c
index 32693f7c..70077efa 100644
--- a/libbcachefs/btree_trans_commit.c
+++ b/libbcachefs/btree_trans_commit.c
@@ -78,6 +78,53 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
bch2_btree_init_next(trans, b);
}
+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+ while (--i >= trans->updates) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
+
+ bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
+ }
+
+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
+}
+
+static inline int bch2_trans_lock_write(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+
+ EBUG_ON(trans->write_locked);
+
+ trans_for_each_update(trans, i) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
+
+ if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+ return trans_lock_write_fail(trans, i);
+
+ if (!i->cached)
+ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+ }
+
+ trans->write_locked = true;
+ return 0;
+}
+
+static inline void bch2_trans_unlock_write(struct btree_trans *trans)
+{
+ if (likely(trans->write_locked)) {
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ if (!same_leaf_as_prev(trans, i))
+ bch2_btree_node_unlock_write_inlined(trans, i->path,
+ insert_l(i)->b);
+ trans->write_locked = false;
+ }
+}
+
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
@@ -269,23 +316,13 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
BUG_ON(i->level != i->path->level);
BUG_ON(i->btree_id != i->path->btree_id);
EBUG_ON(!i->level &&
+ btree_type_has_snapshots(i->btree_id) &&
!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
i->k->k.p.snapshot &&
bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
}
-static noinline int
-bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags,
- unsigned long trace_ip)
-{
- return drop_locks_do(trans,
- bch2_journal_preres_get(&trans->c->journal,
- &trans->journal_preres,
- trans->journal_preres_u64s,
- (flags & BCH_WATERMARK_MASK)));
-}
-
static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
unsigned flags)
{
@@ -320,6 +357,45 @@ static inline int btree_key_can_insert(struct btree_trans *trans,
return 0;
}
+noinline static int
+btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
+ struct btree_path *path, unsigned new_u64s)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+ struct bkey_cached *ck = (void *) path->l[0].b;
+ struct bkey_i *new_k;
+ int ret;
+
+ bch2_trans_unlock_write(trans);
+ bch2_trans_unlock(trans);
+
+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+ if (!new_k) {
+ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_id_str(path->btree_id), new_u64s);
+ return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+ }
+
+ ret = bch2_trans_relock(trans) ?:
+ bch2_trans_lock_write(trans);
+ if (unlikely(ret)) {
+ kfree(new_k);
+ return ret;
+ }
+
+ memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
+
+ trans_for_each_update(trans, i)
+ if (i->old_v == &ck->k->v)
+ i->old_v = &new_k->v;
+
+ kfree(ck->k);
+ ck->u64s = new_u64s;
+ ck->k = new_k;
+ return 0;
+}
+
static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
struct btree_path *path, unsigned u64s)
{
@@ -333,7 +409,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bch2_btree_key_cache_must_wait(c) &&
- !(flags & BTREE_INSERT_JOURNAL_RECLAIM))
+ !(flags & BCH_TRANS_COMMIT_journal_reclaim))
return -BCH_ERR_btree_insert_need_journal_reclaim;
/*
@@ -346,12 +422,9 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
return 0;
new_u64s = roundup_pow_of_two(u64s);
- new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
- if (!new_k) {
- bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_id_str(path->btree_id), new_u64s);
- return -BCH_ERR_ENOMEM_btree_key_cache_insert;
- }
+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+ if (unlikely(!new_k))
+ return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
trans_for_each_update(trans, i)
if (i->old_v == &ck->k->v)
@@ -583,6 +656,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
*stopped_at = i;
return ret;
}
+
+ i->k->k.needs_whiteout = false;
}
if (trans->nr_wb_updates &&
@@ -593,7 +668,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
* Don't get journal reservation until after we know insert will
* succeed:
*/
- if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
ret = bch2_trans_journal_res_get(trans,
(flags & BCH_WATERMARK_MASK)|
JOURNAL_RES_GET_NONBLOCK);
@@ -602,8 +677,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
if (unlikely(trans->journal_transaction_names))
journal_transaction_name(trans);
- } else {
- trans->journal_res.seq = c->journal.replay_journal_seq;
}
/*
@@ -612,7 +685,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
*/
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- !(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+ !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
if (bch2_journal_seq_verify)
trans_for_each_update(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
@@ -626,7 +699,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
return -BCH_ERR_btree_insert_need_mark_replicas;
if (trans->nr_wb_updates) {
- EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
+ EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res);
ret = bch2_btree_insert_keys_write_buffer(trans);
if (ret)
@@ -663,7 +736,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
trans->journal_res.u64s -= trans->extra_journal_entries.nr;
}
- if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
struct journal *j = &c->journal;
struct jset_entry *entry;
@@ -705,15 +778,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
trans_for_each_update(trans, i) {
- i->k->k.needs_whiteout = false;
-
if (!i->cached) {
- u64 seq = trans->journal_res.seq;
-
- if (i->flags & BTREE_UPDATE_PREJOURNAL)
- seq = i->seq;
-
- bch2_btree_insert_key_leaf(trans, i->path, i->k, seq);
+ bch2_btree_insert_key_leaf(trans, i->path, i->k, trans->journal_res.seq);
} else if (!i->key_cache_already_flushed)
bch2_btree_insert_key_cached(trans, flags, i);
else {
@@ -731,37 +797,6 @@ revert_fs_usage:
return ret;
}
-static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
-{
- while (--i >= trans->updates) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
- }
-
- trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
-}
-
-static inline int trans_lock_write(struct btree_trans *trans)
-{
- struct btree_insert_entry *i;
-
- trans_for_each_update(trans, i) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
- return trans_lock_write_fail(trans, i);
-
- if (!i->cached)
- bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
- }
-
- return 0;
-}
-
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
struct btree_insert_entry *i;
@@ -799,6 +834,12 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
return -EINVAL;
}
+static int bch2_trans_commit_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
@@ -829,15 +870,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
}
}
- ret = bch2_journal_preres_get(&c->journal,
- &trans->journal_preres, trans->journal_preres_u64s,
- (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
- if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked))
- ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip);
- if (unlikely(ret))
- return ret;
-
- ret = trans_lock_write(trans);
+ ret = bch2_trans_lock_write(trans);
if (unlikely(ret))
return ret;
@@ -846,20 +879,19 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
if (!ret && unlikely(trans->journal_replay_not_finished))
bch2_drop_overwrites_from_journal(trans);
- trans_for_each_update(trans, i)
- if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_unlock_write_inlined(trans, i->path,
- insert_l(i)->b);
+ bch2_trans_unlock_write(trans);
if (!ret && trans->journal_pin)
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
- trans->journal_pin, NULL);
+ trans->journal_pin,
+ bch2_trans_commit_journal_pin_flush);
/*
* Drop journal reservation after dropping write locks, since dropping
* the journal reservation may kick off a journal write:
*/
- bch2_journal_res_put(&c->journal, &trans->journal_res);
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+ bch2_journal_res_put(&c->journal, &trans->journal_res);
return ret;
}
@@ -896,7 +928,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
* XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
* flag
*/
- if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
(flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
@@ -931,7 +963,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
if (wb->state.nr > wb->size * 3 / 4) {
bch2_trans_begin(trans);
ret = __bch2_btree_write_buffer_flush(trans,
- flags|BTREE_INSERT_NOCHECK_RW, true);
+ flags|BCH_TRANS_COMMIT_no_check_rw, true);
if (!ret) {
trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
@@ -951,8 +983,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
- !(flags & BTREE_INSERT_NOWAIT) &&
- (flags & BTREE_INSERT_NOFAIL), c,
+ (flags & BCH_TRANS_COMMIT_no_enospc), c,
"%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
return ret;
@@ -964,7 +995,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
struct bch_fs *c = trans->c;
int ret;
- if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
+ if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) ||
test_bit(BCH_FS_STARTED, &c->flags))
return -BCH_ERR_erofs_trans_commit;
@@ -1002,7 +1033,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
struct bch_fs *c = trans->c;
struct btree_insert_entry *i = NULL;
struct btree_write_buffered_key *wb;
- unsigned u64s;
int ret = 0;
if (!trans->nr_updates &&
@@ -1010,9 +1040,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
!trans->extra_journal_entries.nr)
goto out_reset;
- if (flags & BTREE_INSERT_GC_LOCK_HELD)
- lockdep_assert_held(&c->gc_lock);
-
ret = bch2_trans_commit_run_triggers(trans);
if (ret)
goto out_reset;
@@ -1021,7 +1048,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
struct printbuf buf = PRINTBUF;
enum bkey_invalid_flags invalid_flags = 0;
- if (!(flags & BTREE_INSERT_JOURNAL_REPLAY))
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
@@ -1039,7 +1066,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
goto out_reset;
}
- if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
+ if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
ret = bch2_trans_commit_get_rw_cold(trans, flags);
if (ret)
@@ -1052,7 +1079,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
bch2_trans_unlock(trans);
ret = __bch2_btree_write_buffer_flush(trans,
- flags|BTREE_INSERT_NOCHECK_RW, true);
+ flags|BCH_TRANS_COMMIT_no_check_rw, true);
if (!ret) {
trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
@@ -1062,13 +1089,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
- memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
-
trans->journal_u64s = trans->extra_journal_entries.nr;
- trans->journal_preres_u64s = 0;
-
trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
-
if (trans->journal_transaction_names)
trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
@@ -1084,16 +1106,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (i->key_cache_already_flushed)
continue;
- /* we're going to journal the key being updated: */
- u64s = jset_u64s(i->k->k.u64s);
- if (i->cached &&
- likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY)))
- trans->journal_preres_u64s += u64s;
-
if (i->flags & BTREE_UPDATE_NOJOURNAL)
continue;
- trans->journal_u64s += u64s;
+ /* we're going to journal the key being updated: */
+ trans->journal_u64s += jset_u64s(i->k->k.u64s);
/* and we're also going to log the overwrite: */
if (trans->journal_transaction_names)
@@ -1106,14 +1123,15 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (trans->extra_journal_res) {
ret = bch2_disk_reservation_add(c, trans->disk_res,
trans->extra_journal_res,
- (flags & BTREE_INSERT_NOFAIL)
+ (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0);
if (ret)
goto err;
}
retry:
bch2_trans_verify_not_in_restart(trans);
- memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
@@ -1125,9 +1143,7 @@ retry:
trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
- bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-
- if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
if (!ret)
@@ -1140,5 +1156,17 @@ err:
if (ret)
goto out;
+ /*
+ * We might have done another transaction commit in the error path -
+ * i.e. btree write buffer flush - which will have made use of
+ * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
+ * how the journal sequence number to pin is passed in - so we must
+ * restart:
+ */
+ if (flags & BCH_TRANS_COMMIT_no_journal_res) {
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto out;
+ }
+
goto retry;
}
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 4b9cc61a..a3f24cd0 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -5,7 +5,7 @@
#include <linux/list.h>
#include <linux/rhashtable.h>
-//#include "bkey_methods.h"
+#include "btree_key_cache_types.h"
#include "buckets_types.h"
#include "darray.h"
#include "errcode.h"
@@ -322,31 +322,6 @@ struct btree_iter {
#endif
};
-struct btree_key_cache_freelist {
- struct bkey_cached *objs[16];
- unsigned nr;
-};
-
-struct btree_key_cache {
- struct mutex lock;
- struct rhashtable table;
- bool table_init_done;
- struct list_head freed_pcpu;
- struct list_head freed_nonpcpu;
- struct shrinker shrink;
- unsigned shrink_iter;
- struct btree_key_cache_freelist __percpu *pcpu_freed;
-
- atomic_long_t nr_freed;
- atomic_long_t nr_keys;
- atomic_long_t nr_dirty;
-};
-
-struct bkey_cached_key {
- u32 btree_id;
- struct bpos pos;
-} __packed __aligned(4);
-
#define BKEY_CACHED_ACCESSED 0
#define BKEY_CACHED_DIRTY 1
@@ -362,7 +337,6 @@ struct bkey_cached {
struct rhash_head hash;
struct list_head list;
- struct journal_preres res;
struct journal_entry_pin journal;
u64 seq;
@@ -392,7 +366,6 @@ struct btree_insert_entry {
u8 old_btree_u64s;
struct bkey_i *k;
struct btree_path *path;
- u64 seq;
/* key being overwritten: */
struct bkey old_k;
const struct bch_val *old_v;
@@ -441,6 +414,7 @@ struct btree_trans {
bool journal_replay_not_finished:1;
bool is_initial_gc:1;
bool notrace_relock_fail:1;
+ bool write_locked:1;
enum bch_errcode restarted:16;
u32 restart_count;
unsigned long last_begin_ip;
@@ -472,11 +446,9 @@ struct btree_trans {
struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
- struct journal_preres journal_preres;
u64 *journal_seq;
struct disk_reservation *disk_res;
unsigned journal_u64s;
- unsigned journal_preres_u64s;
struct replicas_delta_list *fs_usage_deltas;
};
@@ -717,6 +689,17 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
return (1U << id) & mask;
}
+static inline bool btree_type_has_snapshot_field(enum btree_id id)
+{
+ const unsigned mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return (1U << id) & mask;
+}
+
static inline bool btree_type_has_ptrs(enum btree_id id)
{
const unsigned mask = 0
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index 324767c0..1837f848 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -380,21 +380,12 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i, n;
- u64 seq = 0;
int cmp;
EBUG_ON(!path->should_be_locked);
EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
EBUG_ON(!bpos_eq(k->k.p, path->pos));
- /*
- * The transaction journal res hasn't been allocated at this point.
- * That occurs at commit time. Reuse the seq field to pass in the seq
- * of a prejournaled key.
- */
- if (flags & BTREE_UPDATE_PREJOURNAL)
- seq = trans->journal_res.seq;
-
n = (struct btree_insert_entry) {
.flags = flags,
.bkey_type = __btree_node_type(path->level, path->btree_id),
@@ -403,7 +394,6 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
.cached = path->cached,
.path = path,
.k = k,
- .seq = seq,
.ip_allocated = ip,
};
@@ -431,7 +421,6 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
i->cached = n.cached;
i->k = n.k;
i->path = n.path;
- i->seq = n.seq;
i->ip_allocated = n.ip_allocated;
} else {
array_insert_item(trans->updates, trans->nr_updates,
@@ -542,18 +531,6 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
}
-/*
- * Add a transaction update for a key that has already been journaled.
- */
-int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
- struct btree_iter *iter, struct bkey_i *k,
- enum btree_update_flags flags)
-{
- trans->journal_res.seq = seq;
- return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL|
- BTREE_UPDATE_PREJOURNAL);
-}
-
int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
enum btree_id btree,
struct bkey_i *k)
@@ -792,7 +769,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
bch2_trans_commit(trans, &disk_res, journal_seq,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(trans->c, &disk_res);
err:
/*
@@ -897,7 +874,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
} else {
ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_LAZY_RW|commit_flags,
+ BCH_TRANS_COMMIT_lazy_rw|commit_flags,
__bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args));
}
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 9816d228..14a2315a 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -21,37 +21,28 @@ void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
struct bkey_i *, u64);
-enum btree_insert_flags {
+#define BCH_TRANS_COMMIT_FLAGS() \
+ x(no_enospc, "don't check for enospc") \
+ x(no_check_rw, "don't attempt to take a ref on c->writes") \
+ x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \
+ x(no_journal_res, "don't take a journal reservation, instead " \
+ "pin journal entry referred to by trans->journal_res.seq") \
+ x(journal_reclaim, "operation required for journal reclaim; may return error" \
+ "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
+
+enum __bch_trans_commit_flags {
/* First bits for bch_watermark: */
- __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS,
- __BTREE_INSERT_NOCHECK_RW,
- __BTREE_INSERT_LAZY_RW,
- __BTREE_INSERT_JOURNAL_REPLAY,
- __BTREE_INSERT_JOURNAL_RECLAIM,
- __BTREE_INSERT_NOWAIT,
- __BTREE_INSERT_GC_LOCK_HELD,
- __BCH_HASH_SET_MUST_CREATE,
- __BCH_HASH_SET_MUST_REPLACE,
+ __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
+#define x(n, ...) __BCH_TRANS_COMMIT_##n,
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
};
-/* Don't check for -ENOSPC: */
-#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL)
-
-#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW)
-#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW)
-
-/* Insert is for journal replay - don't get journal reservations: */
-#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY)
-
-/* Insert is being called from journal reclaim path: */
-#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM)
-
-/* Don't block on allocation failure (for new btree nodes: */
-#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT)
-#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD)
-
-#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE)
-#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE)
+enum bch_trans_commit_flags {
+#define x(n, ...) BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
+};
int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
unsigned, unsigned);
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 89ada89e..9affcb22 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -475,9 +475,6 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
/*
* Protects reaping from the btree node cache and using the btree node
* open bucket reserve:
- *
- * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
- * blocking on this lock:
*/
ret = bch2_btree_cache_cannibalize_lock(c, cl);
if (ret)
@@ -487,9 +484,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
struct prealloc_nodes *p = as->prealloc_nodes + interior;
while (p->nr < nr_nodes[interior]) {
- b = __bch2_btree_node_alloc(trans, &as->disk_res,
- flags & BTREE_INSERT_NOWAIT ? NULL : cl,
- interior, flags);
+ b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
+ interior, flags);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
goto err;
@@ -513,8 +509,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
up_read(&c->gc_lock);
as->took_gc_lock = false;
- bch2_journal_preres_put(&c->journal, &as->journal_preres);
-
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
bch2_disk_reservation_put(c, &as->disk_res);
@@ -646,9 +640,9 @@ static void btree_update_nodes_written(struct btree_update *as)
*/
ret = commit_do(trans, &as->disk_res, &journal_seq,
BCH_WATERMARK_reclaim|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_JOURNAL_RECLAIM,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_journal_reclaim,
btree_update_nodes_written_trans(trans, as));
bch2_trans_unlock(trans);
@@ -734,8 +728,6 @@ err:
bch2_journal_pin_drop(&c->journal, &as->journal);
- bch2_journal_preres_put(&c->journal, &as->journal_preres);
-
mutex_lock(&c->btree_interior_update_lock);
for (i = 0; i < as->nr_new_nodes; i++) {
b = as->new_nodes[i];
@@ -818,6 +810,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
mutex_unlock(&c->btree_interior_update_lock);
}
+static int bch2_update_reparent_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
static void btree_update_reparent(struct btree_update *as,
struct btree_update *child)
{
@@ -828,7 +826,8 @@ static void btree_update_reparent(struct btree_update *as,
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
- bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
+ bch2_update_reparent_journal_pin_flush);
}
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
@@ -937,6 +936,12 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b
b->ob.v[--b->ob.nr];
}
+static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
/*
* @b is being split/rewritten: it may have pointers to not-yet-written btree
* nodes and thus outstanding btree_updates - redirect @b's
@@ -988,11 +993,13 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* when the new nodes are persistent and reachable on disk:
*/
w = btree_current_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+ bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
w = btree_prev_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+ bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
mutex_unlock(&c->btree_interior_update_lock);
@@ -1042,7 +1049,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
struct bch_fs *c = trans->c;
struct btree_update *as;
u64 start_time = local_clock();
- int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
+ int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes[2] = { 0, 0 };
unsigned update_level = level;
@@ -1061,7 +1068,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
- if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
+ if (flags & BCH_TRANS_COMMIT_journal_reclaim)
journal_flags |= JOURNAL_RES_GET_NONBLOCK;
journal_flags |= watermark;
@@ -1087,9 +1094,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
}
- if (flags & BTREE_INSERT_GC_LOCK_HELD)
- lockdep_assert_held(&c->gc_lock);
- else if (!down_read_trylock(&c->gc_lock)) {
+ if (!down_read_trylock(&c->gc_lock)) {
ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
if (ret) {
up_read(&c->gc_lock);
@@ -1103,7 +1108,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as->c = c;
as->start_time = start_time;
as->mode = BTREE_INTERIOR_NO_UPDATE;
- as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
+ as->took_gc_lock = true;
as->btree_id = path->btree_id;
as->update_level = update_level;
INIT_LIST_HEAD(&as->list);
@@ -1129,27 +1134,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (ret)
goto err;
- ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
- BTREE_UPDATE_JOURNAL_RES,
- journal_flags|JOURNAL_RES_GET_NONBLOCK);
- if (ret) {
- if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
- goto err;
- }
-
- ret = drop_locks_do(trans,
- bch2_journal_preres_get(&c->journal, &as->journal_preres,
- BTREE_UPDATE_JOURNAL_RES,
- journal_flags));
- if (ret == -BCH_ERR_journal_preres_get_blocked) {
- trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
- }
- if (ret)
- goto err;
- }
-
ret = bch2_disk_reservation_get(c, &as->disk_res,
(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
c->opts.metadata_replicas,
@@ -1167,7 +1151,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* flag
*/
if (bch2_err_matches(ret, ENOSPC) &&
- (flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark != BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
@@ -1855,7 +1839,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
parent = btree_node_parent(path, b);
as = bch2_btree_update_start(trans, path, level, false,
- BTREE_INSERT_NOFAIL|flags);
+ BCH_TRANS_COMMIT_no_enospc|flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto err;
@@ -1941,7 +1925,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
struct btree_update *as;
int ret;
- flags |= BTREE_INSERT_NOFAIL;
+ flags |= BCH_TRANS_COMMIT_no_enospc;
parent = btree_node_parent(iter->path, b);
as = bch2_btree_update_start(trans, iter->path, b->c.level,
@@ -2418,23 +2402,17 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry
struct jset_entry *
bch2_btree_roots_to_journal_entries(struct bch_fs *c,
- struct jset_entry *start,
- struct jset_entry *end)
+ struct jset_entry *end,
+ unsigned long skip)
{
- struct jset_entry *entry;
- unsigned long have = 0;
unsigned i;
- for (entry = start; entry < end; entry = vstruct_next(entry))
- if (entry->type == BCH_JSET_ENTRY_btree_root)
- __set_bit(entry->btree_id, &have);
-
mutex_lock(&c->btree_root_lock);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
- if (r->alive && !test_bit(i, &have)) {
+ if (r->alive && !test_bit(i, &skip)) {
journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
i, r->level, &r->key, r->key.k.u64s);
end = vstruct_next(end);
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index c2ffeb30..031076e7 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -55,7 +55,6 @@ struct btree_update {
unsigned update_level;
struct disk_reservation disk_res;
- struct journal_preres journal_preres;
/*
* BTREE_INTERIOR_UPDATING_NODE:
@@ -325,7 +324,7 @@ bool bch2_btree_interior_updates_flush(struct bch_fs *);
void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
- struct jset_entry *, struct jset_entry *);
+ struct jset_entry *, unsigned long);
void bch2_do_pending_node_rewrites(struct bch_fs *);
void bch2_free_pending_node_rewrites(struct bch_fs *);
diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c
index 76b6f2dc..a6bf6ed3 100644
--- a/libbcachefs/btree_write_buffer.c
+++ b/libbcachefs/btree_write_buffer.c
@@ -9,9 +9,11 @@
#include "journal.h"
#include "journal_reclaim.h"
-#include <linux/atomic.h>
#include <linux/sort.h>
+static int bch2_btree_write_buffer_journal_flush(struct journal *,
+ struct journal_entry_pin *, u64);
+
static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
{
const struct btree_write_buffered_key *l = _l;
@@ -46,6 +48,13 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
if (ret)
return ret;
+ /*
+ * We can't clone a path that has write locks: unshare it now, before
+ * set_pos and traverse():
+ */
+ if (iter->path->ref > 1)
+ iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
+
path = iter->path;
if (!*write_locked) {
@@ -65,24 +74,18 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
(*fast)++;
-
- if (path->ref > 1) {
- /*
- * We can't clone a path that has write locks: if the path is
- * shared, unlock before set_pos(), traverse():
- */
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
- *write_locked = false;
- }
return 0;
trans_commit:
- return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ trans->journal_res.seq = wb->journal_seq;
+
+ return bch2_trans_update(trans, iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL,
commit_flags|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RECLAIM);
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_TRANS_COMMIT_journal_reclaim);
}
static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
@@ -125,9 +128,11 @@ btree_write_buffered_insert(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+ trans->journal_res.seq = wb->journal_seq;
+
ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_update(trans, &iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -151,7 +156,8 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
if (!locked && !mutex_trylock(&wb->flush_lock))
return 0;
- bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
+ bch2_journal_pin_copy(j, &pin, &wb->journal_pin,
+ bch2_btree_write_buffer_journal_flush);
bch2_journal_pin_drop(j, &wb->journal_pin);
s = btree_write_buffer_switch(wb);
@@ -169,7 +175,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
* However, since we're not flushing in the order they appear in the
* journal we won't be able to drop our journal pin until everything is
* flushed - which means this could deadlock the journal if we weren't
- * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
+ * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
* if it would block taking a journal reservation.
*
* If that happens, simply skip the key so we can optimistically insert
@@ -253,21 +259,14 @@ slowpath:
if (!i->journal_seq)
continue;
- if (i->journal_seq > pin.seq) {
- struct journal_entry_pin pin2;
-
- memset(&pin2, 0, sizeof(pin2));
-
- bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
- bch2_journal_pin_drop(j, &pin);
- bch2_journal_pin_copy(j, &pin, &pin2, NULL);
- bch2_journal_pin_drop(j, &pin2);
- }
+ bch2_journal_pin_update(j, i->journal_seq, &pin,
+ bch2_btree_write_buffer_journal_flush);
ret = commit_do(trans, NULL, NULL,
commit_flags|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RECLAIM,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_TRANS_COMMIT_journal_reclaim,
btree_write_buffered_insert(trans, i));
if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
break;
@@ -297,7 +296,7 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
mutex_lock(&wb->flush_lock);
return bch2_trans_run(c,
- __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true));
+ __bch2_btree_write_buffer_flush(trans, BCH_TRANS_COMMIT_no_check_rw, true));
}
static inline u64 btree_write_buffer_ref(int idx)
diff --git a/libbcachefs/darray.c b/libbcachefs/darray.c
new file mode 100644
index 00000000..aae07be1
--- /dev/null
+++ b/libbcachefs/darray.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include "darray.h"
+
+int __bch2_darray_resize(darray_void *d, size_t element_size, size_t new_size, gfp_t gfp)
+{
+ if (new_size > d->size) {
+ new_size = roundup_pow_of_two(new_size);
+
+ void *data = krealloc_array(d->data, new_size, element_size, gfp);
+ if (!data)
+ return -ENOMEM;
+
+ d->data = data;
+ d->size = new_size;
+ }
+
+ return 0;
+}
diff --git a/libbcachefs/darray.h b/libbcachefs/darray.h
index 87b4b2d1..43ea21ad 100644
--- a/libbcachefs/darray.h
+++ b/libbcachefs/darray.h
@@ -8,7 +8,6 @@
* Inspired by CCAN's darray
*/
-#include "util.h"
#include <linux/slab.h>
#define DARRAY(type) \
@@ -19,20 +18,25 @@ struct { \
typedef DARRAY(void) darray_void;
-static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp)
+int __bch2_darray_resize(darray_void *, size_t, size_t, gfp_t);
+
+static inline int __darray_resize(darray_void *d, size_t element_size,
+ size_t new_size, gfp_t gfp)
{
- if (d->nr + more > d->size) {
- size_t new_size = roundup_pow_of_two(d->nr + more);
- void *data = krealloc_array(d->data, new_size, t_size, gfp);
+ return unlikely(new_size > d->size)
+ ? __bch2_darray_resize(d, element_size, new_size, gfp)
+ : 0;
+}
- if (!data)
- return -ENOMEM;
+#define darray_resize_gfp(_d, _new_size, _gfp) \
+ __darray_resize((darray_void *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
- d->data = data;
- d->size = new_size;
- }
+#define darray_resize(_d, _new_size) \
+ darray_resize_gfp(_d, _new_size, GFP_KERNEL)
- return 0;
+static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp)
+{
+ return __darray_resize(d, t_size, d->nr + more, gfp);
}
#define darray_make_room_gfp(_d, _more, _gfp) \
@@ -41,6 +45,8 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more,
#define darray_make_room(_d, _more) \
darray_make_room_gfp(_d, _more, GFP_KERNEL)
+#define darray_room(_d) ((_d).size - (_d).nr)
+
#define darray_top(_d) ((_d).data[(_d).nr])
#define darray_push_gfp(_d, _item, _gfp) \
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 0771a6d8..55769d77 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -239,6 +239,34 @@ restart_drop_extra_replicas:
next_pos = insert->k.p;
+ /*
+ * Check for nonce offset inconsistency:
+ * This is debug code - we've been seeing this bug rarely, and
+ * it's been hard to reproduce, so this should give us some more
+ * information when it does occur:
+ */
+ struct printbuf err = PRINTBUF;
+ int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
+ printbuf_exit(&err);
+
+ if (invalid) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "about to insert invalid key in data update path");
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+
+ bch2_fatal_error(c);
+ goto out;
+ }
+
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
@@ -250,8 +278,8 @@ restart_drop_extra_replicas:
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
m->data_opts.btree_insert_flags);
if (!ret) {
bch2_btree_iter_set_pos(&iter, next_pos);
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index 1a0f2d57..0542d994 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -201,7 +201,8 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
- u64 *dir_offset, int flags)
+ u64 *dir_offset,
+ bch_str_hash_flags_t str_hash_flags)
{
struct bkey_i_dirent *dirent;
int ret;
@@ -212,7 +213,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
return ret;
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir, &dirent->k_i, flags);
+ dir, &dirent->k_i, str_hash_flags);
*dir_offset = dirent->k.p.offset;
return ret;
diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h
index cd262bf4..8a552455 100644
--- a/libbcachefs/dirent.h
+++ b/libbcachefs/dirent.h
@@ -37,7 +37,8 @@ int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
- const struct qstr *, u64, u64 *, int);
+ const struct qstr *, u64, u64 *,
+ bch_str_hash_flags_t);
static inline unsigned vfs_d_type(unsigned type)
{
diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c
index d613695a..4d0cb0cc 100644
--- a/libbcachefs/disk_groups.c
+++ b/libbcachefs/disk_groups.c
@@ -555,6 +555,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
case TARGET_DEV: {
struct bch_dev *ca;
+ out->atomic++;
rcu_read_lock();
ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
@@ -570,6 +571,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
}
rcu_read_unlock();
+ out->atomic--;
break;
}
case TARGET_GROUP:
@@ -580,7 +582,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
}
}
-void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
+static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
struct target t = target_decode(v);
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 5da0e7a6..c730f093 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -150,6 +150,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
if (i < nr_data)
prt_printf(out, "#%u", stripe_blockcount_get(s, i));
+ prt_printf(out, " gen %u", ptr->gen);
if (ptr_stale(ca, ptr))
prt_printf(out, " stale");
}
@@ -303,16 +304,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
- struct printbuf buf2 = PRINTBUF;
+ struct printbuf err = PRINTBUF;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
+
+ prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
+ want.hi, want.lo,
+ got.hi, got.lo,
+ bch2_csum_types[v->csum_type]);
+ prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
+ bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
+ bch_err_ratelimited(ca, "%s", err.buf);
+ printbuf_exit(&err);
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key));
-
- bch_err_ratelimited(c,
- "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
- (void *) _RET_IP_, i, j, v->csum_type,
- want.lo, got.lo, buf2.buf);
- printbuf_exit(&buf2);
clear_bit(i, buf->valid);
+
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
break;
}
@@ -475,14 +481,10 @@ err:
return ret;
}
-static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
-{
- return bch2_trans_run(c, get_stripe_key_trans(trans, idx, stripe));
-}
-
/* recovery read path: */
-int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
{
+ struct bch_fs *c = trans->c;
struct ec_stripe_buf *buf;
struct closure cl;
struct bch_stripe *v;
@@ -497,7 +499,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
if (!buf)
return -BCH_ERR_ENOMEM_ec_read_extent;
- ret = get_stripe_key(c, rbio->pick.ec.idx, buf);
+ ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
if (ret) {
bch_err_ratelimited(c,
"error doing reconstruct read: error %i looking up stripe", ret);
@@ -801,7 +803,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
if (!idx)
break;
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
ec_stripe_delete(trans, idx));
if (ret) {
bch_err_fn(c, ret);
@@ -981,8 +983,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
while (1) {
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
s, &bp_pos));
if (ret)
@@ -1119,8 +1121,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
}
ret = bch2_trans_do(c, &s->res, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
ec_stripe_key_update(trans,
bkey_i_to_stripe(&s->new_stripe.key),
!s->have_existing_stripe));
@@ -1371,6 +1373,15 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
h->nr_active_devs++;
rcu_read_unlock();
+
+ /*
+ * If we only have redundancy + 1 devices, we're better off with just
+ * replication:
+ */
+ if (h->nr_active_devs < h->redundancy + 2)
+ bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
+ h->nr_active_devs, h->redundancy + 2);
+
list_add(&h->list, &c->ec_stripe_head_list);
return h;
}
@@ -1422,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
found:
+ if (!IS_ERR_OR_NULL(h) &&
+ h->nr_active_devs < h->redundancy + 2) {
+ mutex_unlock(&h->lock);
+ h = NULL;
+ }
mutex_unlock(&c->ec_stripe_head_lock);
return h;
}
@@ -1679,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
int ret;
h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
- if (!h)
- bch_err(c, "no stripe head");
if (IS_ERR_OR_NULL(h))
return h;
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index 61c67aa0..7d0237c9 100644
--- a/libbcachefs/ec.h
+++ b/libbcachefs/ec.h
@@ -199,7 +199,7 @@ struct ec_stripe_head {
struct ec_stripe_new *s;
};
-int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
+int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index 68a1a96b..e5c3262c 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -73,7 +73,6 @@
x(ENOMEM, ENOMEM_fsck_add_nlink) \
x(ENOMEM, ENOMEM_journal_key_insert) \
x(ENOMEM, ENOMEM_journal_keys_sort) \
- x(ENOMEM, ENOMEM_journal_replay) \
x(ENOMEM, ENOMEM_read_superblock_clean) \
x(ENOMEM, ENOMEM_fs_alloc) \
x(ENOMEM, ENOMEM_fs_name_alloc) \
diff --git a/libbcachefs/fs-io-pagecache.c b/libbcachefs/fs-io-pagecache.c
index 8bd9bcdd..ff664fd0 100644
--- a/libbcachefs/fs-io-pagecache.c
+++ b/libbcachefs/fs-io-pagecache.c
@@ -13,7 +13,7 @@
int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
loff_t start, u64 end,
- int fgp_flags, gfp_t gfp,
+ fgf_t fgp_flags, gfp_t gfp,
folios *fs)
{
struct folio *f;
diff --git a/libbcachefs/fs-io-pagecache.h b/libbcachefs/fs-io-pagecache.h
index a2222ad5..27f712ae 100644
--- a/libbcachefs/fs-io-pagecache.h
+++ b/libbcachefs/fs-io-pagecache.h
@@ -7,7 +7,7 @@
typedef DARRAY(struct folio *) folios;
int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
- u64, int, gfp_t, folios *);
+ u64, fgf_t, gfp_t, folios *);
int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
/*
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 8dbc848f..f76d403c 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -93,7 +93,7 @@ retry:
BTREE_ITER_INTENT) ?:
(set ? set(trans, inode, &inode_u, p) : 0) ?:
bch2_inode_write(trans, &iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
/*
* the btree node lock protects inode->ei_inode, not ei_update_lock;
@@ -452,7 +452,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_enospc,
bch2_unlink_trans(trans,
inode_inum(dir), &dir_u,
&inode_u, &dentry->d_name,
@@ -717,7 +717,7 @@ retry:
ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
btree_err:
bch2_trans_iter_exit(trans, &inode_iter);
@@ -1922,10 +1922,7 @@ out:
return dget(sb->s_root);
err_put_super:
- sb->s_fs_info = NULL;
- c->vfs_sb = NULL;
deactivate_locked_super(sb);
- bch2_fs_stop(c);
return ERR_PTR(bch2_err_class(ret));
}
@@ -1933,11 +1930,8 @@ static void bch2_kill_sb(struct super_block *sb)
{
struct bch_fs *c = sb->s_fs_info;
- if (c)
- c->vfs_sb = NULL;
generic_shutdown_super(sb);
- if (c)
- bch2_fs_free(c);
+ bch2_fs_free(c);
}
static struct file_system_type bcache_fs_type = {
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 9f3e9bd3..127310d3 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -208,8 +208,8 @@ static int fsck_write_inode(struct btree_trans *trans,
u32 snapshot)
{
int ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
__write_inode(trans, inode, snapshot));
if (ret)
bch_err_fn(trans->c, ret);
@@ -354,8 +354,8 @@ static int reattach_inode(struct btree_trans *trans,
u32 inode_snapshot)
{
int ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_lazy_rw|
+ BCH_TRANS_COMMIT_no_enospc,
__reattach_inode(trans, inode, inode_snapshot));
bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
return ret;
@@ -757,8 +757,8 @@ static int hash_redo_key(struct btree_trans *trans,
BCH_HASH_SET_MUST_CREATE,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw);
}
static int hash_check_key(struct btree_trans *trans,
@@ -992,7 +992,7 @@ int bch2_check_inodes(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_inode(trans, &iter, k, &prev, &s, full));
snapshots_seen_exit(&s);
@@ -1226,7 +1226,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
k1, k2) ?:
bch2_trans_commit(trans, &res, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(c, &res);
if (ret)
@@ -1465,7 +1465,7 @@ int bch2_check_extents(struct bch_fs *c)
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
&res, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
+ BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res);
check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
check_extent_overbig(trans, &iter, k);
@@ -1494,7 +1494,7 @@ int bch2_check_indirect_extents(struct bch_fs *c)
POS_MIN,
BTREE_ITER_PREFETCH, k,
&res, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
+ BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res);
check_extent_overbig(trans, &iter, k);
}));
@@ -1854,7 +1854,7 @@ int bch2_check_dirents(struct bch_fs *c)
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
k,
NULL, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));
bch2_trans_put(trans);
@@ -1918,7 +1918,7 @@ int bch2_check_xattrs(struct bch_fs *c)
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
k,
NULL, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_xattr(trans, &iter, k, &hash_info, &inode)));
bch_err_fn(c, ret);
return ret;
@@ -1949,8 +1949,8 @@ static int check_root_trans(struct btree_trans *trans)
root_subvol.v.snapshot = cpu_to_le32(snapshot);
root_subvol.v.inode = cpu_to_le64(inum);
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
&root_subvol.k_i, 0));
bch_err_msg(c, ret, "writing root subvol");
@@ -1986,8 +1986,8 @@ int bch2_check_root(struct bch_fs *c)
int ret;
ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
check_root_trans(trans));
bch_err_fn(c, ret);
return ret;
@@ -2116,8 +2116,8 @@ static int check_path(struct btree_trans *trans,
return 0;
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
remove_backpointer(trans, inode));
if (ret) {
bch_err(c, "error removing dirent: %i", ret);
@@ -2398,7 +2398,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS(0, range_start),
BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
if (ret < 0) {
bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
@@ -2483,7 +2483,7 @@ int bch2_fix_reflink_p(struct bch_fs *c)
BTREE_ID_extents, POS_MIN,
BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
fix_reflink_p_key(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index ef58e292..1baf8b7f 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -830,7 +830,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
err:
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
@@ -893,7 +893,7 @@ retry:
ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -1057,7 +1057,7 @@ retry:
ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -1091,7 +1091,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
ret = bch2_inode_unpack(k, &inode);
if (ret)
- goto err;
+ goto out;
if (fsck_err_on(S_ISDIR(inode.bi_mode), c,
deleted_inode_is_dir,
@@ -1109,38 +1109,45 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
!fsck_err(c,
deleted_inode_but_clean,
"filesystem marked as clean but have deleted inode %llu:%u",
- pos.offset, pos.snapshot))
- return 0;
+ pos.offset, pos.snapshot)) {
+ ret = 0;
+ goto out;
+ }
if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
struct bpos new_min_pos;
ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
if (ret)
- goto err;
+ goto out;
inode.bi_flags &= ~BCH_INODE_unlinked;
ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw);
bch_err_msg(c, ret, "clearing inode unlinked flag");
if (ret)
- return ret;
+ goto out;
/*
* We'll need another write buffer flush to pick up the new
* unlinked inodes in the snapshot leaves:
*/
*need_another_pass = true;
- return 0;
+ goto out;
}
- return 1;
-err:
+ ret = 1;
+out:
fsck_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
return ret;
delete:
- return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+ ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+ goto out;
}
int bch2_delete_dead_inodes(struct bch_fs *c)
diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c
index bebc1144..eab0c8c5 100644
--- a/libbcachefs/io_misc.c
+++ b/libbcachefs/io_misc.c
@@ -256,7 +256,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
u64 new_i_size = le64_to_cpu(op->v.new_i_size);
int ret;
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
truncate_set_isize(trans, inum, new_i_size));
if (ret)
goto err;
@@ -378,7 +378,7 @@ case LOGGED_OP_FINSERT_start:
op->v.state = LOGGED_OP_FINSERT_shift_extents;
if (insert) {
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, src_offset, len) ?:
bch2_logged_op_update(trans, &op->k_i));
if (ret)
@@ -390,7 +390,7 @@ case LOGGED_OP_FINSERT_start:
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto err;
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_logged_op_update(trans, &op->k_i));
}
@@ -455,7 +455,7 @@ case LOGGED_OP_FINSERT_shift_extents:
bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
- bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
btree_err:
bch2_disk_reservation_put(c, &disk_res);
@@ -470,12 +470,12 @@ btree_err:
op->v.state = LOGGED_OP_FINSERT_finish;
if (!insert) {
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, src_offset, shift) ?:
bch2_logged_op_update(trans, &op->k_i));
} else {
/* We need an inode update to update bi_journal_seq for fsync: */
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, 0, 0) ?:
bch2_logged_op_update(trans, &op->k_i));
}
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
index ae36fc48..b833409c 100644
--- a/libbcachefs/io_read.c
+++ b/libbcachefs/io_read.c
@@ -526,7 +526,7 @@ out:
static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
- bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_rbio_narrow_crcs(trans, rbio));
}
@@ -1025,7 +1025,7 @@ get_bio:
trans->notrace_relock_fail = true;
} else {
/* Attempting reconstruct read: */
- if (bch2_ec_read_extent(c, rbio)) {
+ if (bch2_ec_read_extent(trans, rbio)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index fbfc42ff..97f7a4b7 100644
--- a/libbcachefs/io_write.c
+++ b/libbcachefs/io_write.c
@@ -202,6 +202,17 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_i *k;
struct bkey_i_inode_v3 *inode;
+ /*
+ * Crazy performance optimization:
+ * Every extent update needs to also update the inode: the inode trigger
+ * will set bi->journal_seq to the journal sequence number of this
+ * transaction - for fsync.
+ *
+ * But if that's the only reason we're updating the inode (we're not
+ * updating bi_size or bi_sectors), then we don't need the inode update
+ * to be journalled - if we crash, the bi_journal_seq update will be
+ * lost, but that's fine.
+ */
unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
int ret;
@@ -305,8 +316,8 @@ int bch2_extent_update(struct btree_trans *trans,
i_sectors_delta) ?:
bch2_trans_update(trans, iter, k, 0) ?:
bch2_trans_commit(trans, disk_res, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc);
if (unlikely(ret))
return ret;
@@ -1165,7 +1176,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_INTENT, k,
- NULL, NULL, BTREE_INSERT_NOFAIL, ({
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
}));
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 5b5d69f2..7d448136 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -361,11 +361,6 @@ static int journal_entry_open(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
- if (j->res_get_blocked_start)
- bch2_time_stats_update(j->blocked_time,
- j->res_get_blocked_start);
- j->res_get_blocked_start = 0;
-
mod_delayed_work(c->io_complete_wq,
&j->write_work,
msecs_to_jiffies(c->opts.journal_flush_delay));
@@ -465,15 +460,12 @@ retry:
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
ret = journal_entry_open(j);
- if (ret == JOURNAL_ERR_max_in_flight)
+ if (ret == JOURNAL_ERR_max_in_flight) {
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
+ &j->max_in_flight_start, true);
trace_and_count(c, journal_entry_full, c);
-unlock:
- if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
- !j->res_get_blocked_start) {
- j->res_get_blocked_start = local_clock() ?: 1;
- trace_and_count(c, journal_full, c);
}
-
+unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
@@ -526,36 +518,6 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
return ret;
}
-/* journal_preres: */
-
-static bool journal_preres_available(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags)
-{
- bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
-
- if (!ret && mutex_trylock(&j->reclaim_lock)) {
- bch2_journal_reclaim(j);
- mutex_unlock(&j->reclaim_lock);
- }
-
- return ret;
-}
-
-int __bch2_journal_preres_get(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags)
-{
- int ret;
-
- closure_wait_event(&j->preres_wait,
- (ret = bch2_journal_error(j)) ||
- journal_preres_available(j, res, new_u64s, flags));
- return ret;
-}
-
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *j,
@@ -1290,6 +1252,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
union journal_res_state s;
struct bch_dev *ca;
unsigned long now = jiffies;
+ u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
u64 seq;
unsigned i;
@@ -1303,21 +1266,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk);
- prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
+ prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
- prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining);
- prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
- prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
+ prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
+ prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
+ prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
- prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
- prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
+ prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
+ prt_printf(out, "average write size:\t");
+ prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
+ prt_newline(out);
+ prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
- prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
+ prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
- prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
- prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
+ prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
+ prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
prt_printf(out, "current entry:\t\t");
switch (s.cur_entry_offset) {
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 011711e9..c85d01cf 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -395,104 +395,6 @@ out:
return 0;
}
-/* journal_preres: */
-
-static inline void journal_set_watermark(struct journal *j)
-{
- union journal_preres_state s = READ_ONCE(j->prereserved);
- unsigned watermark = BCH_WATERMARK_stripe;
-
- if (fifo_free(&j->pin) < j->pin.size / 4)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
- if (fifo_free(&j->pin) < j->pin.size / 8)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
- if (s.reserved > s.remaining)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
- if (!s.remaining)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
- if (watermark == j->watermark)
- return;
-
- swap(watermark, j->watermark);
- if (watermark > j->watermark)
- journal_wake(j);
-}
-
-static inline void bch2_journal_preres_put(struct journal *j,
- struct journal_preres *res)
-{
- union journal_preres_state s = { .reserved = res->u64s };
-
- if (!res->u64s)
- return;
-
- s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
- res->u64s = 0;
-
- if (unlikely(s.waiting)) {
- clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
- (unsigned long *) &j->prereserved.v);
- closure_wake_up(&j->preres_wait);
- }
-
- if (s.reserved <= s.remaining && j->watermark)
- journal_set_watermark(j);
-}
-
-int __bch2_journal_preres_get(struct journal *,
- struct journal_preres *, unsigned, unsigned);
-
-static inline int bch2_journal_preres_get_fast(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags,
- bool set_waiting)
-{
- int d = new_u64s - res->u64s;
- union journal_preres_state old, new;
- u64 v = atomic64_read(&j->prereserved.counter);
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- int ret;
-
- do {
- old.v = new.v = v;
- ret = 0;
-
- if (watermark == BCH_WATERMARK_reclaim ||
- new.reserved + d < new.remaining) {
- new.reserved += d;
- ret = 1;
- } else if (set_waiting && !new.waiting)
- new.waiting = true;
- else
- return 0;
- } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
- old.v, new.v)) != old.v);
-
- if (ret)
- res->u64s += d;
- return ret;
-}
-
-static inline int bch2_journal_preres_get(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags)
-{
- if (new_u64s <= res->u64s)
- return 0;
-
- if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
- return 0;
-
- if (flags & JOURNAL_RES_GET_NONBLOCK)
- return -BCH_ERR_journal_preres_get_blocked;
-
- return __bch2_journal_preres_get(j, res, new_u64s, flags);
-}
-
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *,
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 65878542..109c1157 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1079,6 +1079,12 @@ found:
if (ja->bucket_seq[ja->cur_idx] &&
ja->sectors_free == ca->mi.bucket_size) {
+#if 0
+ /*
+ * Debug code for ZNS support, where we (probably) want to be
+ * correlated where we stopped in the journal to the zone write
+ * points:
+ */
bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
for (i = 0; i < 3; i++) {
@@ -1086,6 +1092,7 @@ found:
bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
}
+#endif
ja->sectors_free = 0;
}
@@ -1585,6 +1592,9 @@ static void journal_write_done(struct closure *cl)
bch2_journal_space_available(j);
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
+ &j->max_in_flight_start, false);
+
closure_wake_up(&w->wait);
journal_wake(j);
@@ -1678,9 +1688,15 @@ static void do_journal_write(struct closure *cl)
continue_at(cl, journal_write_done, c->io_complete_wq);
}
-static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset)
+static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
- struct jset_entry *i, *next, *prev = NULL;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct jset_entry *start, *end, *i, *next, *prev = NULL;
+ struct jset *jset = w->data;
+ unsigned sectors, bytes, u64s;
+ bool validate_before_checksum = false;
+ unsigned long btree_roots_have = 0;
+ int ret;
/*
* Simple compaction, dropping empty jset_entries (from journal
@@ -1697,8 +1713,20 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset
if (!u64s)
continue;
- if (i->type == BCH_JSET_ENTRY_btree_root)
+ /*
+ * New btree roots are set by journalling them; when the journal
+ * entry gets written we have to propagate them to
+ * c->btree_roots
+ *
+ * But, every journal entry we write has to contain all the
+ * btree roots (at least for now); so after we copy btree roots
+ * to c->btree_roots we have to get any missing btree roots and
+ * add them to this journal entry:
+ */
+ if (i->type == BCH_JSET_ENTRY_btree_root) {
bch2_journal_entry_to_btree_root(c, i);
+ __set_bit(i->btree_id, &btree_roots_have);
+ }
/* Can we merge with previous entry? */
if (prev &&
@@ -1722,85 +1750,10 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset
prev = prev ? vstruct_next(prev) : jset->start;
jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
-}
-
-void bch2_journal_write(struct closure *cl)
-{
- struct journal *j = container_of(cl, struct journal, io);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- struct journal_buf *w = journal_last_unwritten_buf(j);
- struct bch_replicas_padded replicas;
- struct jset_entry *start, *end;
- struct jset *jset;
- struct bio *bio;
- struct printbuf journal_debug_buf = PRINTBUF;
- bool validate_before_checksum = false;
- unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
- int ret;
-
- BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
-
- journal_buf_realloc(j, w);
- jset = w->data;
-
- j->write_start_time = local_clock();
-
- spin_lock(&j->lock);
-
- /*
- * If the journal is in an error state - we did an emergency shutdown -
- * we prefer to continue doing journal writes. We just mark them as
- * noflush so they'll never be used, but they'll still be visible by the
- * list_journal tool - this helps in debugging.
- *
- * There's a caveat: the first journal write after marking the
- * superblock dirty must always be a flush write, because on startup
- * from a clean shutdown we didn't necessarily read the journal and the
- * new journal write might overwrite whatever was in the journal
- * previously - we can't leave the journal without any flush writes in
- * it.
- *
- * So if we're in an error state, and we're still starting up, we don't
- * write anything at all.
- */
- if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
- (bch2_journal_error(j) ||
- w->noflush ||
- (!w->must_flush &&
- (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
- test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
- w->noflush = true;
- SET_JSET_NO_FLUSH(jset, true);
- jset->last_seq = 0;
- w->last_seq = 0;
-
- j->nr_noflush_writes++;
- } else if (!bch2_journal_error(j)) {
- j->last_flush_write = jiffies;
- j->nr_flush_writes++;
- clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
- } else {
- spin_unlock(&j->lock);
- goto err;
- }
- spin_unlock(&j->lock);
-
- /*
- * New btree roots are set by journalling them; when the journal entry
- * gets written we have to propagate them to c->btree_roots
- *
- * But, every journal entry we write has to contain all the btree roots
- * (at least for now); so after we copy btree roots to c->btree_roots we
- * have to get any missing btree roots and add them to this journal
- * entry:
- */
-
- bch2_journal_entries_postprocess(c, jset);
start = end = vstruct_last(jset);
- end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
+ end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
bch2_journal_super_entries_add_common(c, &end,
le64_to_cpu(jset->seq));
@@ -1816,7 +1769,7 @@ void bch2_journal_write(struct closure *cl)
bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
vstruct_bytes(jset), w->sectors << 9,
u64s, w->u64s_reserved, j->entry_u64s_reserved);
- goto err;
+ return -EINVAL;
}
jset->magic = cpu_to_le64(jset_magic(c));
@@ -1835,37 +1788,119 @@ void bch2_journal_write(struct closure *cl)
validate_before_checksum = true;
if (validate_before_checksum &&
- jset_validate(c, NULL, jset, 0, WRITE))
- goto err;
+ (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+ return ret;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
if (bch2_fs_fatal_err_on(ret, c,
"error decrypting journal entry: %i", ret))
- goto err;
+ return ret;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
journal_nonce(jset), jset);
if (!validate_before_checksum &&
- jset_validate(c, NULL, jset, 0, WRITE))
- goto err;
+ (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+ return ret;
memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+ return 0;
+}
+
+static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ int error = bch2_journal_error(j);
+
+ /*
+ * If the journal is in an error state - we did an emergency shutdown -
+ * we prefer to continue doing journal writes. We just mark them as
+ * noflush so they'll never be used, but they'll still be visible by the
+ * list_journal tool - this helps in debugging.
+ *
+ * There's a caveat: the first journal write after marking the
+ * superblock dirty must always be a flush write, because on startup
+ * from a clean shutdown we didn't necessarily read the journal and the
+ * new journal write might overwrite whatever was in the journal
+ * previously - we can't leave the journal without any flush writes in
+ * it.
+ *
+ * So if we're in an error state, and we're still starting up, we don't
+ * write anything at all.
+ */
+ if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
+ return -EIO;
+
+ if (error ||
+ w->noflush ||
+ (!w->must_flush &&
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
+ w->noflush = true;
+ SET_JSET_NO_FLUSH(w->data, true);
+ w->data->last_seq = 0;
+ w->last_seq = 0;
+
+ j->nr_noflush_writes++;
+ } else {
+ j->last_flush_write = jiffies;
+ j->nr_flush_writes++;
+ clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+ }
+
+ return 0;
+}
+
+void bch2_journal_write(struct closure *cl)
+{
+ struct journal *j = container_of(cl, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ struct bch_replicas_padded replicas;
+ struct bio *bio;
+ struct printbuf journal_debug_buf = PRINTBUF;
+ unsigned i, nr_rw_members = 0;
+ int ret;
+
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
+ j->write_start_time = local_clock();
-retry_alloc:
spin_lock(&j->lock);
- ret = journal_write_alloc(j, w);
+ ret = bch2_journal_write_pick_flush(j, w);
+ spin_unlock(&j->lock);
+ if (ret)
+ goto err;
+
+ journal_buf_realloc(j, w);
+
+ ret = bch2_journal_write_prep(j, w);
+ if (ret)
+ goto err;
+
+ j->entry_bytes_written += vstruct_bytes(w->data);
+
+ while (1) {
+ spin_lock(&j->lock);
+ ret = journal_write_alloc(j, w);
+ if (!ret || !j->can_discard)
+ break;
- if (ret && j->can_discard) {
spin_unlock(&j->lock);
bch2_journal_do_discards(j);
- goto retry_alloc;
}
- if (ret)
+ if (ret) {
__bch2_journal_debug_to_text(&journal_debug_buf, j);
+ spin_unlock(&j->lock);
+ bch_err(c, "Unable to allocate journal write:\n%s",
+ journal_debug_buf.buf);
+ printbuf_exit(&journal_debug_buf);
+ goto err;
+ }
/*
* write is allocated, no longer need to account for it in
@@ -1880,13 +1915,6 @@ retry_alloc:
bch2_journal_space_available(j);
spin_unlock(&j->lock);
- if (ret) {
- bch_err(c, "Unable to allocate journal write:\n%s",
- journal_debug_buf.buf);
- printbuf_exit(&journal_debug_buf);
- goto err;
- }
-
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
if (c->opts.nochanges)
@@ -1908,7 +1936,7 @@ retry_alloc:
if (ret)
goto err;
- if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
+ if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
for_each_rw_member(ca, c, i) {
percpu_ref_get(&ca->io_ref);
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 9a584aaa..8fa05bed 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -50,16 +50,25 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
return available;
}
-static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
+static inline void journal_set_watermark(struct journal *j)
{
- union journal_preres_state old, new;
- u64 v = atomic64_read(&j->prereserved.counter);
-
- do {
- old.v = new.v = v;
- new.remaining = u64s_remaining;
- } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
- old.v, new.v)) != old.v);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool low_on_space = j->space[journal_space_clean].total * 4 <=
+ j->space[journal_space_total].total;
+ bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
+ unsigned watermark = low_on_space || low_on_pin
+ ? BCH_WATERMARK_reclaim
+ : BCH_WATERMARK_stripe;
+
+ if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
+ &j->low_on_space_start, low_on_space) ||
+ track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
+ &j->low_on_pin_start, low_on_pin))
+ trace_and_count(c, journal_full, c);
+
+ swap(watermark, j->watermark);
+ if (watermark > j->watermark)
+ journal_wake(j);
}
static struct journal_space
@@ -162,7 +171,6 @@ void bch2_journal_space_available(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned clean, clean_ondisk, total;
- s64 u64s_remaining = 0;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs_want;
@@ -222,16 +230,10 @@ void bch2_journal_space_available(struct journal *j)
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
- u64s_remaining = (u64) clean << 6;
- u64s_remaining -= (u64) total << 3;
- u64s_remaining = max(0LL, u64s_remaining);
- u64s_remaining /= 4;
- u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
+ journal_set_watermark(j);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
- journal_set_remaining(j, u64s_remaining);
- journal_set_watermark(j);
if (!ret)
journal_wake(j);
@@ -369,15 +371,36 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
return JOURNAL_PIN_other;
}
-void bch2_journal_pin_set(struct journal *j, u64 seq,
+static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
+ journal_pin_flush_fn flush_fn,
+ enum journal_pin_type type)
+{
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+ /*
+ * flush_fn is how we identify journal pins in debugfs, so must always
+ * exist, even if it doesn't do anything:
+ */
+ BUG_ON(!flush_fn);
+
+ atomic_inc(&pin_list->count);
+ pin->seq = seq;
+ pin->flush = flush_fn;
+ list_add(&pin->list, &pin_list->list[type]);
+}
+
+void bch2_journal_pin_copy(struct journal *j,
+ struct journal_entry_pin *dst,
+ struct journal_entry_pin *src,
+ journal_pin_flush_fn flush_fn)
{
- struct journal_entry_pin_list *pin_list;
bool reclaim;
spin_lock(&j->lock);
+ u64 seq = READ_ONCE(src->seq);
+
if (seq < journal_last_seq(j)) {
/*
* bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
@@ -389,18 +412,34 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
return;
}
- pin_list = journal_seq_pin(j, seq);
+ reclaim = __journal_pin_drop(j, dst);
- reclaim = __journal_pin_drop(j, pin);
+ bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
- atomic_inc(&pin_list->count);
- pin->seq = seq;
- pin->flush = flush_fn;
+ if (reclaim)
+ bch2_journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
- if (flush_fn)
- list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
- else
- list_add(&pin->list, &pin_list->flushed);
+ /*
+ * If the journal is currently full, we might want to call flush_fn
+ * immediately:
+ */
+ journal_wake(j);
+}
+
+void bch2_journal_pin_set(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ bool reclaim;
+
+ spin_lock(&j->lock);
+
+ BUG_ON(seq < journal_last_seq(j));
+
+ reclaim = __journal_pin_drop(j, pin);
+
+ bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@@ -555,11 +594,6 @@ static u64 journal_seq_to_flush(struct journal *j)
/* Try to keep the journal at most half full: */
nr_buckets = ja->nr / 2;
- /* And include pre-reservations: */
- nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
- (ca->mi.bucket_size << 6) -
- journal_entry_overhead(j));
-
nr_buckets = min(nr_buckets, ja->nr);
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
@@ -638,10 +672,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
msecs_to_jiffies(c->opts.journal_reclaim_delay)))
min_nr = 1;
- if (j->prereserved.reserved * 4 > j->prereserved.remaining)
- min_nr = 1;
-
- if (fifo_free(&j->pin) <= 32)
+ if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
@@ -652,8 +683,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
- j->prereserved.reserved,
- j->prereserved.remaining,
atomic_read(&c->btree_cache.dirty),
c->btree_cache.used,
atomic_long_read(&c->btree_key_cache.nr_dirty),
@@ -805,6 +834,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
+ /* time_stats this */
bool did_work = false;
if (!test_bit(JOURNAL_STARTED, &j->flags))
diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h
index 494d1a6e..7b15d682 100644
--- a/libbcachefs/journal_reclaim.h
+++ b/libbcachefs/journal_reclaim.h
@@ -47,17 +47,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
bch2_journal_pin_set(j, seq, pin, flush_fn);
}
-static inline void bch2_journal_pin_copy(struct journal *j,
- struct journal_entry_pin *dst,
- struct journal_entry_pin *src,
- journal_pin_flush_fn flush_fn)
-{
- /* Guard against racing with journal_pin_drop(src): */
- u64 seq = READ_ONCE(src->seq);
-
- if (seq)
- bch2_journal_pin_add(j, seq, dst, flush_fn);
-}
+void bch2_journal_pin_copy(struct journal *,
+ struct journal_entry_pin *,
+ struct journal_entry_pin *,
+ journal_pin_flush_fn);
static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 42504e16..2427cce6 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -76,14 +76,6 @@ struct journal_res {
u64 seq;
};
-/*
- * For reserving space in the journal prior to getting a reservation on a
- * particular journal entry:
- */
-struct journal_preres {
- unsigned u64s;
-};
-
union journal_res_state {
struct {
atomic64_t counter;
@@ -104,22 +96,6 @@ union journal_res_state {
};
};
-union journal_preres_state {
- struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
- struct {
- u64 waiting:1,
- reserved:31,
- remaining:32;
- };
-};
-
/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
@@ -180,8 +156,6 @@ struct journal {
union journal_res_state reservations;
enum bch_watermark watermark;
- union journal_preres_state prereserved;
-
} __aligned(SMP_CACHE_BYTES);
unsigned long flags;
@@ -288,15 +262,18 @@ struct journal {
unsigned long last_flush_write;
- u64 res_get_blocked_start;
u64 write_start_time;
u64 nr_flush_writes;
u64 nr_noflush_writes;
+ u64 entry_bytes_written;
+
+ u64 low_on_space_start;
+ u64 low_on_pin_start;
+ u64 max_in_flight_start;
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;
- struct bch2_time_stats *blocked_time;
struct bch2_time_stats *flush_seq_time;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/libbcachefs/logged_ops.c b/libbcachefs/logged_ops.c
index 8640f7de..9a76a9aa 100644
--- a/libbcachefs/logged_ops.c
+++ b/libbcachefs/logged_ops.c
@@ -85,13 +85,13 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
{
- return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_logged_op_start(trans, k));
}
void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
{
- int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
/*
* This needs to be a fatal error because we've left an unfinished
diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c
index a5cc0ed1..e6d081c0 100644
--- a/libbcachefs/lru.c
+++ b/libbcachefs/lru.c
@@ -155,7 +155,7 @@ int bch2_check_lrus(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
if (ret)
bch_err_fn(c, ret);
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index e3a51f6d..8e5688d0 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -90,7 +90,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
if (ret)
break;
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index ab749bf2..4f7d1758 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -263,7 +263,7 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
return bch2_trans_relock(trans) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
int bch2_move_extent(struct moving_context *ctxt,
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 0158c7aa..0a057632 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -370,6 +370,7 @@ static int bch2_copygc_thread(void *arg)
if (min_member_capacity == U64_MAX)
min_member_capacity = 128 * 2048;
+ bch2_trans_unlock_long(ctxt.trans);
bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
MAX_SCHEDULE_TIMEOUT);
}
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 3319190b..db2139c0 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -69,7 +69,7 @@ err:
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
- int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
__bch2_set_rebalance_needs_scan(trans, inum));
rebalance_wakeup(c);
return ret;
@@ -125,7 +125,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
extent_entry_drop(bkey_i_to_s(n),
(void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
- return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
@@ -273,7 +273,7 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
r->state = BCH_REBALANCE_scanning;
ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
- commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_clear_rebalance_needs_scan(trans, inum, cookie));
bch2_move_stats_exit(&r->scan_stats, trans->c);
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 9c30500c..130274b1 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -98,6 +98,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
unsigned update_flags = BTREE_TRIGGER_NORUN;
int ret;
+ if (k->overwritten)
+ return 0;
+
+ trans->journal_res.seq = k->journal_seq;
+
/*
* BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
* keep the key cache coherent with the underlying btree. Nothing
@@ -139,27 +144,14 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
static int bch2_journal_replay(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
- struct journal_key **keys_sorted, *k;
+ DARRAY(struct journal_key *) keys_sorted = { 0 };
+ struct journal_key **kp;
struct journal *j = &c->journal;
u64 start_seq = c->journal_replay_seq_start;
u64 end_seq = c->journal_replay_seq_start;
- size_t i;
+ struct btree_trans *trans = bch2_trans_get(c);
int ret;
- move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
- keys->gap = keys->nr;
-
- keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL);
- if (!keys_sorted)
- return -BCH_ERR_ENOMEM_journal_replay;
-
- for (i = 0; i < keys->nr; i++)
- keys_sorted[i] = &keys->d[i];
-
- sort(keys_sorted, keys->nr,
- sizeof(keys_sorted[0]),
- journal_sort_seq_cmp, NULL);
-
if (keys->nr) {
ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
keys->nr, start_seq, end_seq);
@@ -167,27 +159,61 @@ static int bch2_journal_replay(struct bch_fs *c)
goto err;
}
- for (i = 0; i < keys->nr; i++) {
- k = keys_sorted[i];
+ /*
+ * First, attempt to replay keys in sorted order. This is more
+ * efficient, but some might fail if that would cause a journal
+ * deadlock.
+ */
+ for (size_t i = 0; i < keys->nr; i++) {
+ cond_resched();
+
+ struct journal_key *k = keys->d + i;
+
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
+ bch2_journal_replay_key(trans, k));
+ BUG_ON(!ret && !k->overwritten);
+ if (ret) {
+ ret = darray_push(&keys_sorted, k);
+ if (ret)
+ goto err;
+ }
+ }
+ /*
+ * Now, replay any remaining keys in the order in which they appear in
+ * the journal, unpinning those journal entries as we go:
+ */
+ sort(keys_sorted.data, keys_sorted.nr,
+ sizeof(keys_sorted.data[0]),
+ journal_sort_seq_cmp, NULL);
+
+ darray_for_each(keys_sorted, kp) {
cond_resched();
+ struct journal_key *k = *kp;
+
replay_now_at(j, k->journal_seq);
- ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL|
- (!k->allocated
- ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
- : 0),
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ (!k->allocated
+ ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
+ : 0),
bch2_journal_replay_key(trans, k));
- if (ret) {
- bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
- bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret));
+ bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
+ bch2_btree_id_str(k->btree_id), k->level);
+ if (ret)
goto err;
- }
+
+ BUG_ON(!k->overwritten);
}
+ bch2_trans_put(trans);
+ trans = NULL;
+
replay_now_at(j, j->replay_journal_seq_end);
j->replay_journal_seq = 0;
@@ -198,10 +224,10 @@ static int bch2_journal_replay(struct bch_fs *c)
if (keys->nr && !ret)
bch2_journal_log_msg(c, "journal replay finished");
err:
- kvfree(keys_sorted);
-
- if (ret)
- bch_err_fn(c, ret);
+ if (trans)
+ bch2_trans_put(trans);
+ darray_exit(&keys_sorted);
+ bch_err_fn(c, ret);
return ret;
}
@@ -468,7 +494,7 @@ err:
noinline_for_stack
static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
{
- int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
__bch2_fs_upgrade_for_subvolumes(trans));
if (ret)
bch_err_fn(c, ret);
@@ -489,7 +515,19 @@ static int bch2_check_allocations(struct bch_fs *c)
static int bch2_set_may_go_rw(struct bch_fs *c)
{
+ struct journal_keys *keys = &c->journal_keys;
+
+ /*
+ * After we go RW, the journal keys buffer can't be modified (except for
+ * setting journal_key->overwritten: it will be accessed by multiple
+ * threads
+ */
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+ keys->gap = keys->nr;
+
set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+ if (keys->nr)
+ return bch2_fs_read_write_early(c);
return 0;
}
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index 6e1bfe9f..07ddf3e8 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -390,7 +390,7 @@ s64 bch2_remap_range(struct bch_fs *c,
inode_u.bi_size = new_i_size;
ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
}
bch2_trans_iter_exit(trans, &inode_iter);
diff --git a/libbcachefs/sb-clean.c b/libbcachefs/sb-clean.c
index 9b6cc86d..e151ada1 100644
--- a/libbcachefs/sb-clean.c
+++ b/libbcachefs/sb-clean.c
@@ -376,7 +376,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
entry = sb_clean->start;
bch2_journal_super_entries_add_common(c, &entry, 0);
- entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
+ entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
memset(entry, 0,
diff --git a/libbcachefs/sb-errors.c b/libbcachefs/sb-errors.c
index 9215d414..f0930ab7 100644
--- a/libbcachefs/sb-errors.c
+++ b/libbcachefs/sb-errors.c
@@ -70,7 +70,7 @@ static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
prt_tab(out);
prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i]));
prt_tab(out);
- bch2_prt_date_seconds(out, le64_to_cpu(e->entries[i].last_error_time));
+ bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time));
prt_newline(out);
}
}
diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c
index 6a7e20de..bed0f857 100644
--- a/libbcachefs/sb-members.c
+++ b/libbcachefs/sb-members.c
@@ -230,7 +230,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Last mount:");
prt_tab(out);
if (m.last_mount)
- bch2_prt_date_seconds(out, le64_to_cpu(m.last_mount));
+ bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
else
prt_printf(out, "(never)");
prt_newline(out);
diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c
index e9af77b3..b23550b4 100644
--- a/libbcachefs/snapshot.c
+++ b/libbcachefs/snapshot.c
@@ -590,7 +590,7 @@ int bch2_check_snapshot_trees(struct bch_fs *c)
for_each_btree_key_commit(trans, iter,
BTREE_ID_snapshot_trees, POS_MIN,
BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_snapshot_tree(trans, &iter, k)));
if (ret)
@@ -868,7 +868,7 @@ int bch2_check_snapshots(struct bch_fs *c)
for_each_btree_key_reverse_commit(trans, iter,
BTREE_ID_snapshots, POS_MAX,
BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_snapshot(trans, &iter, k)));
if (ret)
bch_err_fn(c, ret);
@@ -959,7 +959,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
parent_id, id))
goto err;
- parent->v.children[i] = le32_to_cpu(child_id);
+ parent->v.children[i] = cpu_to_le32(child_id);
normalize_snapshot_child_pointers(&parent->v);
}
@@ -1449,12 +1449,12 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter,
id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- &res, NULL, BTREE_INSERT_NOFAIL,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc,
snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
for_each_btree_key_commit(trans, iter,
id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- &res, NULL, BTREE_INSERT_NOFAIL,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc,
move_key_to_correct_snapshot(trans, &iter, k));
bch2_disk_reservation_put(c, &res);
@@ -1489,7 +1489,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
*/
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
BTREE_ITER_INTENT, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
if (ret)
goto err_create_lock;
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index ae21a8cc..89fdb7c2 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -15,6 +15,16 @@
#include <crypto/hash.h>
#include <crypto/sha2.h>
+typedef unsigned __bitwise bch_str_hash_flags_t;
+
+enum bch_str_hash_flags {
+ __BCH_HASH_SET_MUST_CREATE,
+ __BCH_HASH_SET_MUST_REPLACE,
+};
+
+#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE)
+#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE)
+
static inline enum bch_str_hash_type
bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
{
@@ -246,7 +256,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
struct bkey_i *insert,
- int flags,
+ bch_str_hash_flags_t str_hash_flags,
int update_flags)
{
struct btree_iter iter, slot = { NULL };
@@ -269,7 +279,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
}
if (!slot.path &&
- !(flags & BCH_HASH_SET_MUST_REPLACE))
+ !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE))
bch2_trans_copy_iter(&slot, &iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
@@ -287,16 +297,16 @@ found:
found = true;
not_found:
- if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) {
+ if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
- } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
+ } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) {
ret = -EEXIST;
} else {
if (!found && slot.path)
swap(iter, slot);
insert->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, insert, 0);
+ ret = bch2_trans_update(trans, &iter, insert, update_flags);
}
goto out;
@@ -307,7 +317,8 @@ int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum,
- struct bkey_i *insert, int flags)
+ struct bkey_i *insert,
+ bch_str_hash_flags_t str_hash_flags)
{
u32 snapshot;
int ret;
@@ -319,7 +330,7 @@ int bch2_hash_set(struct btree_trans *trans,
insert->k.p.inode = inum.inum;
return bch2_hash_set_snapshot(trans, desc, info, inum,
- snapshot, insert, flags, 0);
+ snapshot, insert, str_hash_flags, 0);
}
static __always_inline
diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c
index fccd25aa..1cbf9e3a 100644
--- a/libbcachefs/subvolume.c
+++ b/libbcachefs/subvolume.c
@@ -89,7 +89,7 @@ int bch2_check_subvols(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_subvol(trans, &iter, k)));
if (ret)
bch_err_fn(c, ret);
@@ -219,7 +219,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
BTREE_ITER_CACHED, &s)) ?:
for_each_btree_key_commit(trans, iter,
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_subvolume_reparent(trans, &iter, k,
subvolid_to_delete, le32_to_cpu(s.parent)));
}
@@ -256,7 +256,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{
return bch2_subvolumes_reparent(trans, subvolid) ?:
- commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_subvolume_delete(trans, subvolid));
}
diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h
index 86833445..2d2e66a4 100644
--- a/libbcachefs/subvolume_types.h
+++ b/libbcachefs/subvolume_types.h
@@ -20,7 +20,7 @@ struct snapshot_t {
};
struct snapshot_table {
- struct snapshot_t s[0];
+ DECLARE_FLEX_ARRAY(struct snapshot_t, s);
};
typedef struct {
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 9b9f36af..f4cad903 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -1183,7 +1183,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "Created:");
prt_tab(out);
if (sb->time_base_lo)
- bch2_prt_date_seconds(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
+ bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
else
prt_printf(out, "(not set)");
prt_newline(out);
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 24672bb3..bb945108 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -641,7 +641,9 @@ static int bch2_fs_online(struct bch_fs *c)
ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
kobject_add(&c->internal, &c->kobj, "internal") ?:
kobject_add(&c->opts_dir, &c->kobj, "options") ?:
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+#endif
kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
bch2_opts_create_sysfs_files(&c->opts_dir);
if (ret) {
@@ -750,7 +752,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
- c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal];
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
bch2_fs_btree_cache_init_early(&c->btree_cache);
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
index 893304a1..78576711 100644
--- a/libbcachefs/trace.h
+++ b/libbcachefs/trace.h
@@ -196,10 +196,9 @@ DEFINE_EVENT(bio, journal_write,
TRACE_EVENT(journal_reclaim_start,
TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
u64 min_nr, u64 min_key_cache,
- u64 prereserved, u64 prereserved_total,
u64 btree_cache_dirty, u64 btree_cache_total,
u64 btree_key_cache_dirty, u64 btree_key_cache_total),
- TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total,
+ TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
btree_cache_dirty, btree_cache_total,
btree_key_cache_dirty, btree_key_cache_total),
@@ -209,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start,
__field(bool, kicked )
__field(u64, min_nr )
__field(u64, min_key_cache )
- __field(u64, prereserved )
- __field(u64, prereserved_total )
__field(u64, btree_cache_dirty )
__field(u64, btree_cache_total )
__field(u64, btree_key_cache_dirty )
@@ -223,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start,
__entry->kicked = kicked;
__entry->min_nr = min_nr;
__entry->min_key_cache = min_key_cache;
- __entry->prereserved = prereserved;
- __entry->prereserved_total = prereserved_total;
__entry->btree_cache_dirty = btree_cache_dirty;
__entry->btree_cache_total = btree_cache_total;
__entry->btree_key_cache_dirty = btree_key_cache_dirty;
__entry->btree_key_cache_total = btree_key_cache_total;
),
- TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+ TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->direct,
__entry->kicked,
__entry->min_nr,
__entry->min_key_cache,
- __entry->prereserved,
- __entry->prereserved_total,
__entry->btree_cache_dirty,
__entry->btree_cache_total,
__entry->btree_key_cache_dirty,
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 7ba5df4e..2ff9cdfb 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -315,6 +315,57 @@ int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task)
return ret;
}
+#ifndef __KERNEL__
+#include <time.h>
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ time_t t = sec;
+ char buf[64];
+ ctime_r(&t, buf);
+ prt_str(out, buf);
+}
+#else
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ char buf[64];
+ snprintf(buf, sizeof(buf), "%ptT", &sec);
+ prt_u64(out, sec);
+}
+#endif
+
+static const struct time_unit {
+ const char *name;
+ u64 nsecs;
+} time_units[] = {
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "s", NSEC_PER_SEC },
+ { "m", (u64) NSEC_PER_SEC * 60},
+ { "h", (u64) NSEC_PER_SEC * 3600},
+ { "eon", U64_MAX },
+};
+
+static const struct time_unit *pick_time_units(u64 ns)
+{
+ const struct time_unit *u;
+
+ for (u = time_units;
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
+ ns >= u[1].nsecs << 1;
+ u++)
+ ;
+
+ return u;
+}
+
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
+{
+ const struct time_unit *u = pick_time_units(ns);
+
+ prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+}
+
/* time stats: */
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
@@ -359,6 +410,7 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
stats->max_duration = max(stats->max_duration, duration);
stats->min_duration = min(stats->min_duration, duration);
+ stats->total_duration += duration;
bch2_quantiles_update(&stats->quantiles, duration);
}
@@ -372,20 +424,24 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
}
}
+static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct bch2_time_stat_buffer *b)
+{
+ for (struct bch2_time_stat_buffer_entry *i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ bch2_time_stats_update_one(stats, i->start, i->end);
+ b->nr = 0;
+}
+
static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct bch2_time_stat_buffer *b)
{
- struct bch2_time_stat_buffer_entry *i;
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
- for (i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- bch2_time_stats_update_one(stats, i->start, i->end);
+ __bch2_time_stats_clear_buffer(stats, b);
spin_unlock_irqrestore(&stats->lock, flags);
-
- b->nr = 0;
}
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
@@ -423,40 +479,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
preempt_enable();
}
}
-#endif
-
-static const struct time_unit {
- const char *name;
- u64 nsecs;
-} time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "s", NSEC_PER_SEC },
- { "m", (u64) NSEC_PER_SEC * 60},
- { "h", (u64) NSEC_PER_SEC * 3600},
- { "eon", U64_MAX },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
- const struct time_unit *u;
-
- for (u = time_units;
- u + 1 < time_units + ARRAY_SIZE(time_units) &&
- ns >= u[1].nsecs << 1;
- u++)
- ;
-
- return u;
-}
-
-void bch2_pr_time_units(struct printbuf *out, u64 ns)
-{
- const struct time_unit *u = pick_time_units(ns);
-
- prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
-}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
@@ -467,26 +489,6 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
prt_printf(out, "%s", u->name);
}
-#ifndef __KERNEL__
-#include <time.h>
-void bch2_prt_date_seconds(struct printbuf *out, time64_t sec)
-{
- time_t t = sec;
- char buf[64];
- ctime_r(&t, buf);
- prt_str(out, buf);
-}
-#else
-void bch2_prt_date_seconds(struct printbuf *out, time64_t sec)
-{
- char buf[64];
- snprintf(buf, sizeof(buf), "%ptT", &sec);
- prt_u64(out, sec);
-}
-#endif
-
-#define TABSTOP_SIZE 12
-
static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
{
prt_str(out, name);
@@ -495,12 +497,24 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
prt_newline(out);
}
+#define TABSTOP_SIZE 12
+
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
{
const struct time_unit *u;
s64 f_mean = 0, d_mean = 0;
u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
int i;
+
+ if (stats->buffer) {
+ int cpu;
+
+ spin_lock_irq(&stats->lock);
+ for_each_possible_cpu(cpu)
+ __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+ spin_unlock_irq(&stats->lock);
+ }
+
/*
* avoid divide by zero
*/
@@ -546,6 +560,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
pr_name_and_units(out, "min:", stats->min_duration);
pr_name_and_units(out, "max:", stats->max_duration);
+ pr_name_and_units(out, "total:", stats->total_duration);
prt_printf(out, "mean:");
prt_tab(out);
@@ -603,6 +618,9 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
last_q = q;
}
}
+#else
+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
+#endif
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 0595605e..54e309d9 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -244,7 +244,7 @@ do { \
#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__)
void bch2_pr_time_units(struct printbuf *, u64);
-void bch2_prt_date_seconds(struct printbuf *, time64_t);
+void bch2_prt_datetime(struct printbuf *, time64_t);
#ifdef __KERNEL__
static inline void uuid_unparse_lower(u8 *uuid, char *out)
@@ -372,8 +372,9 @@ struct bch2_time_stat_buffer {
struct bch2_time_stats {
spinlock_t lock;
/* all fields are in nanoseconds */
- u64 max_duration;
u64 min_duration;
+ u64 max_duration;
+ u64 total_duration;
u64 max_freq;
u64 min_freq;
u64 last_event;
@@ -388,15 +389,39 @@ struct bch2_time_stats {
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-#endif
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
{
__bch2_time_stats_update(stats, start, local_clock());
}
+static inline bool track_event_change(struct bch2_time_stats *stats,
+ u64 *start, bool v)
+{
+ if (v != !!*start) {
+ if (!v) {
+ bch2_time_stats_update(stats, *start);
+ *start = 0;
+ } else {
+ *start = local_clock() ?: 1;
+ return true;
+ }
+ }
+
+ return false;
+}
+#else
+static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
+static inline bool track_event_change(struct bch2_time_stats *stats,
+ u64 *start, bool v)
+{
+ bool ret = v && !*start;
+ *start = v;
+ return ret;
+}
+#endif
+
void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);