aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorOGAWA Hirofumi <hirofumi@mail.parknet.co.jp>2014-03-22 04:54:05 +0900
committerDaniel Phillips <daniel@tux3.org>2014-03-22 04:54:05 +0900
commit9b8da2c37b75acb68de614ce9a0340303db92226 (patch)
tree2fa4c9e116295dd6626e81a2ef84ba202645dd02
parent6c7a4b82b346010288d15f9addf11d7116d777e3 (diff)
downloadlinux-tux3-9b8da2c37b75acb68de614ce9a0340303db92226.tar.gz
tux3: Optimize deferred inums search
For now, we are using inode hash to find, deferred allocated inums. But, loop of find_inode() consumes cpu and slow on some situations. To fix, this replaces inode hash with simple deferred inums bitmap. struct tux3_idefer_map is hash table. struct tux3_idefer_node is bitmap for BITMAP_SIZE bits. "node" has index and count, and hashed into "map". With this simple bitmap, we can find free inum (skip deferred inums) by find_next_zero_bit(). And dbench become about from 50MB/s to 400 MB/s. (For now, it seems to be fast) Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
-rw-r--r--fs/tux3/commit.c23
-rw-r--r--fs/tux3/inode.c82
-rw-r--r--fs/tux3/inode_defer.c218
-rw-r--r--fs/tux3/super.c13
-rw-r--r--fs/tux3/tux3.h8
5 files changed, 284 insertions, 60 deletions
diff --git a/fs/tux3/commit.c b/fs/tux3/commit.c
index b64e61782bad2..f54a0fc33194e 100644
--- a/fs/tux3/commit.c
+++ b/fs/tux3/commit.c
@@ -26,7 +26,7 @@ static void schedule_flush_delta(struct sb *sb);
#define ALLOW_FRONTEND_MODIFY
/* Initialize the lock and list */
-static void init_sb(struct sb *sb)
+static int init_sb(struct sb *sb)
{
int i;
@@ -53,6 +53,12 @@ static void init_sb(struct sb *sb)
/* Initialize sb_delta_dirty */
for (i = 0; i < ARRAY_SIZE(sb->s_ddc); i++)
INIT_LIST_HEAD(&sb->s_ddc[i].dirty_inodes);
+
+ sb->idefer_map = tux3_alloc_idefer_map();
+ if (!sb->idefer_map)
+ return -ENOMEM;
+
+ return 0;
}
static void setup_roots(struct sb *sb, struct disksuper *super)
@@ -115,10 +121,17 @@ static void __setup_sb(struct sb *sb, struct disksuper *super)
}
/* Initialize and setup sb by on-disk super block */
-void setup_sb(struct sb *sb, struct disksuper *super)
+int setup_sb(struct sb *sb, struct disksuper *super)
{
- init_sb(sb);
+ int err;
+
+ err = init_sb(sb);
+ if (err)
+ return err;
+
__setup_sb(sb, super);
+
+ return 0;
}
/* Load on-disk super block, and call setup_sb() with it */
@@ -128,7 +141,9 @@ int load_sb(struct sb *sb)
int err;
/* At least initialize sb, even if load is failed */
- init_sb(sb);
+ err = init_sb(sb);
+ if (err)
+ return err;
err = devio(READ, sb_dev(sb), SB_LOC, super, SB_LEN);
if (err)
diff --git a/fs/tux3/inode.c b/fs/tux3/inode.c
index 70175efbf8a9e..3e3f9d1725330 100644
--- a/fs/tux3/inode.c
+++ b/fs/tux3/inode.c
@@ -90,6 +90,9 @@ struct inode *tux_new_inode(struct inode *dir, struct tux_iattr *iattr,
/*
* Deferred ileaf update for inode number allocation
*/
+
+#include "inode_defer.c"
+
/* must hold itree->btree.lock */
static int is_defer_alloc_inum(struct inode *inode)
{
@@ -97,41 +100,25 @@ static int is_defer_alloc_inum(struct inode *inode)
}
/* must hold itree->btree.lock */
-static int find_defer_alloc_inum(struct sb *sb, inum_t inum)
-{
- /*
- * FIXME: temporary hack. We should replace this by efficient
- * one something like bitmap.
- */
-#if 0
- struct tux3_inode *tuxnode;
-
- list_for_each_entry(tuxnode, &sb->alloc_inodes, alloc_list) {
- if (tuxnode->inum == inum)
- return 1;
- }
-#else
- struct inode *tmp = tux3_ilookup_nowait(sb, inum);
- if (tmp) {
- iput(tmp);
- return 1;
- }
-#endif
- return 0;
-}
-
-/* must hold itree->btree.lock */
-static void add_defer_alloc_inum(struct inode *inode)
+static int add_defer_alloc_inum(struct inode *inode)
{
/* FIXME: need to reserve space (ileaf/bnodes) for this inode? */
struct sb *sb = tux_sb(inode->i_sb);
- list_add_tail(&tux_inode(inode)->alloc_list, &sb->alloc_inodes);
+ struct tux3_inode *tuxnode = tux_inode(inode);
+ int err = tux3_idefer_add(sb->idefer_map, tuxnode->inum);
+ if (!err)
+ list_add_tail(&tuxnode->alloc_list, &sb->alloc_inodes);
+ return err;
}
/* must hold itree->btree.lock. FIXME: spinlock is enough? */
void del_defer_alloc_inum(struct inode *inode)
{
- list_del_init(&tux_inode(inode)->alloc_list);
+ struct sb *sb = tux_sb(inode->i_sb);
+ struct tux3_inode *tuxnode = tux_inode(inode);
+ if (!list_empty(&tuxnode->alloc_list))
+ tux3_idefer_del(sb->idefer_map, tuxnode->inum);
+ list_del_init(&tuxnode->alloc_list);
}
void cancel_defer_alloc_inum(struct inode *inode)
@@ -246,28 +233,18 @@ static int alloc_inum(struct inode *inode, inum_t goal)
down_write(&cursor->btree->lock);
while (1) {
+ inum_t orig;
+
err = find_free_inum(cursor, goal, &goal);
if (err)
goto error;
- /*
- * Is this inum already used by deferred inum allocation?
- *
- * FIXME: Can be nfsd race happened, or fs corruption.
- * And we would want to move this outside btree->lock.
- */
- if (insert_inode_locked4(inode, goal, tux_test, &goal) >= 0)
+ /* Skip deferred allocate inums */
+ orig = goal;
+ goal = tux3_idefer_find_free(sb->idefer_map, orig);
+ if (orig == goal)
break;
-
- /*
- * Skip deferred allocate inums.
- *
- * FIXME: This is inefficient, we should replace with
- * better way.
- */
- goal++;
- while (find_defer_alloc_inum(sb, goal))
- goal++;
+ /* If goal marked as deferred inums, retry from itree */
}
init_btree(&tux_inode(inode)->btree, sb, no_root, dtree_ops());
@@ -276,7 +253,9 @@ static int alloc_inum(struct inode *inode, inum_t goal)
tux_set_inum(inode, goal);
tux_setup_inode(inode);
- add_defer_alloc_inum(inode);
+ err = add_defer_alloc_inum(inode);
+ if (err)
+ goto error;
/*
* If inum is not reserved area, account it. If inum is
@@ -316,25 +295,22 @@ static void tux_assign_inum_failed(struct inode *inode)
int tux_assign_inum(struct inode *inode, inum_t goal)
{
+ inum_t inum;
int err;
err = alloc_inum(inode, goal);
if (err)
goto error;
-#if 0
- /*
- * FIXME: temporary hack. We shouldn't insert inode to hash
- * in alloc_inum before initializing completely.
- */
- inum_t inum = tux_inode(inode)->inum;
+
+ inum = tux_inode(inode)->inum;
if (insert_inode_locked4(inode, inum, tux_test, &inum) < 0) {
/* Can be nfsd race happened, or fs corruption. */
- tux3_warn(tux_sb(dir->i_sb), "inode insert error: inum %Lx",
+ tux3_warn(tux_sb(inode->i_sb), "inode insert error: inum %Lx",
inum);
err = -EIO;
goto error;
}
-#endif
+
/* The inode was hashed, we can use deferred deletion from here */
/*
diff --git a/fs/tux3/inode_defer.c b/fs/tux3/inode_defer.c
new file mode 100644
index 0000000000000..2acf9f187c39e
--- /dev/null
+++ b/fs/tux3/inode_defer.c
@@ -0,0 +1,218 @@
+/*
+ * Deferred inum allocation management.
+ *
+ * This uses bitmap to find current in-flight deferred inum
+ * allocation. For each the node represent BITMAP_SIZE bits, and the
+ * node is hashed on struct tux3_idefer_map.
+ */
+
+/* Bitmap node hash size */
+#define NODE_HASH_BITS 8
+#define NODE_HASH_SIZE (1 << NODE_HASH_BITS)
+
+/* Bitmap size in byte and bit */
+#define BITMAP_BYTE_SHIFT 6
+#define BITMAP_BYTE_SIZE (1 << BITMAP_BYTE_SHIFT)
+#define BITMAP_SHIFT (BITMAP_BYTE_SHIFT + 3)
+#define BITMAP_SIZE (1 << BITMAP_SHIFT)
+#define BITMAP_MASK (BITMAP_SIZE - 1)
+
+struct tux3_idefer_map {
+ struct hlist_head heads[NODE_HASH_SIZE];
+};
+
+struct tux3_idefer_node {
+ struct hlist_node link;
+ block_t index;
+ unsigned count;
+ unsigned long bitmap[BITMAP_SIZE / sizeof(unsigned long)];
+};
+
+static struct kmem_cache *tux3_idefer_node_cachep;
+
+struct tux3_idefer_map *tux3_alloc_idefer_map(void)
+{
+ struct tux3_idefer_map *map;
+
+ map = malloc(sizeof(*map));
+ if (map) {
+ int i;
+ for (i = 0; i < NODE_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&map->heads[i]);
+ }
+ return map;
+}
+
+void tux3_free_idefer_map(struct tux3_idefer_map *map)
+{
+ if (map) {
+ int i;
+ for (i = 0; i < NODE_HASH_SIZE; i++)
+ assert(hlist_empty(&map->heads[i]));
+ free(map);
+ }
+}
+
+static void tux3_idefer_init_once(void *mem)
+{
+ struct tux3_idefer_node *node = mem;
+
+ INIT_HLIST_NODE(&node->link);
+ node->count = 0;
+ memset(node->bitmap, 0, sizeof(node->bitmap));
+}
+
+int __init tux3_init_idefer_cache(void)
+{
+ tux3_idefer_node_cachep = kmem_cache_create("tux3_idefer_node",
+ sizeof(struct tux3_idefer_node), 0,
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ tux3_idefer_init_once);
+ if (tux3_idefer_node_cachep == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void tux3_destroy_idefer_cache(void)
+{
+ kmem_cache_destroy(tux3_idefer_node_cachep);
+}
+
+static struct tux3_idefer_node *idefer_alloc_node(block_t index)
+{
+ struct tux3_idefer_node *node;
+
+ node = kmem_cache_alloc(tux3_idefer_node_cachep, GFP_NOFS);
+ if (!node)
+ return NULL;
+
+ node->index = index;
+
+ return node;
+}
+
+static void idefer_free_node(struct tux3_idefer_node *node)
+{
+ kmem_cache_free(tux3_idefer_node_cachep, node);
+}
+
+static inline unsigned idefer_hash(block_t index)
+{
+ return hash_64(index, NODE_HASH_BITS);
+}
+
+static struct tux3_idefer_node *
+idefer_find_node(struct hlist_head *head, block_t index,
+ struct hlist_node **prev)
+{
+ struct tux3_idefer_node *node;
+
+ if (prev)
+ *prev = NULL;
+
+ hlist_for_each_entry(node, head, link) {
+ if (node->index < index) {
+ if (prev)
+ *prev = &node->link;
+ continue;
+ }
+ if (node->index == index)
+ return node;
+ if (node->index > index)
+ return NULL;
+ }
+
+ return NULL;
+}
+
+/* Set a bit for deferred inum */
+static int tux3_idefer_add(struct tux3_idefer_map *map, inum_t inum)
+{
+ block_t index = inum >> BITMAP_SHIFT;
+ unsigned offset = inum & BITMAP_MASK;
+ struct hlist_head *head = map->heads + idefer_hash(index);
+ struct tux3_idefer_node *node;
+ struct hlist_node *prev = NULL;
+
+ node = idefer_find_node(head, index, &prev);
+ if (!node) {
+ node = idefer_alloc_node(index);
+ if (!node)
+ return -ENOMEM;
+ if (prev)
+ hlist_add_after(prev, &node->link);
+ else
+ hlist_add_head(&node->link, head);
+ }
+
+ assert(!test_bit(offset, node->bitmap));
+ __set_bit(offset, node->bitmap);
+ node->count++;
+
+ return 0;
+}
+
+/* Clear a bit for deferred inum */
+static void tux3_idefer_del(struct tux3_idefer_map *map, inum_t inum)
+{
+ block_t index = inum >> BITMAP_SHIFT;
+ unsigned offset = inum & BITMAP_MASK;
+ struct hlist_head *head = map->heads + idefer_hash(index);
+ struct tux3_idefer_node *node;
+
+ node = idefer_find_node(head, index, NULL);
+ assert(node);
+ assert(test_bit(offset, node->bitmap));
+ __clear_bit(offset, node->bitmap);
+ node->count--;
+
+ if (node->count == 0) {
+ hlist_del(&node->link);
+ idefer_free_node(node);
+ }
+}
+
+/* Find free inum except deferred inums from specified range */
+static inum_t find_free(struct tux3_idefer_map *map, inum_t inum, inum_t limit)
+{
+ block_t limit_index = (limit + BITMAP_MASK) >> BITMAP_SHIFT;
+ block_t index = inum >> BITMAP_SHIFT;
+ unsigned offset = inum & BITMAP_MASK;
+
+ while (index < limit_index) {
+ struct hlist_head *head = map->heads + idefer_hash(index);
+ struct tux3_idefer_node *node;
+
+ node = idefer_find_node(head, index, NULL);
+ if (!node)
+ goto found;
+
+ if (node->count != BITMAP_SIZE) {
+ offset = find_next_zero_bit(node->bitmap, BITMAP_SIZE,
+ offset);
+ if (offset < BITMAP_SIZE)
+ goto found;
+ }
+
+ index++;
+ offset = 0;
+ }
+
+ return TUX_INVALID_INO;
+
+found:
+ return (index << BITMAP_SHIFT) + offset;
+}
+
+/* Find free inum except deferred inums */
+static inum_t tux3_idefer_find_free(struct tux3_idefer_map *map, inum_t start)
+{
+ inum_t inum;
+
+ inum = find_free(map, start, TUXKEY_LIMIT);
+ if (inum == TUX_INVALID_INO)
+ inum = find_free(map, TUX_NORMAL_INO, start);
+
+ assert(inum != TUX_INVALID_INO);
+ return inum;
+}
diff --git a/fs/tux3/super.c b/fs/tux3/super.c
index fb66234920475..140bcd593a43f 100644
--- a/fs/tux3/super.c
+++ b/fs/tux3/super.c
@@ -128,6 +128,8 @@ static void __tux3_put_super(struct sb *sbi)
/* Cleanup flusher after inode was evicted */
tux3_exit_flusher(sbi);
+ tux3_free_idefer_map(sbi->idefer_map);
+ sbi->idefer_map = NULL;
/* FIXME: add more sanity check */
assert(list_empty(&sbi->alloc_inodes));
assert(link_empty(&sbi->forked_buffers));
@@ -533,6 +535,10 @@ static int __init init_tux3(void)
if (err)
goto error_hole;
+ err = tux3_init_idefer_cache();
+ if (err)
+ goto error_idefer;
+
err = register_filesystem(&tux3_fs_type);
if (err)
goto error_fs;
@@ -540,9 +546,11 @@ static int __init init_tux3(void)
return 0;
error_fs:
- tux3_destroy_inodecache();
-error_hole:
+ tux3_destroy_idefer_cache();
+error_idefer:
tux3_destroy_hole_cache();
+error_hole:
+ tux3_destroy_inodecache();
error:
return err;
}
@@ -550,6 +558,7 @@ error:
static void __exit exit_tux3(void)
{
unregister_filesystem(&tux3_fs_type);
+ tux3_destroy_idefer_cache();
tux3_destroy_hole_cache();
tux3_destroy_inodecache();
}
diff --git a/fs/tux3/tux3.h b/fs/tux3/tux3.h
index b10dc6fd8189b..a9b9eff8148c7 100644
--- a/fs/tux3/tux3.h
+++ b/fs/tux3/tux3.h
@@ -232,6 +232,7 @@ struct countmap_pin {
struct buffer_head *buffer;
};
+struct tux3_idefer_map;
/* Tux3-specific sb is a handle for the entire volume state */
struct sb {
union {
@@ -310,6 +311,7 @@ struct sb {
*/
spinlock_t countmap_lock;
struct countmap_pin countmap_pin;
+ struct tux3_idefer_map *idefer_map;
struct list_head alloc_inodes; /* deferred inum allocation inodes */
spinlock_t forked_buffers_lock;
@@ -761,7 +763,7 @@ int replay_bnode_del(struct replay *rp, block_t bnode, tuxkey_t key, unsigned co
int replay_bnode_adjust(struct replay *rp, block_t bnode, tuxkey_t from, tuxkey_t to);
/* commit.c */
-void setup_sb(struct sb *sb, struct disksuper *super);
+int setup_sb(struct sb *sb, struct disksuper *super);
int load_sb(struct sb *sb);
int save_sb(struct sb *sb);
void tux3_start_backend(struct sb *sb);
@@ -846,6 +848,10 @@ struct inode *tux_new_volmap(struct sb *sb);
struct inode *tux_new_logmap(struct sb *sb);
struct inode *tux_new_inode(struct inode *dir, struct tux_iattr *iattr,
dev_t rdev);
+struct tux3_idefer_map *tux3_alloc_idefer_map(void);
+void tux3_free_idefer_map(struct tux3_idefer_map *map);
+int __init tux3_init_idefer_cache(void);
+void tux3_destroy_idefer_cache(void);
void del_defer_alloc_inum(struct inode *inode);
void cancel_defer_alloc_inum(struct inode *inode);
int tux_assign_inum(struct inode *inode, inum_t goal);