ext4: do not mark inode dirty every time when appending using delalloc

In the delalloc append write scenario, if inode's i_size is extended due to buffer write, there are delalloc writes pending in the range up to i_size, and no need to touch i_disksize since writeback will push i_disksize up to i_size eventually. Offers significant performance improvement in high-frequency append write scenarios. I conducted tests in my 32-core environment by launching 32 concurrent threads to append write to the same file. Each write operation had a length of 1024 bytes and was repeated 100000 times. Without using this patch, the test was completed in 7705 ms. However, with this patch, the test was completed in 5066 ms, resulting in a performance improvement of 34%. Moreover, in test scenarios of Kafka version 2.6.2, using packet size of 2K, with this patch resulted in a 10% performance improvement. Signed-off-by: Liu Song <liusong@linux.alibaba.com> Suggested-by: Jan Kara <jack@suse.cz> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20230810154333.84921-1-liusong@linux.alibaba.com Signed-off-by: Theodore Ts'o <tytso@mit.edu>
author: Liu Song <liusong@linux.alibaba.com> 2023-08-10 23:43:33 +0800
committer: Theodore Ts'o <tytso@mit.edu> 2023-08-27 11:27:13 -0400
commit: 03de20bed203b0819225d4de98353c1f8755a1dd (patch)
tree: 1e0695f0f3e30275bc009d6f90816a1b895659de /fs/ext4/inode.c
parent: bb15cea20f211e110150e528fca806f38d5789e0 (diff)
download: linux-03de20bed203b0819225d4de98353c1f8755a1dd.tar.gz
1 files changed, 62 insertions, 26 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1b9003840bc16..c5d8f8933c8c6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2935,14 +2935,73 @@ static int ext4_da_should_update_i_disksize(struct folio *folio,
 	return 1;
 }
 
+static int ext4_da_do_write_end(struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page)
+{
+	struct inode *inode = mapping->host;
+	loff_t old_size = inode->i_size;
+	bool disksize_changed = false;
+	loff_t new_i_size;
+
+	/*
+	 * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
+	 * flag, which all that's needed to trigger page writeback.
+	 */
+	copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL);
+	new_i_size = pos + copied;
+
+	/*
+	 * It's important to update i_size while still holding page lock,
+	 * because page writeout could otherwise come in and zero beyond
+	 * i_size.
+	 *
+	 * Since we are holding inode lock, we are sure i_disksize <=
+	 * i_size. We also know that if i_disksize < i_size, there are
+	 * delalloc writes pending in the range up to i_size. If the end of
+	 * the current write is <= i_size, there's no need to touch
+	 * i_disksize since writeback will push i_disksize up to i_size
+	 * eventually. If the end of the current write is > i_size and
+	 * inside an allocated block which ext4_da_should_update_i_disksize()
+	 * checked, we need to update i_disksize here as certain
+	 * ext4_writepages() paths not allocating blocks and update i_disksize.
+	 */
+	if (new_i_size > inode->i_size) {
+		unsigned long end;
+
+		i_size_write(inode, new_i_size);
+		end = (new_i_size - 1) & (PAGE_SIZE - 1);
+		if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) {
+			ext4_update_i_disksize(inode, new_i_size);
+			disksize_changed = true;
+		}
+	}
+
+	unlock_page(page);
+	put_page(page);
+
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
+
+	if (disksize_changed) {
+		handle_t *handle;
+
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		ext4_mark_inode_dirty(handle, inode);
+		ext4_journal_stop(handle);
+	}
+
+	return copied;
+}
+
 static int ext4_da_write_end(struct file *file,
 			     struct address_space *mapping,
 			     loff_t pos, unsigned len, unsigned copied,
 			     struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;
-	loff_t new_i_size;
-	unsigned long start, end;
 	int write_mode = (int)(unsigned long)fsdata;
 	struct folio *folio = page_folio(page);
 
@@ -2961,30 +3020,7 @@ static int ext4_da_write_end(struct file *file,
 	if (unlikely(copied < len) && !PageUptodate(page))
 		copied = 0;
 
-	start = pos & (PAGE_SIZE - 1);
-	end = start + copied - 1;
-
-	/*
-	 * Since we are holding inode lock, we are sure i_disksize <=
-	 * i_size. We also know that if i_disksize < i_size, there are
-	 * delalloc writes pending in the range upto i_size. If the end of
-	 * the current write is <= i_size, there's no need to touch
-	 * i_disksize since writeback will push i_disksize upto i_size
-	 * eventually. If the end of the current write is > i_size and
-	 * inside an allocated block (ext4_da_should_update_i_disksize()
-	 * check), we need to update i_disksize here as certain
-	 * ext4_writepages() paths not allocating blocks update i_disksize.
-	 *
-	 * Note that we defer inode dirtying to generic_write_end() /
-	 * ext4_da_write_inline_data_end().
-	 */
-	new_i_size = pos + copied;
-	if (copied && new_i_size > inode->i_size &&
-	    ext4_da_should_update_i_disksize(folio, end))
-		ext4_update_i_disksize(inode, new_i_size);
-
-	return generic_write_end(file, mapping, pos, len, copied, &folio->page,
-				 fsdata);
+	return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page);
 }
 
 /*
author	Liu Song <liusong@linux.alibaba.com>	2023-08-10 23:43:33 +0800
committer	Theodore Ts'o <tytso@mit.edu>	2023-08-27 11:27:13 -0400
commit	03de20bed203b0819225d4de98353c1f8755a1dd (patch)
tree	1e0695f0f3e30275bc009d6f90816a1b895659de /fs/ext4/inode.c
parent	bb15cea20f211e110150e528fca806f38d5789e0 (diff)
download	linux-03de20bed203b0819225d4de98353c1f8755a1dd.tar.gz