summaryrefslogtreecommitdiff
path: root/0005-btrfs-shrink-delalloc-pages-instead-of-full-inodes.patch
diff options
context:
space:
mode:
Diffstat (limited to '0005-btrfs-shrink-delalloc-pages-instead-of-full-inodes.patch')
-rw-r--r--0005-btrfs-shrink-delalloc-pages-instead-of-full-inodes.patch184
1 files changed, 184 insertions, 0 deletions
diff --git a/0005-btrfs-shrink-delalloc-pages-instead-of-full-inodes.patch b/0005-btrfs-shrink-delalloc-pages-instead-of-full-inodes.patch
new file mode 100644
index 0000000..0d02fab
--- /dev/null
+++ b/0005-btrfs-shrink-delalloc-pages-instead-of-full-inodes.patch
@@ -0,0 +1,184 @@
+From 0397aa4e6205543d9b3da11794037e94f8735867 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Mon, 4 Jan 2021 15:24:11 -0500
+Subject: btrfs: shrink delalloc pages instead of full inodes
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Commit 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
+shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing
+some infrastructure we have in place to flush inodes that we use for
+device replace and snapshot. However this introduced a pretty serious
+performance regression. The root cause is because before we would
+generally use the normal writeback path to reclaim delalloc space, and
+for this we would provide it with the number of pages we wanted to
+flush. The referenced commit changed this to flush that many inodes,
+which drastically increased the amount of space we were flushing in
+certain cases, which severely affected performance.
+
+We cannot revert this patch unfortunately, because Filipe has another
+fix that requires the ability to skip flushing inodes that are being
+cloned in certain scenarios, which means we need to keep using our
+flushing infrastructure or risk re-introducing the deadlock.
+
+Instead to fix this problem we can go back to providing
+btrfs_start_delalloc_roots with a number of pages to flush, and then set
+up a writeback_control and utilize sync_inode() to handle the flushing
+for us. This gives us the same behavior we had prior to the fix, while
+still allowing us to avoid the deadlock that was fixed by Filipe. The
+user reported reproducer was a simple untarring of a large tarball of
+the source code for Firefox. The results are as follows
+
+5.9 0m54.258s
+5.10 1m26.212s
+Patch 0m38.800s
+
+We are significantly faster because of the work I did around improving
+ENOSPC flushing in 5.10 and 5.11, so reverting to the previous write out
+behavior gave us a pretty big boost.
+
+CC: stable@vger.kernel.org # 5.10
+Reported-by: René Rebe <rene@exactcode.de>
+Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+---
+ fs/btrfs/inode.c | 60 ++++++++++++++++++++++++++++++++++++---------------
+ fs/btrfs/space-info.c | 4 +++-
+ 2 files changed, 46 insertions(+), 18 deletions(-)
+
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 07479250221d..acc47e2ffb46 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -9389,7 +9389,8 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
+ * some fairly slow code that needs optimization. This walks the list
+ * of all the inodes with pending delalloc and forces them to disk.
+ */
+-static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot,
++static int start_delalloc_inodes(struct btrfs_root *root,
++ struct writeback_control *wbc, bool snapshot,
+ bool in_reclaim_context)
+ {
+ struct btrfs_inode *binode;
+@@ -9398,6 +9399,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot
+ struct list_head works;
+ struct list_head splice;
+ int ret = 0;
++ bool full_flush = wbc->nr_to_write == LONG_MAX;
+
+ INIT_LIST_HEAD(&works);
+ INIT_LIST_HEAD(&splice);
+@@ -9426,18 +9428,24 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot
+ if (snapshot)
+ set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
+ &binode->runtime_flags);
+- work = btrfs_alloc_delalloc_work(inode);
+- if (!work) {
+- iput(inode);
+- ret = -ENOMEM;
+- goto out;
+- }
+- list_add_tail(&work->list, &works);
+- btrfs_queue_work(root->fs_info->flush_workers,
+- &work->work);
+- if (*nr != U64_MAX) {
+- (*nr)--;
+- if (*nr == 0)
++ if (full_flush) {
++ work = btrfs_alloc_delalloc_work(inode);
++ if (!work) {
++ iput(inode);
++ ret = -ENOMEM;
++ goto out;
++ }
++ list_add_tail(&work->list, &works);
++ btrfs_queue_work(root->fs_info->flush_workers,
++ &work->work);
++ } else {
++ ret = sync_inode(inode, wbc);
++ if (!ret &&
++ test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
++ &BTRFS_I(inode)->runtime_flags))
++ ret = sync_inode(inode, wbc);
++ btrfs_add_delayed_iput(inode);
++ if (ret || wbc->nr_to_write <= 0)
+ goto out;
+ }
+ cond_resched();
+@@ -9463,18 +9471,29 @@ out:
+
+ int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
+ {
++ struct writeback_control wbc = {
++ .nr_to_write = LONG_MAX,
++ .sync_mode = WB_SYNC_NONE,
++ .range_start = 0,
++ .range_end = LLONG_MAX,
++ };
+ struct btrfs_fs_info *fs_info = root->fs_info;
+- u64 nr = U64_MAX;
+
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ return -EROFS;
+
+- return start_delalloc_inodes(root, &nr, true, false);
++ return start_delalloc_inodes(root, &wbc, true, false);
+ }
+
+ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+ bool in_reclaim_context)
+ {
++ struct writeback_control wbc = {
++ .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr,
++ .sync_mode = WB_SYNC_NONE,
++ .range_start = 0,
++ .range_end = LLONG_MAX,
++ };
+ struct btrfs_root *root;
+ struct list_head splice;
+ int ret;
+@@ -9488,6 +9507,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_init(&fs_info->delalloc_roots, &splice);
+ while (!list_empty(&splice) && nr) {
++ /*
++ * Reset nr_to_write here so we know that we're doing a full
++ * flush.
++ */
++ if (nr == U64_MAX)
++ wbc.nr_to_write = LONG_MAX;
++
+ root = list_first_entry(&splice, struct btrfs_root,
+ delalloc_root);
+ root = btrfs_grab_root(root);
+@@ -9496,9 +9522,9 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+ &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+- ret = start_delalloc_inodes(root, &nr, false, in_reclaim_context);
++ ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
+ btrfs_put_root(root);
+- if (ret < 0)
++ if (ret < 0 || wbc.nr_to_write <= 0)
+ goto out;
+ spin_lock(&fs_info->delalloc_root_lock);
+ }
+diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
+index 67e55c5479b8..e8347461c8dd 100644
+--- a/fs/btrfs/space-info.c
++++ b/fs/btrfs/space-info.c
+@@ -532,7 +532,9 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
+
+ loops = 0;
+ while ((delalloc_bytes || dio_bytes) && loops < 3) {
+- btrfs_start_delalloc_roots(fs_info, items, true);
++ u64 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
++
++ btrfs_start_delalloc_roots(fs_info, nr_pages, true);
+
+ loops++;
+ if (wait_ordered && !trans) {
+--
+cgit v1.2.3-1-gf6bb5
+