LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [RFC/RFT] [PATCH] EXT3: Retry allocation after journal commit
@ 2004-05-14  0:42 Theodore Ts'o
  2004-05-14  2:53 ` Andrew Morton
  0 siblings, 1 reply; 11+ messages in thread
From: Theodore Ts'o @ 2004-05-14  0:42 UTC (permalink / raw)
  To: linux-kernel, ext2-devel


It is possible for block allocation to fail, even if there is space in
the filesystem, because all of the free blocks were recently deleted and
so could not be allocated until after the currently running transaction
is committed.   This can result in a very strange and surprising result
where a system call such as a mkdir() will fail even though there is
plenty of disk space apparently available.

This patch detects the case where a system call is apparently going to
fail due to ENOSPC, even though there is space available, and forces the
currently running transaction to complete before retrying the operation.

Unfortunately, it is necessary to hit so many high-level routines,
instead of making a smaller change in a single low-level route, such as
ext3_new_block(), because if we try to wait for the currently running
transaction to complete while we are holding an active handle associated
with that transaction, it will result in a deadlock.  Hence the retry
has to happen outside of the journal_start().... journal_stop() code
path.

						- Ted


===== fs/ext3/balloc.c 1.20 vs edited =====
--- 1.20/fs/ext3/balloc.c	Mon Feb 23 00:24:13 2004
+++ edited/fs/ext3/balloc.c	Thu May 13 14:57:38 2004
@@ -465,6 +465,64 @@
 	return -1;
 }
 
+static int ext3_has_free_blocks(struct super_block *sb)
+{
+	struct ext3_super_block *es;
+	struct ext3_sb_info *sbi;
+	int free_blocks, root_blocks;
+
+	sbi = EXT3_SB(sb);
+	es = EXT3_SB(sb)->s_es;
+
+	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	root_blocks = le32_to_cpu(es->s_r_blocks_count);
+	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+		sbi->s_resuid != current->fsuid &&
+		(sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * This function is called when an allocation is failed, and will
+ * return true if it is profitable to retry an allocation.  There must
+ * *NOT* be a currently active handle, so the retry has to happen at
+ * the top-level ext3 function.  This is because this function will
+ * attempt to force the currently running transaction to the log, and
+ * block until it is completed.  If the current process is holding an
+ * active handle, this will cause a deadlock.
+ */
+int ext3_should_retry_alloc(struct super_block *sb)
+{
+	transaction_t *transaction = NULL;
+	journal_t *journal = EXT3_SB(sb)->s_journal;
+	tid_t tid;
+
+	if (!ext3_has_free_blocks(sb))
+		return 0;
+
+	spin_lock(&journal->j_state_lock);
+
+	/* Force everything buffered to the log... */
+	if (journal->j_running_transaction) {
+		transaction = journal->j_running_transaction;
+		__log_start_commit(journal, transaction->t_tid);
+	} else if (journal->j_committing_transaction)
+		transaction = journal->j_committing_transaction;
+
+	if (!transaction) {
+		spin_unlock(&journal->j_state_lock);
+		return 0;	/* Nothing to retry */
+	}
+		
+	tid = transaction->t_tid;
+	spin_unlock(&journal->j_state_lock);
+	log_wait_commit(journal, tid);
+
+	return 1;
+}
+
 /*
  * ext3_new_block uses a goal block to assist allocation.  If the goal is
  * free, or there is a free block within 32 blocks of the goal, that block
@@ -485,7 +543,8 @@
 	int target_block;			/* tmp */
 	int fatal = 0, err;
 	int performed_allocation = 0;
-	int free_blocks, root_blocks;
+	int free_blocks;
+	int num_free_bgroups = 0;
 	struct super_block *sb;
 	struct ext3_group_desc *gdp;
 	struct ext3_super_block *es;
@@ -512,11 +571,7 @@
 	es = EXT3_SB(sb)->s_es;
 	ext3_debug("goal=%lu.\n", goal);
 
-	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
-	root_blocks = le32_to_cpu(es->s_r_blocks_count);
-	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
-		sbi->s_resuid != current->fsuid &&
-		(sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+	if (!ext3_has_free_blocks(sb)) {
 		*errp = -ENOSPC;
 		goto out;
 	}
@@ -565,6 +620,7 @@
 		if (free_blocks <= 0)
 			continue;
 
+		num_free_bgroups++;
 		brelse(bitmap_bh);
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
@@ -575,6 +631,10 @@
 			goto out;
 		if (ret_block >= 0) 
 			goto allocated;
+	}
+
+	if (num_free_bgroups) {
+		jbd_debug(1, "%s: %d free blockgroups, but couldn't find free blocks\n", sb->s_id, num_free_bgroups);
 	}
 
 	/* No space left on the device */
===== fs/ext3/acl.c 1.16 vs edited =====
--- 1.16/fs/ext3/acl.c	Fri Mar 12 04:33:00 2004
+++ edited/fs/ext3/acl.c	Thu May 13 15:18:23 2004
@@ -428,7 +428,9 @@
 	error = posix_acl_chmod_masq(clone, inode->i_mode);
 	if (!error) {
 		handle_t *handle;
+		int retry = 0;
 
+	retry:
 		handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
@@ -437,6 +439,9 @@
 		}
 		error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
 		ext3_journal_stop(handle);
+		if (error == -ENOSPC && 
+		    ext3_should_retry_alloc(inode->i_sb) && retry++ <= 3)
+			goto retry;
 	}
 out:
 	posix_acl_release(clone);
@@ -516,7 +521,7 @@
 {
 	handle_t *handle;
 	struct posix_acl *acl;
-	int error;
+	int error, retry = 0;
 
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
@@ -535,11 +540,15 @@
 	} else
 		acl = NULL;
 
+retry:
 	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	error = ext3_set_acl(handle, inode, type, acl);
 	ext3_journal_stop(handle);
+	if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb) && 
+	    retry++ <= 3)
+		goto retry;
 
 release_and_out:
 	posix_acl_release(acl);
===== fs/ext3/inode.c 1.94 vs edited =====
--- 1.94/fs/ext3/inode.c	Thu Apr 22 19:20:51 2004
+++ edited/fs/ext3/inode.c	Thu May 13 15:12:45 2004
@@ -1080,8 +1080,10 @@
 {
 	struct inode *inode = page->mapping->host;
 	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+	int retry = 0;
 	handle_t *handle;
 
+retry:
 	handle = ext3_journal_start(inode, needed_blocks);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -1098,6 +1100,9 @@
 prepare_write_failed:
 	if (ret)
 		ext3_journal_stop(handle);
+	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb) && 
+	    retry++ <= 3)
+		goto retry;
 out:
 	return ret;
 }
===== fs/ext3/namei.c 1.51 vs edited =====
--- 1.51/fs/ext3/namei.c	Wed Apr 14 21:37:52 2004
+++ edited/fs/ext3/namei.c	Thu May 13 14:59:37 2004
@@ -1628,8 +1628,9 @@
 {
 	handle_t *handle; 
 	struct inode * inode;
-	int err;
+	int err, retry = 0;
 
+retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
 					2*EXT3_QUOTA_INIT_BLOCKS);
@@ -1648,6 +1649,9 @@
 		err = ext3_add_nondir(handle, dentry, inode);
 	}
 	ext3_journal_stop(handle);
+	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb) && 
+	    retry++ <= 3)
+		goto retry;
 	return err;
 }
 
@@ -1656,11 +1660,12 @@
 {
 	handle_t *handle;
 	struct inode *inode;
-	int err;
+	int err, retry = 0;
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
+retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
 			 		EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
 					2*EXT3_QUOTA_INIT_BLOCKS);
@@ -1680,6 +1685,9 @@
 		err = ext3_add_nondir(handle, dentry, inode);
 	}
 	ext3_journal_stop(handle);
+	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb) && 
+	    retry++ <= 3)
+		goto retry;
 	return err;
 }
 
@@ -1689,11 +1697,13 @@
 	struct inode * inode;
 	struct buffer_head * dir_block;
 	struct ext3_dir_entry_2 * de;
+	int retry = 0;
 	int err;
 
 	if (dir->i_nlink >= EXT3_LINK_MAX)
 		return -EMLINK;
 
+retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
 					2*EXT3_QUOTA_INIT_BLOCKS);
@@ -1751,6 +1761,9 @@
 	d_instantiate(dentry, inode);
 out_stop:
 	ext3_journal_stop(handle);
+	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb) && 
+	    retry++ <= 3)
+		goto retry;
 	return err;
 }
 
@@ -2085,12 +2098,13 @@
 {
 	handle_t *handle;
 	struct inode * inode;
-	int l, err;
+	int l, err, retry = 0;
 
 	l = strlen(symname)+1;
 	if (l > dir->i_sb->s_blocksize)
 		return -ENAMETOOLONG;
 
+retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
 			 		EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
 					2*EXT3_QUOTA_INIT_BLOCKS);
@@ -2129,6 +2143,9 @@
 	err = ext3_add_nondir(handle, dentry, inode);
 out_stop:
 	ext3_journal_stop(handle);
+	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb) && 
+	    retry++ <= 3)
+		goto retry;
 	return err;
 }
 
@@ -2137,11 +2154,12 @@
 {
 	handle_t *handle;
 	struct inode *inode = old_dentry->d_inode;
-	int err;
+	int err, retry = 0;
 
 	if (inode->i_nlink >= EXT3_LINK_MAX)
 		return -EMLINK;
 
+retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS);
 	if (IS_ERR(handle))
@@ -2156,6 +2174,9 @@
 
 	err = ext3_add_nondir(handle, dentry, inode);
 	ext3_journal_stop(handle);
+	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb) && 
+	    retry++ <= 3)
+		goto retry;
 	return err;
 }
 
===== fs/ext3/xattr.c 1.27 vs edited =====
--- 1.27/fs/ext3/xattr.c	Fri Feb  6 03:30:14 2004
+++ edited/fs/ext3/xattr.c	Thu May 13 15:18:39 2004
@@ -875,8 +875,9 @@
 	       const void *value, size_t value_len, int flags)
 {
 	handle_t *handle;
-	int error;
+	int error, retry = 0;
 
+retry:
 	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS);
 	if (IS_ERR(handle)) {
 		error = PTR_ERR(handle);
@@ -885,7 +886,13 @@
 
 		error = ext3_xattr_set_handle(handle, inode, name_index, name,
 					      value, value_len, flags);
+
 		error2 = ext3_journal_stop(handle);
+
+		if (error == -ENOSPC && 
+		    ext3_should_retry_alloc(inode->i_sb) && retry++ <= 3)
+			goto retry;
+
 		if (error == 0)
 			error = error2;
 	}
===== include/linux/ext3_fs.h 1.30 vs edited =====
--- 1.30/include/linux/ext3_fs.h	Fri Aug  1 05:31:19 2003
+++ edited/include/linux/ext3_fs.h	Thu May 13 14:58:51 2004
@@ -689,6 +689,7 @@
 extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
 						    unsigned int block_group,
 						    struct buffer_head ** bh);
+extern int ext3_should_retry_alloc(struct super_block *sb);
 
 /* dir.c */
 extern int ext3_check_dir_entry(const char *, struct inode *,

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC/RFT] [PATCH] EXT3: Retry allocation after journal commit
  2004-05-14  0:42 [RFC/RFT] [PATCH] EXT3: Retry allocation after journal commit Theodore Ts'o
@ 2004-05-14  2:53 ` Andrew Morton
  2004-05-14  4:37   ` [Ext2-devel] " Theodore Ts'o
  2004-05-19 22:04   ` question about ext3_find_goal with reservation Mingming Cao
  0 siblings, 2 replies; 11+ messages in thread
From: Andrew Morton @ 2004-05-14  2:53 UTC (permalink / raw)
  To: Theodore Ts'o; +Cc: linux-kernel, ext2-devel

"Theodore Ts'o" <tytso@mit.edu> wrote:
>
> It is possible for block allocation to fail, even if there is space in
>  the filesystem, because all of the free blocks were recently deleted and
>  so could not be allocated until after the currently running transaction
>  is committed.   This can result in a very strange and surprising result
>  where a system call such as a mkdir() will fail even though there is
>  plenty of disk space apparently available.

I merged a little patch for this into post-2.6.6, but that only addresses
prepare_write().

I wonder if there's much value in having ext3_has_free_blocks()?  We could
just retry three times even if the fs is really full?

I'd be inclined to pass a pointer to the `retry' counter into
ext3_should_retry_alloc() and implement the counting in there.  It'll tidy
things up a bit and consolidates that piece of policy in a single place.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Ext2-devel] Re: [RFC/RFT] [PATCH] EXT3: Retry allocation after journal commit
  2004-05-14  2:53 ` Andrew Morton
@ 2004-05-14  4:37   ` Theodore Ts'o
  2004-05-14  4:49     ` Andrew Morton
  2004-05-19 22:04   ` question about ext3_find_goal with reservation Mingming Cao
  1 sibling, 1 reply; 11+ messages in thread
From: Theodore Ts'o @ 2004-05-14  4:37 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, ext2-devel

On Thu, May 13, 2004 at 07:53:10PM -0700, Andrew Morton wrote:
> "Theodore Ts'o" <tytso@mit.edu> wrote:
> >
> > It is possible for block allocation to fail, even if there is space in
> >  the filesystem, because all of the free blocks were recently deleted and
> >  so could not be allocated until after the currently running transaction
> >  is committed.   This can result in a very strange and surprising result
> >  where a system call such as a mkdir() will fail even though there is
> >  plenty of disk space apparently available.
> 
> I merged a little patch for this into post-2.6.6, but that only addresses
> prepare_write().

Oh, sorry, I didn't see that patch.  The specific bug I was trying to
fix was actually in mkdir though (the regression test suite did the
equivalent of rm -rf /mntpt/*, followed by mkdir's which failed).

I can respin the patch versus BK-latest.

> I wonder if there's much value in having ext3_has_free_blocks()?  We could
> just retry three times even if the fs is really full?

We could; it's an error path, after all.  On the other hand, the cost
of ext3_has_free_blocks() is pretty small, and if we know that there's
no point doing the retry, why not skip the work.

> I'd be inclined to pass a pointer to the `retry' counter into
> ext3_should_retry_alloc() and implement the counting in there.  It'll tidy
> things up a bit and consolidates that piece of policy in a single place.

Yes, good point.

						- Ted

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Ext2-devel] Re: [RFC/RFT] [PATCH] EXT3: Retry allocation after journal commit
  2004-05-14  4:37   ` [Ext2-devel] " Theodore Ts'o
@ 2004-05-14  4:49     ` Andrew Morton
  2004-05-14 17:48       ` Andreas Dilger
  0 siblings, 1 reply; 11+ messages in thread
From: Andrew Morton @ 2004-05-14  4:49 UTC (permalink / raw)
  To: Theodore Ts'o; +Cc: linux-kernel, ext2-devel

"Theodore Ts'o" <tytso@mit.edu> wrote:
>
> On Thu, May 13, 2004 at 07:53:10PM -0700, Andrew Morton wrote:
> > "Theodore Ts'o" <tytso@mit.edu> wrote:
> > >
> > > It is possible for block allocation to fail, even if there is space in
> > >  the filesystem, because all of the free blocks were recently deleted and
> > >  so could not be allocated until after the currently running transaction
> > >  is committed.   This can result in a very strange and surprising result
> > >  where a system call such as a mkdir() will fail even though there is
> > >  plenty of disk space apparently available.
> > 
> > I merged a little patch for this into post-2.6.6, but that only addresses
> > prepare_write().
> 
> Oh, sorry, I didn't see that patch.

Andreas's patch is a bit sneaky: it simply sets ->h_sync on the current
transaction then does journal_stop().  I think your patch can do the same
thing?

again:
	handle = ext3_journal_start(...);

	...

	if (err == -ENOSPC && ext3_should_retry_alloc(inode, handle, &retry)) {
		goto again;
	} else {
		err2 = ext3_journal_stop(handle);
		if (!err)
			err = err2;
	}
	return err;

Something like that.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Ext2-devel] Re: [RFC/RFT] [PATCH] EXT3: Retry allocation after journal commit
  2004-05-14  4:49     ` Andrew Morton
@ 2004-05-14 17:48       ` Andreas Dilger
  2004-05-14 19:59         ` Theodore Ts'o
  0 siblings, 1 reply; 11+ messages in thread
From: Andreas Dilger @ 2004-05-14 17:48 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Theodore Ts'o, linux-kernel, ext2-devel

On May 13, 2004  21:49 -0700, Andrew Morton wrote:
> Andreas's patch is a bit sneaky: it simply sets ->h_sync on the current
> transaction then does journal_stop().  I think your patch can do the same
> thing?

Well, actually my patch just waited on the _previous_ transaction to commit
(which can be done anywhere without retrying the operation) and then set
h_sync on the _current_ transaction so that as soon as the current operations
are completed it will also be committed and the blocks released.  One can't
of course arbitrarily call journal_stop() or that breaks the transaction
atomicity.

For 99.9% of cases this should be sufficient and doesn't involve changing
the code everywhere - only in ext3_new_block().  Also, Ted's approach
of retrying the operations "outside" the transaction won't work if there
are nested journal transactions being done - those will hold the transaction
open so doing journal_stop/journal_start doesn't really accomplish anything.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Ext2-devel] Re: [RFC/RFT] [PATCH] EXT3: Retry allocation after journal commit
  2004-05-14 17:48       ` Andreas Dilger
@ 2004-05-14 19:59         ` Theodore Ts'o
  2004-05-14 21:06           ` Andreas Dilger
  0 siblings, 1 reply; 11+ messages in thread
From: Theodore Ts'o @ 2004-05-14 19:59 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel, ext2-devel

On Fri, May 14, 2004 at 11:48:37AM -0600, Andreas Dilger wrote:
> 
> Well, actually my patch just waited on the _previous_ transaction to commit
> (which can be done anywhere without retrying the operation) and then set
> h_sync on the _current_ transaction so that as soon as the current operations
> are completed it will also be committed and the blocks released.  One can't
> of course arbitrarily call journal_stop() or that breaks the transaction
> atomicity.
> 
> For 99.9% of cases this should be sufficient and doesn't involve changing
> the code everywhere - only in ext3_new_block().  

I tried that first, actually.  In the test case I was trying, it only
worked 33% of the time.  The other 66% of the time, the rm -rf all fit
into the current running transaction, and waiting on the previous
transaction wasn't sufficient to solve the problem.

> Also, Ted's approach of retrying the operations "outside" the
> transaction won't work if there are nested journal transactions
> being done - those will hold the transaction open so doing
> journal_stop/journal_start doesn't really accomplish anything.

That was why I was retrying at the top-level functions: ext3_mkdir,
for example.  There won't be a nested journal transaction there.  This
will be an issue for prepare_write(), though.  If we are in nested
transaction, we're going to have to wait for the currently committing
transaction, and hope for the best.  If that's not sufficient, we're
going to have return the ENOSPC failure.

						- Ted

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Ext2-devel] Re: [RFC/RFT] [PATCH] EXT3: Retry allocation after journal commit
  2004-05-14 19:59         ` Theodore Ts'o
@ 2004-05-14 21:06           ` Andreas Dilger
  2004-05-15 13:18             ` Theodore Ts'o
  0 siblings, 1 reply; 11+ messages in thread
From: Andreas Dilger @ 2004-05-14 21:06 UTC (permalink / raw)
  To: Theodore Ts'o, Andrew Morton, linux-kernel, ext2-devel

On May 14, 2004  15:59 -0400, Theodore Ts'o wrote:
> On Fri, May 14, 2004 at 11:48:37AM -0600, Andreas Dilger wrote:
> > Well, actually my patch just waited on the _previous_ transaction to commit
> > (which can be done anywhere without retrying the operation) and then set
> > h_sync on the _current_ transaction so that as soon as the current operations
> > are completed it will also be committed and the blocks released.  One can't
> > of course arbitrarily call journal_stop() or that breaks the transaction
> > atomicity.
> > 
> > For 99.9% of cases this should be sufficient and doesn't involve changing
> > the code everywhere - only in ext3_new_block().  
> 
> I tried that first, actually.  In the test case I was trying, it only
> worked 33% of the time.  The other 66% of the time, the rm -rf all fit
> into the current running transaction, and waiting on the previous
> transaction wasn't sufficient to solve the problem.

I guess the success ratio dependends on how many blocks are tied up in
the transaction, the size of the journal, and how much free space is
left in the filesystem.  In my tests (dd to a file that does O_TRUNC and
overwrites with the same file size) this change wasn't 100% successful
but fixed it the majority of the time.

> > Also, Ted's approach of retrying the operations "outside" the
> > transaction won't work if there are nested journal transactions
> > being done - those will hold the transaction open so doing
> > journal_stop/journal_start doesn't really accomplish anything.
> 
> That was why I was retrying at the top-level functions: ext3_mkdir,
> for example.  There won't be a nested journal transaction there.

Waiting for the currently committing transaction to complete would
deadlock Lustre, because it starts journal transactions above ext3 so
that it can write update records in the same transaction as the
filesystem operation.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Ext2-devel] Re: [RFC/RFT] [PATCH] EXT3: Retry allocation after journal commit
  2004-05-14 21:06           ` Andreas Dilger
@ 2004-05-15 13:18             ` Theodore Ts'o
  0 siblings, 0 replies; 11+ messages in thread
From: Theodore Ts'o @ 2004-05-15 13:18 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel, ext2-devel

On Fri, May 14, 2004 at 03:06:22PM -0600, Andreas Dilger wrote:
> > I tried that first, actually.  In the test case I was trying, it only
> > worked 33% of the time.  The other 66% of the time, the rm -rf all fit
> > into the current running transaction, and waiting on the previous
> > transaction wasn't sufficient to solve the problem.
> 
> I guess the success ratio dependends on how many blocks are tied up in
> the transaction, the size of the journal, and how much free space is
> left in the filesystem.  In my tests (dd to a file that does O_TRUNC and
> overwrites with the same file size) this change wasn't 100% successful
> but fixed it the majority of the time.

In my case, the filesystem was completely full, and we were doing an
"rm -rf /mntpt/*", followed by a series of mkdir to set up the test
directories.  We were failing on the mkdir approximately 2/3'rds of
the time.  

> > That was why I was retrying at the top-level functions: ext3_mkdir,
> > for example.  There won't be a nested journal transaction there.
> 
> Waiting for the currently committing transaction to complete would
> deadlock Lustre, because it starts journal transactions above ext3 so
> that it can write update records in the same transaction as the
> filesystem operation.

Right, I had forgotten about Lustre.  :-) I was only worrying about
the nested transaction case of writing to the quota file.  So what we
can do is only do the do the log_wait_commit (and retry the
transaction) if current->journal_info is NULL --- i.e., only when the
current process does not have a currently active handle.  If not, we
return -ENOSPC, and let the top-level caller (Lustre in your case)
retry the entire operation.

I think that should be sufficient to keep Lustre happy.

						- Ted

^ permalink raw reply	[flat|nested] 11+ messages in thread

* question about ext3_find_goal with reservation
  2004-05-14  2:53 ` Andrew Morton
  2004-05-14  4:37   ` [Ext2-devel] " Theodore Ts'o
@ 2004-05-19 22:04   ` Mingming Cao
  2004-05-19 22:52     ` Andrew Morton
  1 sibling, 1 reply; 11+ messages in thread
From: Mingming Cao @ 2004-05-19 22:04 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Theodore Ts'o, linux-kernel, ext2-devel

I just wondering, how the goal block being determined in ext3?

If we are doing sequential write, (the new logical block is next to the
last logical block), it's easy. Just increase the last physical block
recorded in i_next_alloc_goal and take that as the goal block for
ext3_new_block() to try to do allocation there.

If the pattern is random write, in the current implementation(with the
goal fix), the ext3_find_near() will find a goal with good locality.( I
have a hard time understand the ext3_find_near(), need some help
here...)

Well I am thinking, with the ext3 reservation changes, would it make
sense that to find the goal in the reservation window, if we are doing
random write on a inode, and we have a reservation window for that
inode? In another words, when we try to find a goal for block allocation
and the write pattern is non-sequential, if the the filesystem has
reservation turned on, before we try to find a goal somewhere else,
shouldn't we try to locate the goal inside the reservation window first?

This will avoid the problem that a reservation window for an inode being
throwed away frequently when a single process open an file doing lseek
and write frequently? (We have discussed this before.)

With reservation, we probably don't need ext3_find_near() to guide us to
find a goal block. But we still need that in the case the filesystem is
mounted without reservation on.

If all above make sense, then the goal should be the start block of the
reservation window. So when ext3_new_block() try to satisfy the goal
block, it will try to do allocation inside the reservation window
directly. This makes the goal outside of reservation almost impossible
in ext3_try_to_allocate_with_rsv(), unless it is doing sequential writes
and the goal is just next to the end of the reservation.

Any thoughts?

Mingming



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: question about ext3_find_goal with reservation
  2004-05-19 22:04   ` question about ext3_find_goal with reservation Mingming Cao
@ 2004-05-19 22:52     ` Andrew Morton
  2004-05-20  1:04       ` [Ext2-devel] " Mingming Cao
  0 siblings, 1 reply; 11+ messages in thread
From: Andrew Morton @ 2004-05-19 22:52 UTC (permalink / raw)
  To: Mingming Cao; +Cc: tytso, linux-kernel, ext2-devel

Mingming Cao <cmm@us.ibm.com> wrote:
>
> I just wondering, how the goal block being determined in ext3?
> 
> If we are doing sequential write, (the new logical block is next to the
> last logical block), it's easy. Just increase the last physical block
> recorded in i_next_alloc_goal and take that as the goal block for
> ext3_new_block() to try to do allocation there.
> 
> If the pattern is random write, in the current implementation(with the
> goal fix), the ext3_find_near() will find a goal with good locality.( I
> have a hard time understand the ext3_find_near(), need some help
> here...)

The comments in ext3_find_near() pretty well cover things.

The "colouring" implementation in there is to address some problems which
were exacerbated by the Orlov inode allocator.  When we're determining the
initial block for a new file the allocator was essentially doing first-fit
within the blockgroup.  So multiple files kept on having their blocks
jumbled up.

So the colouring essentially tries to start a new file at a random position
in its blockgroup.  But it ensures that if one task is creating a lot of
files (say, an untar), these are all nicely contiguous within the
blockgroup.  That's why the pid was used as the colour.  A reasonable
heuristic.  It improves the many-processes-creating-fiels problem, but
doesn't do much for the one-process-creates-lots-of-slowly-growing-files
problem.


> Well I am thinking, with the ext3 reservation changes, would it make
> sense that to find the goal in the reservation window, if we are doing
> random write on a inode, and we have a reservation window for that
> inode? In another words, when we try to find a goal for block allocation
> and the write pattern is non-sequential, if the the filesystem has
> reservation turned on, before we try to find a goal somewhere else,
> shouldn't we try to locate the goal inside the reservation window first?

I think that'll probabyl work OK.  If the application is seekily growing
the file the layout is awful already.

> This will avoid the problem that a reservation window for an inode being
> throwed away frequently when a single process open an file doing lseek
> and write frequently? (We have discussed this before.)

Yes, we definitely want to avoid that CPU-intensive search on every write.

> With reservation, we probably don't need ext3_find_near() to guide us to
> find a goal block. But we still need that in the case the filesystem is
> mounted without reservation on.

You might need it for the very first block allocation.  For example, if the
app opens an existing file and starts appending to it, does the current
code correctly commence allocation immediately beyond the file's final
block?

> If all above make sense, then the goal should be the start block of the
> reservation window.

Well.  It'll be some block within the reservation window - the code should
be recording how far across the reservation window we currently are.  We
don't want to do a linear search across the entire reservation window for
each block allocation attempt.

> So when ext3_new_block() try to satisfy the goal
> block, it will try to do allocation inside the reservation window
> directly. This makes the goal outside of reservation almost impossible
> in ext3_try_to_allocate_with_rsv(), unless it is doing sequential writes
> and the goal is just next to the end of the reservation.
> 
> Any thoughts?

It would be nice to separate the old allocation things (i_alloc_block,
i_alloc_goal, etc) from the new code completely.  Making one somehow
dependent upon or aware fo the other is confusing, and ultimately we'd like
to remove those fields from the inode altogether.

So maybe it's best to avoid calling ext3_find_near() and ext3_find_goal()
at all if reservations are enabled.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Ext2-devel] Re: question about ext3_find_goal with reservation
  2004-05-19 22:52     ` Andrew Morton
@ 2004-05-20  1:04       ` Mingming Cao
  0 siblings, 0 replies; 11+ messages in thread
From: Mingming Cao @ 2004-05-20  1:04 UTC (permalink / raw)
  To: Andrew Morton; +Cc: tytso, linux-kernel, ext2-devel

On Wed, 2004-05-19 at 15:52, Andrew Morton wrote:
> Mingming Cao <cmm@us.ibm.com> wrote:
> > If the pattern is random write, in the current implementation(with the
> > goal fix), the ext3_find_near() will find a goal with good locality.( I
> > have a hard time understand the ext3_find_near(), need some help
> > here...)
> 
> The comments in ext3_find_near() pretty well cover things.
Thanks.
> 
> > With reservation, we probably don't need ext3_find_near() to guide us to
> > find a goal block. But we still need that in the case the filesystem is
> > mounted without reservation on.
> 
> You might need it for the very first block allocation.  For example, if the
> app opens an existing file and starts appending to it, does the current
> code correctly commence allocation immediately beyond the file's final
> block?

Good point. I will check.
> 
> > If all above make sense, then the goal should be the start block of the
> > reservation window.
> 
> Well.  It'll be some block within the reservation window - the code should
> be recording how far across the reservation window we currently are.  We
> don't want to do a linear search across the entire reservation window for
> each block allocation attempt.
Currently we don't trace the last-allocated-block-from-reservation
window, but it's doable. It's kind of duplicated with the
i_next_alloc_goal

> It would be nice to separate the old allocation things (i_alloc_block,
> i_alloc_goal, etc) from the new code completely.  Making one somehow
> dependent upon or aware fo the other is confusing, and ultimately we'd like
> to remove those fields from the inode altogether.
> 
yes, it's nice to do so.  I will think about it.


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2004-05-21 23:46 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-05-14  0:42 [RFC/RFT] [PATCH] EXT3: Retry allocation after journal commit Theodore Ts'o
2004-05-14  2:53 ` Andrew Morton
2004-05-14  4:37   ` [Ext2-devel] " Theodore Ts'o
2004-05-14  4:49     ` Andrew Morton
2004-05-14 17:48       ` Andreas Dilger
2004-05-14 19:59         ` Theodore Ts'o
2004-05-14 21:06           ` Andreas Dilger
2004-05-15 13:18             ` Theodore Ts'o
2004-05-19 22:04   ` question about ext3_find_goal with reservation Mingming Cao
2004-05-19 22:52     ` Andrew Morton
2004-05-20  1:04       ` [Ext2-devel] " Mingming Cao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).