LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [RFC] Parallelize IO for e2fsck
       [not found] <70b6f0bf0801161322k2740a8dch6a0d6e6e112cd2d0@mail.gmail.com>
@ 2008-01-16 21:30 ` Valerie Henson
  2008-01-18  1:15   ` David Chinner
  2008-01-21 23:00   ` Andreas Dilger
  0 siblings, 2 replies; 29+ messages in thread
From: Valerie Henson @ 2008-01-16 21:30 UTC (permalink / raw)
  To: linux-fsdevel, linux-ext4, linux-kernel
  Cc: Theodore Ts'o, Andreas Dilger, Ric Wheeler

[-- Attachment #1: Type: text/plain, Size: 2634 bytes --]

Hi y'all,

This is a request for comments on the rewrite of the e2fsck IO
parallelization patches I sent out a few months ago.  The mechanism is
totally different.  Previously IO was parallelized by issuing IOs from
multiple threads; now a single thread issues fadvise(WILLNEED) and
then uses read() to complete the IO.

Single disk performance doesn't change, but elapsed time drops by
about 50% on a big RAID-5 box.  Passes 1 and 2 are parallelized.  Pass
5 is left as an exercise for the reader.

Many thanks to the Lustre folks for their fadvise readahead patch
which this patch uses and for comments and help in general.  Our good
friends at EMC Centera funded this work.

Here are the top things I'd like feedback on:

How to split up the patch?  My take:

* Indirect block only flag for iterate
* IO manager readahead/release functions
* Readahead infrastructure (readahead.c and related)
* Readahead calls for pass 1
* Readahead calls for pass 2

Killing readahead properly is hard.  I implemented it several ways and
didn't like any of them.  The current solution is still racy and
completely untested.

The whole thing needs to be autoconfed correctly.  Bah.

The user interface kinda sucks.  It should at least take arguments of
the form "128KB" or "52m" instead of number of file system blocks.
Guessing the right amount of buffer cache to use and io requests to
issue would also be good.

ext2fs_get_next_inode_ptr() - With readahead, copying the inode in
ext2fs_get_next_inode_full() costs about 2-3% of elapsed time.  This
is a hacked up version that just returns a pointer to the inode.

The patch is against e2fsprogs 1.40.4 and is attached.  Future patches
will be split up and sent via quilt.

Thanks!

-VAL

 e2fsck/Makefile.in                      |    6
 e2fsck/e2fsck.h                         |    5
 e2fsck/pass1.c                          |   36
 e2fsck/pass2.c                          |   12
 e2fsck/unix.c                           |   18
 lib/ext2fs/readahead.c                  | 1607 ++++++++++++++++++++++++++++++++
 lib/ext2fs/Makefile.in                  |    1
 lib/ext2fs/block.c                      |   20
 lib/ext2fs/dosio.c                      |    2
 lib/ext2fs/ext2_io.h                    |   10
 lib/ext2fs/ext2fs.h                     |   42
 lib/ext2fs/inode.c                      |   70 +
 lib/ext2fs/inode_io.c                   |    2
 lib/ext2fs/io_manager.c                 |   12
 lib/ext2fs/nt_io.c                      |    2
 lib/ext2fs/test_io.c                    |    2
 lib/ext2fs/unix_io.c                    |   41
 17 files changed, 1873 insertions(+), 15 deletions(-)

[-- Attachment #2: readahead --]
[-- Type: application/octet-stream, Size: 68339 bytes --]

--- e2fsprogs-1.40.4.orig/lib/ext2fs/ext2_io.h
+++ e2fsprogs-1.40.4/lib/ext2fs/ext2_io.h
@@ -63,6 +63,10 @@ struct struct_io_manager {
 	errcode_t (*set_blksize)(io_channel channel, int blksize);
 	errcode_t (*read_blk)(io_channel channel, unsigned long block,
 			      int count, void *data);
+	errcode_t (*readahead)(io_channel channel, unsigned long block,
+			       int count);
+	errcode_t (*cache_release)(io_channel channel, unsigned long block,
+				   int count);
 	errcode_t (*write_blk)(io_channel channel, unsigned long block,
 			       int count, const void *data);
 	errcode_t (*flush)(io_channel channel);
@@ -82,6 +86,8 @@ struct struct_io_manager {
 #define io_channel_close(c) 		((c)->manager->close((c)))
 #define io_channel_set_blksize(c,s)	((c)->manager->set_blksize((c),s))
 #define io_channel_read_blk(c,b,n,d)	((c)->manager->read_blk((c),b,n,d))
+#define io_channel_readahead(c,b,n)	((c)->manager->readahead((c),b,n))
+#define io_channel_cache_release(c,b,n)	((c)->manager->cache_release((c),b,n))
 #define io_channel_write_blk(c,b,n,d)	((c)->manager->write_blk((c),b,n,d))
 #define io_channel_flush(c) 		((c)->manager->flush((c)))
 #define io_channel_bumpcount(c)		((c)->refcount++)
@@ -92,6 +98,10 @@ extern errcode_t io_channel_set_options(
 extern errcode_t io_channel_write_byte(io_channel channel, 
 				       unsigned long offset,
 				       int count, const void *data);
+extern errcode_t readahead_noop(io_channel channel, unsigned long block,
+				int count);
+extern errcode_t cache_release_noop(io_channel channel, unsigned long block,
+				    int count);
 
 /* unix_io.c */
 extern io_manager unix_io_manager;
--- e2fsprogs-1.40.4.orig/lib/ext2fs/unix_io.c
+++ e2fsprogs-1.40.4/lib/ext2fs/unix_io.c
@@ -17,6 +17,10 @@
 
 #define _LARGEFILE_SOURCE
 #define _LARGEFILE64_SOURCE
+/* Below needed for fadvise */
+#define _XOPEN_SOURCE 600
+/* Without this, fadvise goes 32 bit? */
+#define _FILE_OFFSET_BITS 64
 
 #include <stdio.h>
 #include <string.h>
@@ -77,6 +81,10 @@ static errcode_t unix_close(io_channel c
 static errcode_t unix_set_blksize(io_channel channel, int blksize);
 static errcode_t unix_read_blk(io_channel channel, unsigned long block,
 			       int count, void *data);
+static errcode_t unix_readahead(io_channel channel, unsigned long block,
+				int count);
+static errcode_t unix_cache_release(io_channel channel, unsigned long block,
+				    int count);
 static errcode_t unix_write_blk(io_channel channel, unsigned long block,
 				int count, const void *data);
 static errcode_t unix_flush(io_channel channel);
@@ -103,6 +111,8 @@ static struct struct_io_manager struct_u
 	unix_close,
 	unix_set_blksize,
 	unix_read_blk,
+	unix_readahead,
+	unix_cache_release,
 	unix_write_blk,
 	unix_flush,
 #ifdef NEED_BOUNCE_BUFFER
@@ -587,6 +597,37 @@ static errcode_t unix_read_blk(io_channe
 #endif /* NO_IO_CACHE */
 }
 
+static errcode_t unix_readahead(io_channel channel, unsigned long block,
+				int count)
+{
+	struct unix_private_data *data;
+
+	data = (struct unix_private_data *)channel->private_data;
+#if DEBUG
+	printf(" RA: %s: readahead %lu, %d blocks\n", __FUNCTION__,
+	       block, count);
+#endif
+	posix_fadvise(data->dev, (ext2_loff_t)block * channel->block_size,
+		      (ext2_loff_t)count * channel->block_size,
+		      POSIX_FADV_WILLNEED);
+	return 0;
+}
+
+static errcode_t unix_cache_release(io_channel channel, unsigned long block,
+				    int count)
+{
+	struct unix_private_data *data;
+
+	data = (struct unix_private_data *)channel->private_data;
+#if DEBUG
+	printf("MT: %s: release %lu, %d blocks\n", __FUNCTION__, block, count);
+#endif
+	posix_fadvise(data->dev, (ext2_loff_t)block * channel->block_size,
+		      (ext2_loff_t)count * channel->block_size,
+		      POSIX_FADV_DONTNEED);
+	return 0;
+}
+
 static errcode_t unix_write_blk(io_channel channel, unsigned long block,
 				int count, const void *buf)
 {
--- e2fsprogs-1.40.4.orig/lib/ext2fs/inode_io.c
+++ e2fsprogs-1.40.4/lib/ext2fs/inode_io.c
@@ -64,6 +64,8 @@ static struct struct_io_manager struct_i
 	inode_close,
 	inode_set_blksize,
 	inode_read_blk,
+	readahead_noop,
+	cache_release_noop,
 	inode_write_blk,
 	inode_flush,
 	inode_write_byte
--- e2fsprogs-1.40.4.orig/lib/ext2fs/dosio.c
+++ e2fsprogs-1.40.4/lib/ext2fs/dosio.c
@@ -64,6 +64,8 @@ static struct struct_io_manager struct_d
         dos_close,
         dos_set_blksize,
         dos_read_blk,
+        readahead_noop,
+        cache_release_noop,
         dos_write_blk,
         dos_flush
 };
--- e2fsprogs-1.40.4.orig/lib/ext2fs/nt_io.c
+++ e2fsprogs-1.40.4/lib/ext2fs/nt_io.c
@@ -236,6 +236,8 @@ static struct struct_io_manager struct_n
 	nt_close,
 	nt_set_blksize,
 	nt_read_blk,
+	readahead_noop,
+	cache_release_noop,
 	nt_write_blk,
 	nt_flush
 };
--- e2fsprogs-1.40.4.orig/lib/ext2fs/test_io.c
+++ e2fsprogs-1.40.4/lib/ext2fs/test_io.c
@@ -74,6 +74,8 @@ static struct struct_io_manager struct_t
 	test_close,
 	test_set_blksize,
 	test_read_blk,
+	readahead_noop,
+	cache_release_noop,
 	test_write_blk,
 	test_flush,
 	test_write_byte,
--- e2fsprogs-1.40.4.orig/lib/ext2fs/io_manager.c
+++ e2fsprogs-1.40.4/lib/ext2fs/io_manager.c
@@ -67,3 +67,15 @@ errcode_t io_channel_write_byte(io_chann
 
 	return EXT2_ET_UNIMPLEMENTED;
 }
+
+errcode_t readahead_noop(io_channel channel, unsigned long block,
+			 int count)
+{
+	return 0;
+}
+
+errcode_t cache_release_noop(io_channel channel, unsigned long block,
+			 int count)
+{
+	return 0;
+}
--- e2fsprogs-1.40.4.orig/e2fsck/Makefile.in
+++ e2fsprogs-1.40.4/e2fsck/Makefile.in
@@ -119,16 +119,16 @@ e2fsck: e2fsck.@E2FSCK_TYPE@
 e2fsck.static: $(OBJS)  $(STATIC_DEPLIBS)
 	@echo "	LD $@"
 	@$(LD) $(ALL_LDFLAGS) $(LDFLAG_STATIC) -o e2fsck.static $(OBJS) \
-		$(STATIC_LIBS) 
+		$(STATIC_LIBS) -lpthread
 
 e2fsck.shared: $(OBJS)  $(DEPLIBS)
 	@echo "	LD $@"
-	@$(LD) $(ALL_LDFLAGS) -o e2fsck.shared $(OBJS) $(LIBS) 
+	@$(LD) $(ALL_LDFLAGS) -o e2fsck.shared $(OBJS) $(LIBS)  -lpthread
 
 e2fsck.profiled: $(PROFILED_OBJS)  $(PROFILED_DEPLIBS)
 	@echo "	LD $@"
 	@$(LD) $(ALL_LDFLAGS) -g -pg -o e2fsck.profiled $(PROFILED_OBJS) \
-		$(PROFILED_LIBS) 
+		$(PROFILED_LIBS)  -lpthread
 
 tst_refcount: ea_refcount.c
 	@echo "	LD $@"
--- e2fsprogs-1.40.4.orig/e2fsck/e2fsck.h
+++ e2fsprogs-1.40.4/e2fsck/e2fsck.h
@@ -334,6 +334,11 @@ struct e2fsck_struct {
 	profile_t	profile;
 	int blocks_per_page;
 
+ 	/*
+ 	 * Readahead state
+ 	 */
+ 	struct readahead_state *ra;
+
 	/*
 	 * For the use of callers of the e2fsck functions; not used by
 	 * e2fsck functions themselves.
--- e2fsprogs-1.40.4.orig/e2fsck/pass1.c
+++ e2fsprogs-1.40.4/e2fsck/pass1.c
@@ -463,6 +463,25 @@ extern void e2fsck_setup_tdb_icount(e2fs
 		*ret = 0;
 }
 
+/*
+ * Should we submit this inode for readahead?
+ */
+
+static int readahead_this_inode(struct ext2_inode *inode)
+{
+	/* Does this inode have valid blocks (i.e., not a fast symlink)? */
+	if (ext2fs_inode_has_valid_blocks(inode) &&
+	    /* Is it a directory or regular file? */
+	    (LINUX_S_ISDIR(inode->i_mode) ||
+	     LINUX_S_ISREG(inode->i_mode)) &&
+	    /* And does it have indirect blocks? */
+	    (inode->i_block[EXT2_IND_BLOCK] ||
+	     inode->i_block[EXT2_DIND_BLOCK] ||
+	     inode->i_block[EXT2_TIND_BLOCK]))
+		return 1;
+	return 0;
+}
+
 void e2fsck_pass1(e2fsck_t ctx)
 {
 	int	i;
@@ -483,7 +502,7 @@ void e2fsck_pass1(e2fsck_t ctx)
 	int		imagic_fs;
 	int		busted_fs_time = 0;
 	int		inode_size;
-	
+
 #ifdef RESOURCE_TRACK
 	init_resource_track(&rtrack);
 #endif
@@ -498,6 +517,8 @@ void e2fsck_pass1(e2fsck_t ctx)
 			ctx->dirs_to_hash = 0;
 	}
 
+	ext2fs_readahead_inode_init(ctx->ra, readahead_this_inode);
+
 #ifdef MTRACE
 	mtrace_print("Pass 1");
 #endif
@@ -619,6 +640,8 @@ void e2fsck_pass1(e2fsck_t ctx)
 	    (fs->super->s_mtime < fs->super->s_inodes_count))
 		busted_fs_time = 1;
 
+	ext2fs_readahead_inode_start(ctx->ra);
+
 	while (1) {
 		old_op = ehandler_operation(_("getting next inode from scan"));
 		pctx.errcode = ext2fs_get_next_inode_full(scan, &ino, 
@@ -687,6 +710,8 @@ void e2fsck_pass1(e2fsck_t ctx)
 			}
 			ext2fs_mark_inode_bitmap(ctx->inode_used_map, ino);
 			clear_problem_context(&pctx);
+			/* Inodes are normally released in check_blocks except for this one */
+			ext2fs_readahead_inode_release(ctx->ra, fs, ino, block_buf);
 			continue;
 		} else if (ino == EXT2_ROOT_INO) {
 			/*
@@ -935,6 +960,7 @@ void e2fsck_pass1(e2fsck_t ctx)
 	}
 	process_inodes(ctx, block_buf);
 	ext2fs_close_inode_scan(scan);
+	ext2fs_readahead_inode_exit(ctx->ra);
 
 	/*
 	 * If any extended attribute blocks' reference counts need to
@@ -1032,6 +1058,8 @@ static errcode_t scan_callback(ext2_fils
 	scan_struct = (struct scan_callback_struct *) priv_data;
 	ctx = scan_struct->ctx;
 	
+	ext2fs_readahead_inode_table_release(ctx->ra, fs, group);
+
 	process_inodes((e2fsck_t) fs->priv_data, scan_struct->block_buf);
 
 	if (ctx->progress)
@@ -1515,10 +1543,14 @@ static void check_blocks(e2fsck_t ctx, s
 	if (inode->i_file_acl && check_ext_attr(ctx, pctx, block_buf))
 		pb.num_blocks++;
 
-	if (ext2fs_inode_has_valid_blocks(inode))
+	if (ext2fs_inode_has_valid_blocks(inode)) {
 		pctx->errcode = ext2fs_block_iterate2(fs, ino,
 				       pb.is_dir ? BLOCK_FLAG_HOLE : 0,
 				       block_buf, process_block, &pb);
+		/* Release it if we read it in the first place */
+		if (readahead_this_inode(inode))
+			ext2fs_readahead_inode_release(ctx->ra, fs, ino, block_buf);
+	}
 	end_problem_latch(ctx, PR_LATCH_BLOCK);
 	end_problem_latch(ctx, PR_LATCH_TOOBIG);
 	if (ctx->flags & E2F_FLAG_SIGNAL_MASK)
--- e2fsprogs-1.40.4.orig/e2fsck/unix.c
+++ e2fsprogs-1.40.4/e2fsck/unix.c
@@ -57,6 +57,8 @@ static int normalize_swapfs;
 static int cflag;		/* check disk */
 static int show_version_only;
 static int verbose;
+static unsigned int max_ios;
+static unsigned int cache_max;
 
 static int replace_bad_blocks;
 static int keep_bad_blocks;
@@ -74,6 +76,7 @@ static void usage(e2fsck_t ctx)
 		_("Usage: %s [-panyrcdfvstDFSV] [-b superblock] [-B blocksize]\n"
 		"\t\t[-I inode_buffer_blocks] [-P process_inode_size]\n"
 		"\t\t[-l|-L bad_blocks_file] [-C fd] [-j external_journal]\n"
+		"\t\t[-A maximum_concurrent_ios] [-m maximum_memory]\n"
 		"\t\t[-E extended-options] device\n"),
 		ctx->program_name);
 
@@ -619,8 +622,11 @@ static errcode_t PRS(int argc, char *arg
 		ctx->program_name = *argv;
 	else
 		ctx->program_name = "e2fsck";
-	while ((c = getopt (argc, argv, "panyrcC:B:dE:fvtFVM:b:I:j:P:l:L:N:SsDk")) != EOF)
+	while ((c = getopt (argc, argv, "paA:nyrcC:B:dE:fvtFVm:M:b:I:j:P:l:L:N:SsDk")) != EOF)
 		switch (c) {
+		case 'A':
+			max_ios = atoi(optarg);
+			break;
 		case 'C':
 			ctx->progress = e2fsck_update_progress;
 			res = sscanf(optarg, "%d", &ctx->progress_fd);
@@ -727,6 +733,10 @@ static errcode_t PRS(int argc, char *arg
 		case 'V':
 			show_version_only = 1;
 			break;
+		case 'm':
+			/* XXX should handle 3M and 54kb type stuff */
+			cache_max = atoi(optarg);
+			break;
 #ifdef MTRACE
 		case 'M':
 			mallwatch = (void *) strtol(optarg, NULL, 0);
@@ -1244,7 +1254,13 @@ restart:
 	else
 		journal_size = -1;
 
+	if (ext2fs_readahead_init(ctx->fs, max_ios, cache_max, &ctx->ra))
+		printf(_("Readahead failed to start, disabled.\n"));
+
 	run_result = e2fsck_run(ctx);
+
+	ext2fs_readahead_exit(&ctx->ra);
+
 	e2fsck_clear_progbar(ctx);
 
 	if (ctx->flags & E2F_FLAG_JOURNAL_INODE) {
--- e2fsprogs-1.40.4.orig/lib/ext2fs/Makefile.in
+++ e2fsprogs-1.40.4/lib/ext2fs/Makefile.in
@@ -59,6 +59,7 @@ OBJS= $(DEBUGFS_LIB_OBJS) $(RESIZE_LIB_O
 	openfs.o \
 	read_bb.o \
 	read_bb_file.o \
+	readahead.o \
 	res_gdt.o \
 	rw_bitmaps.o \
 	swapfs.o \
--- e2fsprogs-1.40.4.orig/lib/ext2fs/block.c
+++ e2fsprogs-1.40.4/lib/ext2fs/block.c
@@ -59,6 +59,9 @@ static int block_iterate_ind(blk_t *ind_
 		ret |= BLOCK_ERROR;
 		return ret;
 	}
+	if (ctx->flags & BLOCK_FLAG_IND_ONLY)
+		return ret;
+
 	ctx->errcode = ext2fs_read_ind_block(ctx->fs, *ind_block, 
 					     ctx->ind_buf);
 	if (ctx->errcode) {
@@ -259,7 +262,6 @@ static int block_iterate_tind(blk_t *tin
 		ret |= (*ctx->func)(ctx->fs, tind_block,
 				    BLOCK_COUNT_TIND, ref_block,
 				    ref_offset, ctx->priv_data);
-	
 	return ret;
 }
 	
@@ -342,14 +344,17 @@ errcode_t ext2fs_block_iterate2(ext2_fil
 	/*
 	 * Iterate over normal data blocks
 	 */
-	for (i = 0; i < EXT2_NDIR_BLOCKS ; i++, ctx.bcount++) {
-		if (blocks[i] || (flags & BLOCK_FLAG_APPEND)) {
-			ret |= (*ctx.func)(fs, &blocks[i],
-					    ctx.bcount, 0, i, priv_data);
-			if (ret & BLOCK_ABORT)
-				goto abort_exit;
+	if (!(flags & BLOCK_FLAG_IND_ONLY)) {
+		for (i = 0; i < EXT2_NDIR_BLOCKS ; i++, ctx.bcount++) {
+			if (blocks[i] || (flags & BLOCK_FLAG_APPEND)) {
+				ret |= (*ctx.func)(fs, &blocks[i],
+						   ctx.bcount, 0, i, priv_data);
+				if (ret & BLOCK_ABORT)
+					goto abort_exit;
+			}
 		}
 	}
+
 	if (*(blocks + EXT2_IND_BLOCK) || (flags & BLOCK_FLAG_APPEND)) {
 		ret |= block_iterate_ind(blocks + EXT2_IND_BLOCK,
 					 0, EXT2_IND_BLOCK, &ctx);
@@ -434,4 +439,3 @@ errcode_t ext2fs_block_iterate(ext2_fils
 	return ext2fs_block_iterate2(fs, ino, BLOCK_FLAG_NO_LARGE | flags,
 				     block_buf, xlate_func, &xl);
 }
-
--- e2fsprogs-1.40.4.orig/lib/ext2fs/ext2fs.h
+++ e2fsprogs-1.40.4/lib/ext2fs/ext2fs.h
@@ -278,6 +278,9 @@ struct struct_ext2_filsys {
  * BLOCK_FLAG_DATA_ONLY indicates that the iterator function should be
  * called for data blocks only.
  *
+ * BLOCK_FLAG_IND_ONLY is the opposite of BLOCK_FLAG_DATA_ONLY - only
+ * call the iterate function for indirect blocks.
+ *
  * BLOCK_FLAG_NO_LARGE is for internal use only.  It informs
  * ext2fs_block_iterate2 that large files won't be accepted.
  */
@@ -285,6 +288,7 @@ struct struct_ext2_filsys {
 #define BLOCK_FLAG_HOLE		1
 #define BLOCK_FLAG_DEPTH_TRAVERSE	2
 #define BLOCK_FLAG_DATA_ONLY	4
+#define BLOCK_FLAG_IND_ONLY	8
 
 #define BLOCK_FLAG_NO_LARGE	0x1000
 
@@ -808,6 +812,8 @@ extern errcode_t ext2fs_get_next_inode_f
 					    ext2_ino_t *ino,
 					    struct ext2_inode *inode, 
 					    int bufsize);
+errcode_t ext2fs_get_next_inode_ptr(ext2_inode_scan scan, ext2_ino_t *ino,
+				    struct ext2_inode **inode);
 extern errcode_t ext2fs_open_inode_scan(ext2_filsys fs, int buffer_blocks,
 				  ext2_inode_scan *ret_scan);
 extern void ext2fs_close_inode_scan(ext2_inode_scan scan);
@@ -907,6 +913,42 @@ errcode_t ext2fs_link(ext2_filsys fs, ex
 errcode_t ext2fs_unlink(ext2_filsys fs, ext2_ino_t dir, const char *name,
 			ext2_ino_t ino, int flags);
 
+/* readahead.c */
+
+struct readahead_state;
+
+/* Generic readahead start/stop functions */
+
+errcode_t ext2fs_readahead_init(ext2_filsys fs, unsigned int max_ios,
+				unsigned int max_mem,
+				struct readahead_state **ret_ra);
+void ext2fs_readahead_exit(struct readahead_state **ret_ra);
+void ext2fs_readahead_kill(struct readahead_state *ra);
+
+/* Inode readahead (pass 1) */
+
+void ext2fs_readahead_inode_init(struct readahead_state *ra,
+				 int (*should_read_inode)(struct ext2_inode *));
+void ext2fs_readahead_inode_start(struct readahead_state *ra);
+void ext2fs_readahead_inode_exit(struct readahead_state *ra);
+void ext2fs_readahead_inode_release(struct readahead_state *ra, ext2_filsys fs,
+				    ext2_ino_t ino, char *block_buf);
+void ext2fs_readahead_inode_table_release(struct readahead_state *ra,
+					  ext2_filsys fs, dgrp_t group);
+/* Directory block readahead (pass 2) */
+
+void ext2fs_readahead_dblist_init(struct readahead_state *ra,
+				  ext2_dblist dblist);
+void ext2fs_readahead_dblist_start(struct readahead_state *ra);
+void ext2fs_readahead_dblist_exit(struct readahead_state *ra);
+void ext2fs_readahead_dblist_release(struct readahead_state *ra,
+				     ext2_filsys fs, blk_t blk);
+
+/* Private to readahead implementation */
+
+void * readahead_inode_loop(void *arg);
+void * readahead_dblist(void *arg);
+
 /* read_bb.c */
 extern errcode_t ext2fs_read_bb_inode(ext2_filsys fs,
 				      ext2_badblocks_list *bb_list);
--- /dev/null
+++ e2fsprogs-1.40.4/lib/ext2fs/readahead.c
@@ -0,0 +1,1607 @@
+/*
+ * readahead.c --- Fast, parallel readahead of file system structures
+ *
+ * Copyright (C) 2007-2008 Valerie Henson <val@nmt.edu>
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ * %End-Header%
+ *
+ * Generic readahead functions for users of ext2 lib.
+ *
+ * The general theory of readahead is that the main thread (fsck in
+ * most cases) spawns a second thread, the readahead thread, to go
+ * preread the file system structures.  The readahead thread relies on
+ * the buffer cache to store the blocks it has pre-read - it's sort of
+ * managing the buffer cache blindfolded.  Similarly, the readahead
+ * thread also keeps track of how many ios it has submittted to the
+ * device which have not yet completed, so it can avoid overflowing
+ * the device io queue.  This is also done without any real
+ * communication with the actual device queue; we're just guessing for
+ * the most part.
+ *
+ * This file is divided into calls made by the readahead thread and
+ * calls made by the main thread.
+ *
+ * Readahead is turned off by default, as it gives minimal or zero
+ * performance improvement on single disk systems, which make up the
+ * majority of file systems in Linux.
+ *
+ * Cache management:
+ *
+ * The total amount of cache to use is set by a command-line option.
+ * This should be a little bit smaller than the maximum memory
+ * available for buffer cache - i.e., not all of free memory, since
+ * buffer cache is usually limited to some fraction of all memory.
+ * Setting the maximum cache to greater than the buffer cache will
+ * approximately double elapsed time, since all of the file system
+ * data will be read from disk twice - once by readahead, and after it
+ * has been evicted by other pre-read blocks, a second time by the
+ * main thread.  This is bad.  Currently, I figure out what the buffer
+ * cache maximum is by running vmstat and reading the "buff" field.
+ *
+ * The *_readahead functions are called by the readahead thread and
+ * queue up ios to be pre-read.  When the io is actually issued and
+ * completed, the readahead thread increases the number of blocks
+ * available in cache.  The main thread (the client) calls the
+ * ext2fs_readahead_*_release functions (in readahead_client.c),
+ * reducing the number of blocks marked as in the cache and marking
+ * them as WONTNEED using fadvise (probably unnecessary and a no-op,
+ * as normal block aging should evict them from cache as new blocks
+ * are read).
+ *
+ * The threads sleep and wake according to the size of the cache, with
+ * the readahead thread as the producer and the main thread as the
+ * consumer.  The readahead thread wakes the main thread whenever a
+ * significant amount of cache is available, and wakes and waits on
+ * the main thread if the cache is full.  The main thread wakes the
+ * readahead thread whenever a significant amount of cache becomes
+ * free, and wakes and waits on the readahead thread whenever the
+ * cache becomes empty.  The threads only signal each other when a
+ * significant amount of cache becomes free or available because
+ * otherwise the threads can get locked in a tight wake/sleep cycle in
+ * which the readahead thread never has the opportunity to issue
+ * multiple ios in parallel.  Another small modification is that the
+ * readahead thread records how much cache it needs to complete the
+ * next io before it goes to sleep, so the main thread won't wake it
+ * unless there is enough cache to satisfy the next io.
+ *
+ * One major pitfall of this approach is that it requires very careful
+ * accounting of blocks read.  The main thread must call the release
+ * function for every block that the readahead thread adds to the
+ * cache.  In particular, the main thread must release the indirect
+ * blocks for every inode that the readahead thread walks.  The main
+ * thread may not want to readahead every inode in the file system, so
+ * the interface allows the client to pass a function used to decide
+ * which inodes to readahead.
+ *
+ * Note that the main thread is allowed to pull slightly ahead of the
+ * readahead thread (and allow the cache used accounting to go
+ * negative) because it's hard to predict how many blocks the main
+ * thread will read when it's checking an inode.  Checking before
+ * every block read would be a lot of overhead, so checking and
+ * blocking if necessary is done when the main thread is entirely done
+ * with an inode.
+ *
+ * We try to keep the granularity of adding or releasing things to the
+ * cache large, so that we don't grab locks and send signals and
+ * switch threads for every block that enters or leaves the cache.
+ *
+ * Asynchronous IO implementation:
+ *
+ * Asynchronous, parallel IO is implemented using primitives that are
+ * available, well-tested, and actually do something in most deployed
+ * Linux systems:
+ *
+ * Start io: posix_fadvise (WILLNEED)
+ * Complete io: read() into scratch buffer
+ *
+ * Amazingly, this works - and it goes fast.  At this point in time,
+ * the only other serious alternative is to spawn multiple threads and
+ * do synchronous ios from the thread.  No other forms of aio for
+ * Linux satisfy the above three criteria; in particular, POSIX aio is
+ * implemented as synchronous IO on most systems.
+ *
+ * Device queue management:
+ *
+ * As with the cache, we assume we have pretty much exclusive use of
+ * the device containing the file system, and try to manage the queue
+ * blind.  The io queue size is currently passed in from the user, but
+ * could be found programmatically with some effort.  Ios are
+ * considered outstanding after we start readahead until we read the
+ * actual data requested.  We try to collect at least max_ios before
+ * sorting and issuing them, to get optimal parallelization and head
+ * movement.
+ *
+ * This code replicates some portion of the in-kernel device queue
+ * management - queue plugging, merging, sorting, etc.  Why not just
+ * let the kernel handle it?  The answer is that we can predict what
+ * blocks will be needed, and we know about block dependencies.  The
+ * block layer has to guess what will happen next, but we have more
+ * information than we can pass down through the system call
+ * interface.  That said, it's certainly possible that the performance
+ * difference will be negligible on some systems.
+ *
+ * Thread management:
+ *
+ * Each pass of readahead (inode, direct block, etc.) creates and
+ * destroys its own thread.  The thread isn't created and doesn't
+ * start until the *_start() function is called, so the main thread
+ * controls when readahead starts working.  Some generic helper
+ * functions are provided; the per-pass initialization only has to
+ * initialize its own private variables.
+ *
+ * TODO:
+ *
+ * - acls and xattrs are ignored, but should also be read.
+ * - The user interface is less than stellar, esp. for maximum cache memory
+ * - autoconf the thread stuff and all of readahead
+ * - Implement main thread timeout for safety
+ * - Replace dummy block hack to keep main thread from hanging
+ * - Use pthread_cleanup handlers when holding locks to prevent
+ *   deadlock in the case of the thread dying while holding a lock
+ * - mincore() instead of read()?
+ * - parallelize inode table readahead
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <pthread.h>
+#include <error.h>
+#include <errno.h>
+
+#include "ext2_fs.h"
+#include "ext2fs.h"
+#include "ext2fsP.h"
+
+/* XXX should use ext2 debug routines? */
+
+/* #define EXT2FS_READAHEAD_DEBUG */
+
+#ifdef EXT2FS_READAHEAD_DEBUG
+#define dprintf			printf
+#else
+#define dprintf(args...)	do { } while (0)
+#endif
+
+struct io_desc {
+	blk_t	blk;
+	unsigned int count;
+};
+
+struct readahead_state {
+	ext2_filsys	fs;
+	int		enabled;
+
+	/*
+	 * Pass-independent IO and cache management.
+	 */
+
+	/* Maximum number of simultaneous ios disk can handle */
+	unsigned int	max_ios;
+	/* Maximum io size - to prevent deadlocks waiting for cache */
+	unsigned int	max_io_size;
+	/* Kick off outstanding ios after this many msecs idle */
+	unsigned int	timeout;
+	/* Should be at least max_ios in size */
+	unsigned int	io_queue_size;
+	/* Blocks to read queue */
+	struct io_desc	*io_queue;
+	/* Ios in the queue */
+	unsigned int	ios_queued;
+	/* Blocks in the queue (but not issued) */
+	unsigned int	blocks_queued;
+	/* All cache size variables are in units of file system blocks */
+	/* Maximum buffer cache to use */
+	int		cache_max;
+	/* Buffer cache used by pre-read blocks */
+	int		cache_used;
+	/* Cache reserved for in-progress readaheads */
+	int		cache_pending;
+	/* Cache blocks needed for waiting readahead thread to progress */
+	int		cache_needed;
+	/* Wake readahead when cache used gets this low */
+	unsigned int	cache_restart_ra;
+	/* Wake main thread ahead when cache used gets this high */
+	unsigned int	cache_restart_main;
+	/* Protects shared cache variables */
+	pthread_mutex_t	cache_mutex;
+	/* Condition variable for thread wake/sleep */
+	pthread_cond_t	cache_wake;
+	/* Is readahead done? Then don't wait on zero cache */
+	int		cache_readahead_done;
+	/*
+	 * Scratch buffer for reading a single throwaway block.  Never
+	 * ever read the contents.
+	 */
+	char		*scratch_buf;
+	/* Readahead thread struct; only need one of them */
+	pthread_t		thread;
+	/* Pass-specific exit handler */
+	void		(*thread_exit)(struct readahead_state *);
+	/* Should we exit now? Check frequently */
+	int		should_exit;
+
+	/*
+	 * Inode readahead related state
+	 */
+
+	/*
+	 * Indirect blocks are traversed recursively and need one
+	 * block buffer for double and triple blocks (singles use the
+	 * scratch_buf).
+	 */
+	char		*double_buf;
+	char		*triple_buf;
+	/* We do our own internal inode scan */
+	ext2_inode_scan scan;
+	/* Caller provided function to decide which inodes to read ahead */
+	int		(*should_read_inode)(struct ext2_inode *);
+
+	/*
+	 * Directory block readahead related state
+	 */
+
+	/* List of directory blocks */
+	ext2_dblist dblist;
+};
+
+static void readahead_test_exit(struct readahead_state *ra);
+static void readhead_thread_exit_now(struct readahead_state *ra);
+
+/*
+ * Cache management routines.
+ */
+
+/*
+ * Sanity check cache variables and kill readahead if things look
+ * wonky.  Caller must not hold the cache mutex.
+ */
+
+static void check_cache(struct readahead_state *ra)
+{
+	if (ra->cache_used + ra->cache_pending > ra->cache_max) {
+		fprintf(stderr, "Bug! cache_used (%d) + cache_pending (%d) is "
+			"greater than cache_max (%d)\n",
+			ra->cache_used, ra->cache_pending, ra->cache_max);
+#ifdef EXT2FS_READAHEAD_DEBUG
+		abort();
+#endif
+		readhead_thread_exit_now(ra);
+	}
+}
+
+/*
+ * Do we have room in the cache for this io?
+ */
+
+static int cache_full(struct readahead_state *ra, unsigned int blks)
+{
+	/* No lock needed - main thread can only decrease cache_used */
+	if (ra->cache_used + ra->cache_pending + blks > ra->cache_max) {
+		dprintf(" RA: %s: cache_used %d cache_pending %d blks %u\n",
+			__FUNCTION__, ra->cache_used, ra->cache_pending, blks);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Wait until cache becomes available.
+ */
+
+static void wait_for_cache(struct readahead_state *ra, unsigned int blks)
+{
+	check_cache(ra);
+	pthread_mutex_lock(&ra->cache_mutex);
+	/*
+	 * The main thread could have freed up cache since the call to
+	 * cache_full(), check to see if we already have space
+	 * available.
+	 */
+	if (ra->cache_used + ra->cache_pending + blks <= ra->cache_max) {
+		pthread_mutex_unlock(&ra->cache_mutex);
+		return;
+	}
+	/* Guess not.  Wait for it. */
+	ra->cache_needed = blks;
+	dprintf(" RA: %s: Need cache, waking main thread, used %d needed %d\n",
+		__FUNCTION__, ra->cache_used, ra->cache_needed);
+	pthread_cond_signal(&ra->cache_wake);
+	dprintf(" RA: %s: Waiting for main thread to free cache...\n",
+		__FUNCTION__);
+	pthread_cond_wait(&ra->cache_wake, &ra->cache_mutex);
+	/* cache_needed is reset by the main thread */
+	pthread_mutex_unlock(&ra->cache_mutex);
+	/* See if we were actually woken by request to exit */
+	readahead_test_exit(ra);
+}
+
+/*
+ * Track blocks read into buffer cache and ready for the main thread
+ * to use.
+ */
+
+static void blocks_ready(struct readahead_state *ra, unsigned int count)
+{
+	check_cache(ra);
+	pthread_mutex_lock(&ra->cache_mutex);
+	dprintf(" RA: %s: cache_pending %d cache_used %d count %u\n",
+		__FUNCTION__, ra->cache_pending, ra->cache_used, count);
+	ra->cache_pending -= count;
+	ra->cache_used += count;
+	/* Wake main thread if we crossed the boundary */
+	if ((ra->cache_used - count < ra->cache_restart_main) &&
+	    (ra->cache_used >= ra->cache_restart_main)) {
+		dprintf(" RA: %s: Cache filling up, waking main thread "
+			"(used %d)\n", __FUNCTION__, ra->cache_used);
+		pthread_cond_signal(&ra->cache_wake);
+	}
+	pthread_mutex_unlock(&ra->cache_mutex);
+}
+
+/*
+ * IO queue management routines.
+ */
+
+#ifdef EXT2FS_READAHEAD_DEBUG
+
+/*
+ * Sanity check the io queue.
+ */
+
+static void check_queue(struct readahead_state *ra)
+{
+	struct io_desc *io;
+	int i;
+
+	if (ra->ios_queued > ra->io_queue_size) {
+		fprintf(stderr, "Bug!  IOs queued %u > IO queue size %u\n",
+			ra->ios_queued, ra->io_queue_size);
+		goto out;
+	}
+
+	for (i = 0; i < ra->io_queue_size; i++) {
+		io = &ra->io_queue[i];
+		/* Only print non-zero ios for brevity */
+		if (io->blk)
+			dprintf("  io[%d] blk %u count %u\n", i, io->blk,
+			       io->count);
+		if ((io->blk || io->count) &&
+		    (i >= ra->ios_queued)) {
+			fprintf(stderr, "Bug!  io[%d] lost! "
+				"(blk %u count %u)\n", i, io->blk, io->count);
+			goto out;
+		}
+	}
+
+	return;
+
+ out:
+#ifdef EXT2FS_READAHEAD_DEBUG
+	abort();
+#endif
+	readhead_thread_exit_now(ra);
+}
+#else
+#define check_queue(x) do { } while(0);
+#endif
+
+/*
+ * Compare function for sorting the io queue.  We sort unused (zero)
+ * ios created by merge_ios() to the end.
+ */
+
+static EXT2_QSORT_TYPE io_compare(const void *a, const void *b)
+{
+	struct io_desc *io_a = (struct io_desc *) a;
+	struct io_desc *io_b = (struct io_desc *) b;
+
+	if ((io_a->blk == 0) &&
+	    (io_b->blk == 0))
+		/* Don't care */
+		return 0;
+	/*
+	 * Make zeroes bigger than everything else so they move to the
+	 * end of the queue.
+	 */
+	if (io_a->blk == 0)
+		return 1;
+
+	if (io_b->blk == 0)
+		return -1;
+
+	return io_a->blk - io_b->blk;
+}
+
+/*
+ * Merge ios if they are contiguous while respecting maximum IO size.
+ */
+
+static void merge_ios(struct readahead_state *ra)
+{
+	/* merge_ios not to be called with less than one io */
+	unsigned int new_ios = 1;
+	struct io_desc *last_io;
+	struct io_desc *new_io;
+	blk_t last_blk;
+	blk_t new_last_blk;
+	int i;
+
+	qsort(ra->io_queue, ra->ios_queued, sizeof(ra->io_queue[0]),
+	      io_compare);
+
+	/* Set last_io to the first io... */
+	last_io = &ra->io_queue[0];
+	new_ios = 1;
+	/* Then start merging at second io */
+	for (i = 1; i < ra->ios_queued; i++) {
+		dprintf(" RA: %s: io[%d]\n", __FUNCTION__, i);
+		new_io = &ra->io_queue[i];
+		last_blk = last_io->blk + last_io->count;
+		/*
+		 * Merge ios if both (1) the start of the new io is <=
+		 * to the end of the last io, and (2) the new io would
+		 * not be bigger than the maximum allowed io.
+		 */
+		if ((new_io->blk <= last_blk) &&
+		    ((new_io->count + last_io->count) <= ra->max_io_size)) {
+			dprintf(" RA: %s: io merged! [%u,%u] + [%u,%u] = ",
+				__FUNCTION__, last_io->blk, last_io->count,
+				new_io->blk, new_io->count);
+			new_last_blk = new_io->blk + new_io->count;
+			/*
+			 * Be careful.  The end of this io could be
+			 * less than the end of last io.
+			 */
+			if (new_last_blk > last_blk)
+				last_io->count = new_last_blk - last_io->blk;
+			/* Clear the merged io */
+			new_io->blk = 0;
+			new_io->count = 0;
+			dprintf("[%u,%u]\n", last_io->blk, last_io->count);
+		} else {
+			dprintf(" RA: Not merged: [%u,%u] and [%u,%u]\n",
+				last_io->blk, last_io->count,
+				new_io->blk, new_io->count);
+			/* This is our new last io */
+			last_io = new_io;
+			new_ios++;
+		}
+	}
+
+	/* Resort to move zeroes to the end */
+	qsort(ra->io_queue, ra->ios_queued, sizeof(ra->io_queue[0]),
+	      io_compare);
+
+	check_queue(ra);
+
+	ra->ios_queued = new_ios;
+}
+
+/*
+ * Ios can be bad for at least two reasons:
+ *
+ * - corrupted on-disk block pointer (can't prevent this)
+ * - programming error (a.k.a. "bug")
+ *
+ * Check for and reject obviously bad ios before they are issued or
+ * cache is reserved for them.
+ */
+
+static int reject_io(struct readahead_state *ra, blk_t blk, unsigned int count)
+{
+	blk_t max_blk = ra->fs->super->s_blocks_count - 1;
+
+	/* Is the blk number within the bounds of the file system? */
+	if ((blk == 0) || (blk + count > max_blk)) {
+		fprintf(stderr, "Bad io: blk %u count %u\n", blk, count);
+#ifdef EXT2FS_READAHEAD_DEBUG
+		abort();
+#endif
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Complete issued ios to get them out of the device's io queue.
+ */
+
+static void reap_ios(struct readahead_state *ra, unsigned int start,
+		     unsigned int count)
+{
+	unsigned int blocks_reaped = 0;
+	struct io_desc *io;
+	int i;
+
+	dprintf(" RA: %s: start %u count %u\n", __FUNCTION__, start, count);
+	for (i = start; i < start + count; i++) {
+		io = &ra->io_queue[i];
+		if (reject_io(ra, io->blk, io->count))
+			continue;
+		/*
+		 * Read blocks into the scratch buffer one io at a
+		 * time and throw them away (scratch buffer is big
+		 * enough for the maximum possible io).  Wastes memory
+		 * bandwidth, but I sure hope that's not our
+		 * bottleneck.  Also, passing the memory back to the
+		 * main thread is painful, and keeping the blocks in
+		 * memory would result in double-buffering for
+		 * everything in the cache anyway.
+		 */
+		io_channel_read_blk(ra->fs->io, io->blk, io->count,
+				    ra->scratch_buf);
+		blocks_reaped += io->count;
+		/*
+		 * Only release blocks to the cache when we have a lot
+		 * of them to avoid excessive lock and signal
+		 * overhead.
+		 */
+		/* XXX should be based on high/low watermarks */
+		if (blocks_reaped > (ra->cache_max / 10)) {
+			blocks_ready(ra, blocks_reaped);
+			blocks_reaped = 0;
+		}
+		/* Clear the io descriptor for debugging */
+		io->blk = 0;
+		io->count = 0;
+		/*
+		 * Frequent checking for an exit request is especially
+		 * important when doing IO.
+		 */
+		readahead_test_exit(ra);
+	}
+	blocks_ready(ra, blocks_reaped);
+	blocks_reaped = 0;
+}
+
+/*
+ * Given an array of ios, sort, issue, and wait for them.
+ *
+ * Device queue management is important.  We want to avoid overflowing
+ * the device queue and getting our readahead requests thrown away, so
+ * we need to know whether the previous readahead ios have completed.
+ * At the same time, we want to issue max_ios number of sorted IOs in
+ * parallel.  So we collect ios at user level and wait until (1) the
+ * queue is full, (2) we need a synchronous read (as for double/triple
+ * indirect blocks or inode tables), or (3) we timeout.  Then we issue
+ * the readahead requests, up to max_ios at a time.  We then do a
+ * synchronous read of all the outstanding ios to make sure they have
+ * completed before issuing the next set of ios.  Poor man's aio.
+ */
+
+static void issue_ios(struct readahead_state *ra)
+{
+	unsigned int ios_in_flight = 0;
+	struct io_desc *io;
+	int ios_finished = 0;
+	int i;
+
+	merge_ios(ra);
+
+	dprintf(" RA: %s: ios_queued %u blocks_queued %u after merge\n",
+		__FUNCTION__, ra->ios_queued, ra->blocks_queued);
+	/*
+	 * Issue up to max ios at a time.  If necessary, wake the main
+	 * thread and wait for cache to become available.
+	 */
+	for (i = 0; i < ra->ios_queued; i++) {
+		io = &ra->io_queue[i];
+		/* Check io and cache limits */
+		if ((ios_in_flight == ra->max_ios) ||
+		    cache_full(ra, io->count)) {
+			/* Cache won't overflow if we issue current ios */
+			reap_ios(ra, ios_finished, ios_in_flight);
+			ios_in_flight = 0;
+			ios_finished = i;
+		}
+		/* Check that we have enough cache */
+		wait_for_cache(ra, io->count);
+		/* Now we have enough cache and a free io slot */
+		dprintf(" RA: %s: issuing io %d blk %u count %d\n",
+			__FUNCTION__, i, io->blk, io->count);
+		io_channel_readahead(ra->fs->io, io->blk, io->count);
+		/* Reserve cache for this io */
+		ra->cache_pending += io->count;
+		ios_in_flight++;
+		readahead_test_exit(ra);
+	}
+	/* Finish off the last of the ios */
+	if (ios_in_flight)
+		reap_ios(ra, ios_finished, ios_in_flight);
+	ra->ios_queued = 0;
+	ra->blocks_queued = 0;
+}
+
+/*
+ * Issue any outstanding ios and wake main thread one more time.
+ * Called before thread exits.
+ */
+
+static void finish_ios(struct readahead_state *ra)
+{
+	/* Kick off any left-over ios */
+	if (ra->ios_queued)
+		issue_ios(ra);
+
+	/* Wake main thread one more time in case it's waiting on us */
+	pthread_mutex_lock(&ra->cache_mutex);
+	/*
+	 * XXX HACK: Prevent main thread hanging on zero cache by
+	 * adding a dummy block to the cache.  Will deadlock if we
+	 * have a cache accounting bug that makes cache_used go
+	 * negative.
+	 */
+	ra->cache_used += 1;
+	dprintf(" RA: %s: Readahead finished, waking main thread "
+		"cache_used %d\n", __FUNCTION__, ra->cache_used);
+	pthread_cond_signal(&ra->cache_wake);
+	pthread_mutex_unlock(&ra->cache_mutex);
+}
+
+/*
+ * Check if the io queue is full.  If it is, try to squish the ios
+ * together to free up io slots.
+ */
+
+static int queue_full(struct readahead_state *ra)
+{
+	if (ra->ios_queued == ra->io_queue_size)
+		merge_ios(ra);
+	if (ra->ios_queued == ra->io_queue_size)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Add an io to the readahead queue, issuing ios if necessary to make
+ * space in the queue.
+ */
+
+static void queue_blks(struct readahead_state *ra, blk_t blk,
+		       unsigned int count)
+{
+	unsigned int blks_to_queue;
+	struct io_desc *io;
+
+	dprintf(" RA: %s: queue %u, blk %u, count %d\n", __FUNCTION__,
+		ra->ios_queued, blk, count);
+
+	if (reject_io(ra, blk, count))
+		return;
+
+	/* Break down large requests into max_io_size chunks */
+	while (count) {
+		if (queue_full(ra))
+			issue_ios(ra);
+		io = &ra->io_queue[ra->ios_queued];
+		blks_to_queue = count > ra->max_io_size ?
+			ra->max_io_size : count;
+		io->blk = blk;
+		io->count = blks_to_queue;
+		/* printf("%s: queued blk %u count %u\n", __FUNCTION__, io->blk,
+		   io->count); */
+		ra->blocks_queued += count;
+		ra->ios_queued++;
+		count -= blks_to_queue;
+		blk += blks_to_queue;
+	}
+}
+
+/*
+ * Read requested blocks, issuing any other waiting ios while we're at
+ * it.
+ */
+
+static void read_blks(struct readahead_state *ra, blk_t blk,
+		      unsigned int count, char *buf)
+{
+	dprintf(" RA: %s: blk %u count %u\n", __FUNCTION__, blk, count);
+
+	if (reject_io(ra, blk, count))
+		return;
+
+	/* Add to readahead queue */
+	queue_blks(ra, blk, count);
+	/*
+	 * Since we have to go to disk anyway, kick off and complete
+	 * all outstanding ios.
+	 */
+	issue_ios(ra);
+	/*
+	 * Retrieve specific data requested.  The cache used by this
+	 * block is already accounted for.
+	 */
+	io_channel_read_blk(ra->fs->io, blk, count, buf);
+}
+
+/*
+ * Inode table and indirect block readahead routines.
+ */
+
+/*
+ * Given a triple, double, or single indirect block, recursively
+ * traverse the indirect block tree.  Read triples and doubles
+ * immediately, but just add singles to the main block queue and read
+ * them at our convenience.
+ */
+
+static void readahead_ind_blk(struct readahead_state *ra, blk_t blk,
+			      int level, int print_header)
+{
+	unsigned int limit = ra->fs->blocksize >> 2;
+	blk_t *blk_ptr;
+	char *buf;
+	int i;
+
+	if (print_header)
+		dprintf(" RA: %*s%s: blk %u level %d\n", level * 4, "",
+			__FUNCTION__, blk, level);
+	else if (level != 0)
+		/* Start a new line */
+		dprintf("\n");
+
+	if (reject_io(ra, blk, 1))
+		return;
+
+	/* Single indirect block?  Just queue it up and return. */
+	if (level == 0) {
+		queue_blks(ra, blk, 1);
+		return;
+	}
+
+	/* Read double/triple immediately */
+	if (level == 1)
+		buf = ra->double_buf;
+	else
+		buf = ra->triple_buf;
+
+	read_blks(ra, blk, 1, buf);
+
+	/* Iterate through the block pointers */
+	blk_ptr = (blk_t *) buf;
+	dprintf(" RA: %*s%s(%d): read block %u into buf %p\n",
+		level * 4, "", __FUNCTION__, level, blk, buf);
+
+	for (i = 0; i < limit; i++) {
+		if (blk_ptr[i] != 0)
+			readahead_ind_blk(ra, blk_ptr[i], level - 1, 0);
+	}
+}
+
+/*
+ * Perform breadth-first traversal on the indirect blocks of the inode.
+ */
+
+static void readahead_inode(struct readahead_state *ra,
+			    struct ext2_inode *inode,
+			    ext2_ino_t ino)
+{
+	dprintf("%s: %d\n", __FUNCTION__, ino);
+
+	/* XXX Read acls and xattrs too */
+
+	if (inode->i_block[EXT2_IND_BLOCK])
+		readahead_ind_blk(ra, inode->i_block[EXT2_IND_BLOCK], 0, 1);
+	if (inode->i_block[EXT2_DIND_BLOCK])
+		readahead_ind_blk(ra, inode->i_block[EXT2_DIND_BLOCK], 1, 1);
+	if (inode->i_block[EXT2_TIND_BLOCK])
+		readahead_ind_blk(ra, inode->i_block[EXT2_TIND_BLOCK], 2, 1);
+}
+
+/*
+ * Start readahead for a set of inode tables.
+ *
+ * XXX Inode table readahead really, really ought to be parallelized,
+ * but the cache and io slot management is pretty hairy.  Why?
+ *
+ * - You can't just add the blocks to the cache for more than one
+ * inode table at once because the main thread will think the indirect
+ * blocks for the current inode table are ready and go read them -
+ * which will kill performance because reading indirect blocks out of
+ * order is pretty fatally slow.
+ *
+ * - You have to reserve some cache for indirect block reads for the
+ * current block group to complete or else readahead won't be able to
+ * progress, since all the cache will be locked up as pending and it
+ * will never make any of it available to the main thread.  To merely
+ * prevent deadlocks, you must reserve at least enough blocks for the
+ * maximum io size.  To make it perform well, you have to reserve more
+ * than that.  How much is enough?  Don't know.
+ *
+ * - To make sure you can read inode tables in parallel, you want to
+ * make sure you have enough io slots free.  Again, how many?  Should
+ * they be always reserved or can indirect block reads be able to use
+ * them?  Should you just issue all outstanding ios when you start
+ * reading a block group?  But then you have to change issue_ios so it
+ * doesn't mark the blocks as available in cache.
+ *
+ * Possible solutions:
+ *
+ * - Track inode table blocks and indirect blocks separately.  Messes
+ * up nice clean generic cache interface.
+ *
+ * - Track current inode number and don't let main thread process
+ * beyond that.  Have to worry about cache/inode progress deadlocks.
+ *
+ * - Reserve cache/io for inode table readaheads.  Danger of deadlocks
+ * waiting for cache waiting for io waiting for cache, etc..  How much
+ * to reserve not clear.
+ */
+
+static void readahead_inode_tables(struct readahead_state *ra, dgrp_t group,
+				  int count)
+{
+	int i;
+
+	dprintf(" RA: %s: Starting groups %u-%u cache_used %u\n",
+		__FUNCTION__, group, group + count - 1, ra->cache_used);
+
+	for (i = 0; i < count; i++)
+		queue_blks(ra, ra->fs->group_desc[group + i].bg_inode_table,
+			   ra->fs->inode_blocks_per_group);
+	/*
+	 * Kick off the io immediately since we'll need the blocks
+	 * right away.
+	 */
+	issue_ios(ra);
+}
+
+/*
+ * At the end of each block group, issue readahead for the next block
+ * group(s).
+ */
+
+static errcode_t readahead_scan_callback(ext2_filsys fs,
+			       ext2_inode_scan scan EXT2FS_ATTR((unused)),
+			       dgrp_t group, void * priv_data)
+{
+	struct readahead_state *ra = (struct readahead_state *) priv_data;
+	dgrp_t next_group = group + 1;
+
+	dprintf(" RA: %s: Finished group %u cache_used %d\n", __FUNCTION__,
+		group, ra->cache_used);
+
+	/* Any more blockgroups? */
+	if (next_group == fs->group_desc_count)
+		return 0;
+
+	/* Start readahead for next inode table(s) */
+	/* XXX count is always 1 for now - see comment */
+	readahead_inode_tables(ra, next_group, 1);
+	return 0;
+}
+
+/*
+ * Theory of readahead thread exit/kill:
+ *
+ * The readahead thread will end for one of two reasons: (1) This
+ * particular pass is successfully completed, (2) The main thread
+ * wants to kill readahead entirely because some pass was aborted.
+ * When either of these things happens, we need to exit the readahead
+ * thread safely and then run the exit function for this pass in the
+ * main thread's context.  The nasty bit is doing this while making
+ * sure we don't leave the cache mutex locked.  We really have to
+ * avoid the main thread blocking in any situation so that readahead
+ * doesn't make fsck *less* stable.  Pthreads doesn't offer any
+ * reasonable facilities for doing this, though.
+ *
+ * The solution is exactly the same one as the main fsck thread: Block
+ * cancellation and occasionally check at predetermined safe places if
+ * we should exit now.
+ *
+ * So the interface for cleaning up properly after passes goes like
+ * this:
+ *
+ * In the pass-specific init routine, set the thread_exit pointer to
+ * your exit function - after completing all setup that will be torn
+ * down on exit.
+ *
+ * Call readahead_thread_setup at start of the pass-specific main
+ * routine to disable cancellation.
+ *
+ * When the pass completes normally, it should call
+ * readahead_thread_exit().  This will exit the thread.  Pass-specific
+ * clean up will not happen until the main thread calls the pass exit
+ * function (clean up should be done in the same context as
+ * initialization).
+ *
+ * If something goes drastically wrong, the pass has to be restarted,
+ * or readahead needs to be disabled immediately for any reason, call
+ * ext2fs_readahead_kill().  This will set the "exit and cleanup"
+ * variable.  The readahead thread will eventually check it and
+ * correctly clean up and kill the current readahead thread.  From
+ * this point on, readahead is disabled and no future readahead
+ * requests will actually start.  If you want readahead to start
+ * again, start over again with ext2fs_readahead_init().
+ *
+ * Throughout the pass-specific readahead, use readahead_test_exit()
+ * to frequenttly check for the exit condition (e.g., don't do too
+ * much IO without checking for exit).
+ */
+
+/*
+ * Generic post-thread create setup.  Run it first thing as soon as
+ * the thread is created (in the new thread's context).
+ */
+
+static void readahead_thread_setup(struct readahead_state *ra)
+{
+	/*
+	 * Disable cancellation so that we can exit only at safe
+	 * points.
+	 */
+	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
+}
+
+/*
+ * Normal pass completion exit handler.
+ */
+
+static void readahead_thread_exit(struct readahead_state *ra)
+{
+	finish_ios(ra);
+}
+
+/*
+ * Check if readahead should exit entirely.  Currently it's just a
+ * check and an exit, but if anything is added, note that we do not
+ * want to do anything fancy because we're trying to kill all
+ * readahead activity as soon as possible without doing anything
+ * dangerous.  All pass-specific cleanup is done in the context of the
+ * caller.
+ *
+ * This function may not be called with the cache mutex held.
+ */
+
+static void readahead_test_exit(struct readahead_state *ra)
+{
+	if (!ra->should_exit)
+		return;
+
+	fprintf(stderr, "Readahead thread dying an untimely death.\n");
+
+	ra->enabled = 0;
+	pthread_exit(NULL);
+}
+
+/*
+ * Exit now.  Initiated from the readahead thread, rather than the
+ * main thread as in ext2fs_readahead_kill().
+ *
+ * This function may not be called with the cache mutex held.
+ */
+
+static void readhead_thread_exit_now(struct readahead_state *ra)
+{
+	ra->should_exit = 1;
+	readahead_test_exit(ra);
+}
+
+/*
+ * The inode readahead main function iterates over all the inodes in
+ * the file system, reading the indirect blocks if necessary, to get
+ * data in cache as efficiently as possible.  The main thread issues
+ * IOs synchronously and as-needed; we can do better.
+ */
+
+void * readahead_inode_loop(void *arg)
+{
+	struct readahead_state *ra = (struct readahead_state *) arg;
+	struct ext2_inode *inode;
+	ext2_ino_t ino = 0;
+
+	readahead_thread_setup(ra);
+
+	dprintf(" RA: %s\n", __FUNCTION__);
+
+	ext2fs_set_inode_callback(ra->scan, readahead_scan_callback, ra);
+
+	/* Start first bg readahead, rest are triggered by callback */
+	readahead_inode_tables(ra, 0, 1);
+
+	do {
+		ext2fs_get_next_inode_ptr(ra->scan, &ino, &inode);
+		/* Is it interesting? Readahead its indirect blocks */
+		if (!ra->should_read_inode || ra->should_read_inode(inode))
+			readahead_inode(ra, inode, ino);
+		/*
+		 * Check after every inode for exit request; we can
+		 * process a lot of inodes before hitting any other
+		 * exit points.
+		 */
+		readahead_test_exit(ra);
+	} while (ino);
+
+	readahead_thread_exit(ra);
+	/* Return value ignored */
+	return NULL;
+}
+
+/*
+ * Helper function for ext2fs_dblist_iterate.
+ */
+
+static int iter_dblist(ext2_filsys fs, struct ext2_db_entry *db,
+		       void *priv_data)
+{
+	struct readahead_state *ra = (struct readahead_state *) priv_data;
+
+	queue_blks(ra, db->blk, 1);
+
+	return 0;
+}
+
+/*
+ * Do the directory block readahead.
+ */
+
+void * readahead_dblist(void *arg)
+{
+	struct readahead_state *ra = (struct readahead_state *) arg;
+	errcode_t err;
+
+	readahead_thread_setup(ra);
+
+	err = ext2fs_dblist_iterate(ra->dblist, iter_dblist, ra);
+
+	if (err)
+		fprintf(stderr, "Error traversing directory blocks\n");
+
+	readahead_thread_exit(ra);
+
+	/* Return value ignored */
+	return NULL;
+}
+
+/*
+ * The client routines for readahead are called only by the main
+ * thread.  The rest of the readahead functions are called only by the
+ * readahead thread.  The client initializes, starts, and stops the
+ * readahead thread.  It informs the readahead thread when it has
+ * finished reading blocks from the cache.
+ *
+ * The distinction between the two threads is especially important
+ * when it comes to lseek()/read()/write() on the device file
+ * descriptor.  The two threads must never share an fd or the lseek()s
+ * will race with each other.  The readahead thread's fd is in the
+ * readahead struct; the main thread's fd is in the context struct
+ * (both are inside the ext2_filsys sturcture).  The main symptom of
+ * this bug is getting back data which is from somewhere on disk, just
+ * not the somewhere you wanted.
+ *
+ */
+
+static int readahead_disabled(struct readahead_state *ra)
+{
+	if ((ra == NULL) || !ra->enabled)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Put the cache and associated locks into the right state to start a
+ * readahead thread.
+ */
+
+static void readahead_cache_init(struct readahead_state *ra)
+{
+	pthread_mutex_init(&ra->cache_mutex, NULL);
+	pthread_cond_init(&ra->cache_wake, NULL);
+
+	ra->cache_used = 0;
+	ra->cache_needed = 0;
+	ra->cache_readahead_done = 0;
+}
+
+errcode_t ext2fs_readahead_init(ext2_filsys fs, unsigned int max_ios,
+				unsigned int cache_max,
+				struct readahead_state **ret_ra)
+{
+	struct readahead_state *ra;
+	int retval;
+
+	dprintf("MT: %s\n", __FUNCTION__);
+
+	*ret_ra = NULL;
+
+	/*
+	 * Don't initialize readahead unless explicitly enabled by the
+	 * user passing the max ios argument.
+	 */
+
+	if (max_ios == 0)
+		return 0;
+
+	retval = ext2fs_get_mem(sizeof(*ra), &ra);
+	if (retval)
+		return retval;
+
+	/* Reopen the device so we don't share the file pointer */
+	retval = ext2fs_open2(fs->device_name, NULL,
+			      /* Don't ask for exclusive open because
+			       * we definitely won't get it. */
+			      (fs->flags && ~IO_FLAG_EXCLUSIVE), 0, 0,
+			      fs->io->manager, &ra->fs);
+	if (retval)
+		goto out;
+
+	/* Thread exit/kill management */
+	ra->should_exit = 0;
+	ra->thread_exit = NULL;
+
+	/* IO queue set up */
+	ra->max_ios = max_ios;
+	dprintf("MT: %s: Maximum ios %u\n", __FUNCTION__, ra->max_ios);
+	ra->timeout = 100; /* milliseconds, currently not used */
+	/* Not sure how big to make this, but must be at least max_ios */
+	ra->io_queue_size = ra->max_ios * 10;
+	dprintf("MT: %s: IO queue size %u\n", __FUNCTION__, ra->io_queue_size);
+	ra->ios_queued = 0;
+	retval = ext2fs_get_mem(sizeof(ra->io_queue[0]) * ra->io_queue_size,
+				&ra->io_queue);
+	if (retval)
+		goto out_close;
+	/* Clear the queue for debugging purposes */
+	memset(ra->io_queue, 0, sizeof(ra->io_queue[0]) * ra->io_queue_size);
+
+	/* Cache set up */
+
+	/*
+	 * Max memory must be at least as big as an inode table.  Make
+	 * the minimum four times that so we can get more than one
+	 * block group ahead and read some indirect blocks too.
+	 */
+	ra->cache_max = cache_max;
+	/* XXX currently measuring cache_max in file system blocks, yuck */
+	if (ra->cache_max < ra->fs->inode_blocks_per_group) {
+		ra->cache_max = 4 * ra->fs->inode_blocks_per_group;
+		if (cache_max != 0)
+			/* XXX use ext2 printing routines */
+			fprintf(stderr, "Maximum cache %u blocks too small; "
+				"raised to %u blocks", cache_max,
+				ra->cache_max);
+	}
+	dprintf("MT: %s: Maximum cache %u\n", __FUNCTION__, ra->cache_max);
+
+	readahead_cache_init(ra);
+	/*
+	 * When the cache gets full, the readahead thread sleeps.
+	 * When the cache gets empty, the main thread sleeps.  If we
+	 * wake the threads at empty/full, then we end up with an
+	 * inefficient ping-pong effect where the readahead thread
+	 * only issues one io before going back to sleep.  The ideal
+	 * pattern is where the readahead thread caches a bunch of
+	 * blocks, then goes back to sleep until a lot of the cache is
+	 * free again, so it issue lots of ios in parallel.  When the
+	 * main thread wakes/sleeps is less important, other than
+	 * avoiding a lot of signal/wake traffic if it's near the
+	 * readahead thread wake point or near the cache boundaries.
+	 * Wait to wake it up until the cache is close to full.
+	 *
+	 * XXX Should wake main thread sooner?
+	 */
+
+	/* Restart readahead when the cache gets this low */
+	ra->cache_restart_ra = ra->cache_max / 3;
+	dprintf("MT: %s: Readahead restarts at %u\n", __FUNCTION__,
+	       ra->cache_restart_ra);
+	ra->cache_restart_main = (ra->cache_max * 2) / 3;
+	dprintf("MT: %s: Main thread restarts at %u\n", __FUNCTION__,
+	       ra->cache_restart_main);
+
+	/* Maxium io size (in blocks) should be less than the cache size */
+	/* XXX should be command line option */
+	ra->max_io_size = (256 * 1024) / ra->fs->blocksize;
+	if (ra->max_io_size > ra->cache_max) {
+		dprintf("MT: %s: Maximum io size %u larger than cache %u\n",
+			__FUNCTION__, ra->max_io_size, ra->cache_max);
+		ra->max_io_size = ra->cache_max / 4;
+	}
+	dprintf("MT: %s: Maximum io size %u\n", __FUNCTION__, ra->max_io_size);
+
+	/* Make scratch buffer big enough for largest ios */
+	retval = ext2fs_get_mem(ra->max_io_size * ra->fs->blocksize,
+				&ra->scratch_buf);
+	if (retval)
+		goto out_free_queue;
+
+	ra->enabled = 1;
+	*ret_ra = ra;
+	return 0;
+
+ out_free_queue:
+	ext2fs_free_mem(&ra->io_queue);
+ out_close:
+	ext2fs_close(ra->fs);
+ out:
+	ext2fs_free_mem(&ra);
+	fprintf(stderr, "Readahead failed to initialize.\n");
+	return retval;
+}
+
+void ext2fs_readahead_exit(struct readahead_state **ret_ra)
+{
+	struct readahead_state *ra = *ret_ra;
+
+	dprintf("MT: %s\n", __FUNCTION__);
+
+	if (!ra)
+		return;
+
+	pthread_mutex_destroy(&ra->cache_mutex);
+	pthread_cond_destroy(&ra->cache_wake);
+	ext2fs_free_mem(&ra->scratch_buf);
+	ext2fs_free_mem(&ra->io_queue);
+	ext2fs_close(ra->fs);
+	ext2fs_free_mem(&ra);
+	*ret_ra = NULL;
+}
+
+/*
+ * Release blocks that have already been read.  Primitive for the
+ * various release runctions.
+ */
+
+static void blocks_release(struct readahead_state *ra, ext2_filsys fs,
+			   blk_t blk, int count)
+{
+	dprintf("MT: %s: cache_used %d releasing %d\n", __FUNCTION__,
+		ra->cache_used, count);
+	pthread_mutex_lock(&ra->cache_mutex);
+	ra->cache_used -= count;
+	/*
+	 * Did we get ahead of readahead?
+	 *
+	 * XXX Main thread hangs on cache_used == 0; current
+	 * workaround is to add a bogus block to the cache on exit.
+	 */
+	if (ra->cache_used <= 0) {
+		dprintf("MT: %s: Cache empty, waking readahead thread\n",
+			__FUNCTION__);
+		pthread_cond_signal(&ra->cache_wake);
+		dprintf("MT: %s: Waiting for readahead to populate cache...\n",
+			__FUNCTION__);
+		/* XXX use timeout to avoid deadlock on fatal bugs */
+		pthread_cond_wait(&ra->cache_wake, &ra->cache_mutex);
+		dprintf("MT: %s: Continuing, cache_used %d\n", __FUNCTION__,
+			ra->cache_used);
+	}
+	/*
+	 * Wake readahead thread if we have enough free cache to issue
+	 * io efficiently.
+	 */
+	if ((ra->cache_used + count) > ra->cache_restart_ra &&
+	    (ra->cache_used <= ra->cache_restart_ra)) {
+		/* Do we have enough cache to satisfy any specific need? */
+		if ((ra->cache_used + ra->cache_needed <= ra->cache_max)) {
+			dprintf("MT: %s: Cache available, waking readahead "
+				"(needed %u used %d)\n", __FUNCTION__,
+				ra->cache_needed, ra->cache_used);
+			pthread_cond_signal(&ra->cache_wake);
+			/* Don't keep waking it over and over... */
+			ra->cache_needed = 0;
+		}
+	}
+	pthread_mutex_unlock(&ra->cache_mutex);
+	/* Optional but nice: Let vm know we don't need these any more */
+	io_channel_cache_release(fs->io, blk, count);
+}
+
+/*
+ * ext2fs_block_iterate2 helper function for releasing indirect
+ * blocks.
+ */
+
+static int ind_block_release(ext2_filsys fs, blk_t *block_nr,
+		  e2_blkcnt_t blockcnt,
+		  blk_t ref_block EXT2FS_ATTR((unused)),
+		  int ref_offset EXT2FS_ATTR((unused)),
+		  void *priv_data)
+{
+	struct readahead_state *ra = (struct readahead_state *) priv_data;
+
+	/*
+	 * The blockcnt arg is the total number of data blocks (?)
+	 * traversed so far for this inode, not the number of blocks
+	 * to release.
+	 */
+	blocks_release(ra, fs, *block_nr, 1);
+
+	return 0;
+}
+
+/*
+ * Mark all cached blocks from this inode as released.
+ */
+
+void ext2fs_readahead_inode_release(struct readahead_state *ra,
+				    ext2_filsys fs, ext2_ino_t ino, char *block_buf)
+{
+	errcode_t err;
+
+	dprintf("MT: %s: %u\n", __FUNCTION__, ino);
+
+	if (readahead_disabled(ra))
+		return;
+
+	/*
+	 * Don't use ra->fs - that contains the readahead thread's fd
+	 * - using the same fd results in exciting lseek()/read()
+	 * races.
+	 */
+	err = ext2fs_block_iterate2(fs, ino, BLOCK_FLAG_IND_ONLY,
+				    block_buf, ind_block_release, ra);
+
+	/* Can't return this error, but can notify user */
+	if (err)
+		fprintf(stderr, "%s: Error %d traversing indirect blocks "
+			"for ino %u\n",  __FUNCTION__, (int) err, ino);
+}
+
+/*
+ * Mark all cached blocks from this inode table as used.
+ */
+
+void ext2fs_readahead_inode_table_release(struct readahead_state *ra,
+					  ext2_filsys fs, dgrp_t group)
+{
+	if (readahead_disabled(ra))
+		return;
+
+	dprintf("MT: %s: Finished group %u cache_used %u\n",
+		__FUNCTION__, group, ra->cache_used);
+
+	blocks_release(ra, fs, ra->fs->group_desc[group].bg_inode_table,
+		       ra->fs->inode_blocks_per_group);
+}
+
+/*
+ * Generic thread start.  Pass it the function for starting this pass
+ * of readahead.
+ */
+
+static void readahead_thread_start(struct readahead_state *ra,
+				   void * (*thread_start)(void *))
+{
+	if (pthread_create(&ra->thread, NULL, thread_start, ra)) {
+		fprintf(stderr, "Thread failed to start.\n");
+		ra->enabled = 0;
+		return;
+	}
+
+	/*
+	 * Wait for readahead to put something in the cache.  The
+	 * readahead thread might have already read something into the
+	 * cache and signaled us to wake, so only wait if nothing is
+	 * in the cache already.
+	 */
+	pthread_mutex_lock(&ra->cache_mutex);
+	if (ra->cache_used == 0)
+		pthread_cond_wait(&ra->cache_wake, &ra->cache_mutex);
+	pthread_mutex_unlock(&ra->cache_mutex);
+}
+
+/*
+ * Signal the current readahead thread to stop.  Call it before
+ * freeing any resources that the readahead thread might access.
+ *
+ * This function must complete in the case of the thread exiting
+ * before or during this function.
+ */
+
+static void readahead_thread_stop(struct readahead_state *ra)
+{
+	ra->should_exit = 1;
+	/*
+	 * Wake readahead thread in case it's waiting on cache.  The
+	 * readahead thread must test for exit after every
+	 * pthread_cond_wait().
+	 */
+	pthread_mutex_lock(&ra->cache_mutex);
+	pthread_cond_signal(&ra->cache_wake);
+	pthread_mutex_unlock(&ra->cache_mutex);
+	/*
+	 * At this point, the readahead thread has either (a) already
+	 * exited, or (b) is running but will call
+	 * readahead_test_exit() shortly and exit then.
+	 */
+	pthread_join(ra->thread, NULL);
+	/* Now the readahead thread is dead for sure */
+
+	/* Run the pass-specific clean up */
+	if (ra->thread_exit)
+		ra->thread_exit(ra);
+	ra->thread_exit = NULL;
+
+	readahead_cache_init(ra);
+	ra->should_exit = 0;
+
+	/*
+	 * Check to see if the readahead thread encountered some
+	 * problem and disabled readahead.  In that case, close down
+	 * readahead entirely.  To restart, call
+	 * ext2fs_readahead_init().
+	 */
+	if (readahead_disabled(ra))
+		ext2fs_readahead_exit(&ra);
+}
+
+/*
+ * Use this to kill readahead entirely and turn it off until
+ * ext2fs_readahead_init() is called again.  It is safe to call any
+ * ext2fs_readahead_* function after this since readahead will be
+ * disabled.
+ */
+
+void ext2fs_readahead_kill(struct readahead_state *ra)
+{
+	dprintf("MT: %s\n", __FUNCTION__);
+
+	if (!ra)
+		return;
+
+	readahead_thread_stop(ra);
+
+	ra->enabled = 0;
+}
+
+/*
+ * Clean up after inode readahead.
+ */
+
+static void readahead_inode_exit(struct readahead_state *ra)
+{
+	ext2fs_close_inode_scan(ra->scan);
+	ext2fs_free_mem(&ra->triple_buf);
+	ext2fs_free_mem(&ra->double_buf);
+	ra->should_read_inode = NULL;
+}
+
+/*
+ * Initialize inode readahead state.
+ */
+
+void ext2fs_readahead_inode_init(struct readahead_state *ra,
+				 int (*should_read_inode)(struct ext2_inode *))
+{
+	if (readahead_disabled(ra))
+		return;
+
+	dprintf("MT: %s\n", __FUNCTION__);
+
+	/*
+	 * Allocate buffers for saving triple and double indirect
+	 * blocks while recursively queueing and issuing ios for the
+	 * block pointers they contain.  Don't need any for single
+	 * indirect blocks because we don't have to read the data
+	 * blocks they point to.
+	 */
+
+	if (ext2fs_get_mem(ra->fs->blocksize, &ra->double_buf))
+		goto out;
+
+	if (ext2fs_get_mem(ra->fs->blocksize, &ra->triple_buf))
+		goto out_free_double;
+
+	if (ext2fs_open_inode_scan(ra->fs, 8, &ra->scan)) {
+		fprintf(stderr, "Open inode scan failed, disabling "
+			"readahead\n");
+		goto out_free_double;
+	}
+
+	ra->should_read_inode = should_read_inode;
+	ra->thread_exit = readahead_inode_exit;
+
+	return;
+
+ out_free_double:
+	ext2fs_free_mem(&ra->double_buf);
+ out:
+	fprintf(stderr, "Inode readahead failed to initialize.\n");
+	ra->enabled = 0;
+}
+
+/*
+ * Kick off inode readahead.
+ */
+
+void ext2fs_readahead_inode_start(struct readahead_state *ra)
+{
+	dprintf("MT: %s\n", __FUNCTION__);
+
+	if (readahead_disabled(ra))
+		return;
+
+	readahead_thread_start(ra, readahead_inode_loop);
+}
+
+/*
+ * Stop the inode readahead thread.  readahead_thread_stop() does all
+ * the actual work, including running the thread_exit function.
+ */
+
+void ext2fs_readahead_inode_exit(struct readahead_state *ra)
+{
+	dprintf("MT: %s\n", __FUNCTION__);
+
+	if (readahead_disabled(ra))
+		return;
+
+	readahead_thread_stop(ra);
+}
+
+/*
+ * Release a block from the directory block list.
+ */
+
+void ext2fs_readahead_dblist_release(struct readahead_state *ra,
+				     ext2_filsys fs, blk_t blk)
+{
+	if (readahead_disabled(ra))
+		return;
+
+	blocks_release(ra, fs, blk, 1);
+}
+
+/*
+ * Clean up after directory block readahead.
+ */
+
+static void readahead_dblist_exit(struct readahead_state *ra)
+{
+	ra->dblist = NULL;
+}
+
+/*
+ * Set up the directory block readahead thread.
+ */
+
+void ext2fs_readahead_dblist_init(struct readahead_state *ra, ext2_dblist dblist)
+{
+	dprintf("MT: %s\n", __FUNCTION__);
+
+	if (readahead_disabled(ra))
+		return;
+
+	ra->dblist = dblist;
+	ra->thread_exit = readahead_dblist_exit;
+}
+
+/*
+ * Start the directory block readahead thread.
+ *
+ * Don't alter the dblist (i.e., sort it) after starting the dblist
+ * readahead, or you'll have some fabulous race conditions.  Note that
+ * ext2_dblist_iterate will sort the dblist if it's not already
+ * sorted.  Sort the dblist BEFORE you call this function.
+ */
+
+void ext2fs_readahead_dblist_start(struct readahead_state *ra)
+{
+	dprintf("MT: %s\n", __FUNCTION__);
+
+	if (readahead_disabled(ra))
+		return;
+
+	readahead_thread_start(ra, readahead_dblist);
+}
+
+void ext2fs_readahead_dblist_exit(struct readahead_state *ra)
+{
+	dprintf("MT: %s\n", __FUNCTION__);
+
+	if (readahead_disabled(ra))
+		return;
+
+	readahead_thread_stop(ra);
+}
--- e2fsprogs-1.40.4.orig/e2fsck/pass2.c
+++ e2fsprogs-1.40.4/e2fsck/pass2.c
@@ -148,9 +148,17 @@ void e2fsck_pass2(e2fsck_t ctx)
 
 	if (fs->super->s_feature_compat & EXT2_FEATURE_COMPAT_DIR_INDEX)
 		ext2fs_dblist_sort(fs->dblist, special_dir_block_cmp);
-	
+	else
+		ext2fs_dblist_sort(fs->dblist, 0);
+
+	/* Start readahead after the dblist has been sorted */
+	ext2fs_readahead_dblist_init(ctx->ra, fs->dblist);
+	ext2fs_readahead_dblist_start(ctx->ra);
+
 	cd.pctx.errcode = ext2fs_dblist_iterate(fs->dblist, check_dir_block,
 						&cd);
+	ext2fs_readahead_dblist_exit(ctx->ra);
+
 	if (ctx->flags & E2F_FLAG_SIGNAL_MASK)
 		return;
 	if (cd.pctx.errcode) {
@@ -778,6 +786,8 @@ static int check_dir_block(ext2_filsys f
 	
 	old_op = ehandler_operation(_("reading directory block"));
 	cd->pctx.errcode = ext2fs_read_dir_block(fs, block_nr, buf);
+	/* Release the block now - it is already in our memory */
+	ext2fs_readahead_dblist_release(ctx->ra, fs, block_nr);
 	ehandler_operation(0);
 	if (cd->pctx.errcode == EXT2_ET_DIR_CORRUPTED)
 		cd->pctx.errcode = 0; /* We'll handle this ourselves */
--- e2fsprogs-1.40.4.orig/lib/ext2fs/inode.c
+++ e2fsprogs-1.40.4/lib/ext2fs/inode.c
@@ -491,6 +491,76 @@ errcode_t ext2fs_get_next_inode_full(ext
 	return retval;
 }
 
+errcode_t ext2fs_get_next_inode_ptr(ext2_inode_scan scan, ext2_ino_t *ino,
+				    struct ext2_inode **inode)
+{
+	errcode_t	retval;
+	int		extra_bytes = 0;
+
+	EXT2_CHECK_MAGIC(scan, EXT2_ET_MAGIC_INODE_SCAN);
+
+	/*
+	 * Do we need to start reading a new block group?
+	 */
+	if (scan->inodes_left <= 0) {
+	force_new_group:
+		if (scan->done_group) {
+			retval = (scan->done_group)
+				(scan->fs, scan, scan->current_group,
+				 scan->done_group_data);
+			if (retval)
+				return retval;
+		}
+		if (scan->groups_left <= 0) {
+			*ino = 0;
+			return 0;
+		}
+		retval = get_next_blockgroup(scan);
+		if (retval)
+			return retval;
+	}
+	/*
+	 * These checks are done outside the above if statement so
+	 * they can be done for block group #0.
+	 */
+	if ((scan->scan_flags & EXT2_SF_DO_LAZY) &&
+	    (scan->fs->group_desc[scan->current_group].bg_flags &
+	     EXT2_BG_INODE_UNINIT))
+		goto force_new_group;
+	if (scan->current_block == 0) {
+		if (scan->scan_flags & EXT2_SF_SKIP_MISSING_ITABLE) {
+			goto force_new_group;
+		} else
+			return EXT2_ET_MISSING_INODE_TABLE;
+	}
+
+	/*
+	 * Have we run out of space in the inode buffer?  If so, we
+	 * need to read in more blocks.
+	 */
+	if (scan->bytes_left < scan->inode_size) {
+		memcpy(scan->temp_buffer, scan->ptr, scan->bytes_left);
+		extra_bytes = scan->bytes_left;
+
+		retval = get_next_blocks(scan);
+		if (retval)
+			return retval;
+	}
+
+	/* XXX ignoring extra_bytes logic */
+	/* Return a pointer directly to the inode buffer... Don't write it. */
+	*inode = (struct ext2_inode *) scan->ptr;
+	scan->ptr += scan->inode_size;
+	scan->bytes_left -= scan->inode_size;
+	if (scan->scan_flags & EXT2_SF_BAD_INODE_BLK)
+		retval = EXT2_ET_BAD_BLOCK_IN_INODE_TABLE;
+
+	scan->inodes_left--;
+	scan->current_inode++;
+	*ino = scan->current_inode;
+	return retval;
+}
+
 errcode_t ext2fs_get_next_inode(ext2_inode_scan scan, ext2_ino_t *ino,
 				struct ext2_inode *inode)
 {

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-16 21:30 ` [RFC] Parallelize IO for e2fsck Valerie Henson
@ 2008-01-18  1:15   ` David Chinner
  2008-01-18  1:43     ` Valerie Henson
  2008-01-21 23:00   ` Andreas Dilger
  1 sibling, 1 reply; 29+ messages in thread
From: David Chinner @ 2008-01-18  1:15 UTC (permalink / raw)
  To: Valerie Henson
  Cc: linux-fsdevel, linux-ext4, linux-kernel, Theodore Ts'o,
	Andreas Dilger, Ric Wheeler

On Wed, Jan 16, 2008 at 01:30:43PM -0800, Valerie Henson wrote:
> Hi y'all,
> 
> This is a request for comments on the rewrite of the e2fsck IO
> parallelization patches I sent out a few months ago.  The mechanism is
> totally different.  Previously IO was parallelized by issuing IOs from
> multiple threads; now a single thread issues fadvise(WILLNEED) and
> then uses read() to complete the IO.

Interesting.

We ultimately rejected a similar patch to xfs_repair (pre-population
the kernel block device cache) mainly because of low memory
performance issues and it doesn't really enable you to do anything
particularly smart with optimising I/O patterns for larger, high
performance RAID arrays.

The low memory problems were particularly bad; the readahead
thrashing cause a slowdown of 2-3x compared to the baseline and
often it was due to the repair process requiring all of memory
to cache stuff it would need later. IIRC, multi-terabyte ext3
filesystems have similar memory usage problems to XFS, so there's
a good chance that this patch will see the same sorts of issues.

> Single disk performance doesn't change, but elapsed time drops by
> about 50% on a big RAID-5 box.  Passes 1 and 2 are parallelized.  Pass
> 5 is left as an exercise for the reader.

Promising results, though....

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-18  1:15   ` David Chinner
@ 2008-01-18  1:43     ` Valerie Henson
  0 siblings, 0 replies; 29+ messages in thread
From: Valerie Henson @ 2008-01-18  1:43 UTC (permalink / raw)
  To: David Chinner
  Cc: linux-fsdevel, linux-ext4, linux-kernel, Theodore Ts'o,
	Andreas Dilger, Ric Wheeler

On Jan 17, 2008 5:15 PM, David Chinner <dgc@sgi.com> wrote:
> On Wed, Jan 16, 2008 at 01:30:43PM -0800, Valerie Henson wrote:
> > Hi y'all,
> >
> > This is a request for comments on the rewrite of the e2fsck IO
> > parallelization patches I sent out a few months ago.  The mechanism is
> > totally different.  Previously IO was parallelized by issuing IOs from
> > multiple threads; now a single thread issues fadvise(WILLNEED) and
> > then uses read() to complete the IO.
>
> Interesting.
>
> We ultimately rejected a similar patch to xfs_repair (pre-population
> the kernel block device cache) mainly because of low memory
> performance issues and it doesn't really enable you to do anything
> particularly smart with optimising I/O patterns for larger, high
> performance RAID arrays.
>
> The low memory problems were particularly bad; the readahead
> thrashing cause a slowdown of 2-3x compared to the baseline and
> often it was due to the repair process requiring all of memory
> to cache stuff it would need later. IIRC, multi-terabyte ext3
> filesystems have similar memory usage problems to XFS, so there's
> a good chance that this patch will see the same sorts of issues.

That was one of my first concerns - how to avoid overflowing memory?
Whenever I screw it up on e2fsck, it does go, oh, 2 times slower due
to the minor detail of every single block being read from disk twice.
:)

I have a partial solution that sort of blindly manages the buffer
cache.  First, the user passes e2fsck a parameter saying how much
memory is available as buffer cache.  The readahead thread reads
things in and immediately throws them away so they are only in buffer
cache (no double-caching).  Then readahead and e2fsck work together so
that readahead only reads in new blocks when the main thread is done
with earlier blocks.  The already-used blocks get kicked out of buffer
cache to make room for the new ones.

What would be nice is to take into account the current total memory
usage of the whole fsck process and factor that in.  I don't think it
would be hard to add to the existing cache management framework.
Thoughts?

> Promising results, though....

Thanks!  It's solving a rather simpler problem than XFS check/repair. :)

-VAL

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-16 21:30 ` [RFC] Parallelize IO for e2fsck Valerie Henson
  2008-01-18  1:15   ` David Chinner
@ 2008-01-21 23:00   ` Andreas Dilger
  2008-01-22  3:38     ` David Chinner
  1 sibling, 1 reply; 29+ messages in thread
From: Andreas Dilger @ 2008-01-21 23:00 UTC (permalink / raw)
  To: Valerie Henson
  Cc: linux-fsdevel, linux-ext4, linux-kernel, Theodore Ts'o,
	Andreas Dilger, Ric Wheeler

On Jan 16, 2008  13:30 -0800, Valerie Henson wrote:
> I have a partial solution that sort of blindly manages the buffer
> cache.  First, the user passes e2fsck a parameter saying how much
> memory is available as buffer cache.  The readahead thread reads
> things in and immediately throws them away so they are only in buffer
> cache (no double-caching).  Then readahead and e2fsck work together so
> that readahead only reads in new blocks when the main thread is done
> with earlier blocks.  The already-used blocks get kicked out of buffer
> cache to make room for the new ones.
>
> What would be nice is to take into account the current total memory
> usage of the whole fsck process and factor that in.  I don't think it
> would be hard to add to the existing cache management framework.
> Thoughts?

I discussed this with Ted at one point also.  This is a generic problem,
not just for readahead, because "fsck" can run multiple e2fsck in parallel
and in case of many large filesystems on a single node this can cause
memory usage problems also.

What I was proposing is that "fsck.{fstype}" be modified to return an
estimated minimum amount of memory needed, and some "desired" amount of
memory (i.e. readahead) to fsck the filesystem, using some parameter like
"fsck.{fstype} --report-memory-needed /dev/XXX".  If this does not
return the output in the expected format, or returns an error then fsck
will assume some amount of memory based on the device size and continue
as it does today.

If the fsck.{fstype} does understand this parameter, then fsck makes a
decision based on devices, parallelism, total RAM (less some amount to
avoid thrashing), then it can call the individual fsck commands with
"--maximum-memory MMM /dev/XXX" so each knows how much cache it can
allocate.  This parameter can also be specified by the user if running
e2fsck directly.

I haven't looked through your patch yet, but I hope to get to it soon.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-21 23:00   ` Andreas Dilger
@ 2008-01-22  3:38     ` David Chinner
  2008-01-22  4:17       ` Valdis.Kletnieks
                         ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: David Chinner @ 2008-01-22  3:38 UTC (permalink / raw)
  To: Valerie Henson, linux-fsdevel, linux-ext4, linux-kernel,
	Theodore Ts'o, Andreas Dilger, Ric Wheeler

On Mon, Jan 21, 2008 at 04:00:41PM -0700, Andreas Dilger wrote:
> On Jan 16, 2008  13:30 -0800, Valerie Henson wrote:
> > I have a partial solution that sort of blindly manages the buffer
> > cache.  First, the user passes e2fsck a parameter saying how much
> > memory is available as buffer cache.  The readahead thread reads
> > things in and immediately throws them away so they are only in buffer
> > cache (no double-caching).  Then readahead and e2fsck work together so
> > that readahead only reads in new blocks when the main thread is done
> > with earlier blocks.  The already-used blocks get kicked out of buffer
> > cache to make room for the new ones.
> >
> > What would be nice is to take into account the current total memory
> > usage of the whole fsck process and factor that in.  I don't think it
> > would be hard to add to the existing cache management framework.
> > Thoughts?
> 
> I discussed this with Ted at one point also.  This is a generic problem,
> not just for readahead, because "fsck" can run multiple e2fsck in parallel
> and in case of many large filesystems on a single node this can cause
> memory usage problems also.
> 
> What I was proposing is that "fsck.{fstype}" be modified to return an
> estimated minimum amount of memory needed, and some "desired" amount of
> memory (i.e. readahead) to fsck the filesystem, using some parameter like
> "fsck.{fstype} --report-memory-needed /dev/XXX".  If this does not
> return the output in the expected format, or returns an error then fsck
> will assume some amount of memory based on the device size and continue
> as it does today.

And while fsck is running, some other program runs that uses
memory and blows your carefully calculated paramters to smithereens?

I think there is a clear need for applications to be able to
register a callback from the kernel to indicate that the machine as
a whole is running out of memory and that the application should
trim it's caches to reduce memory utilisation.

Perhaps instead of swapping immediately, a SIGLOWMEM could be sent
to a processes that aren't masking the signal followed by a short
grace period to allow the processes to free up some memory before
swapping out pages from that process?

With this sort of feedback, the fsck process can scale back it's
readahead and remove cached info that is not critical to what it
is currently doing and thereby prevent readahead thrashing as
memory usage of the fsck process itself grows.

Another example where this could be useful is to tell browsers to
release some of their cache rather than having the VM swap it out.

IMO, a scheme like this will be far more reliable than trying to
guess what the optimal settings are going to be over the whole
lifetime of a process....

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-22  3:38     ` David Chinner
@ 2008-01-22  4:17       ` Valdis.Kletnieks
  2008-01-22  7:00         ` Andreas Dilger
  2008-01-22  7:05       ` Andreas Dilger
  2008-01-22 17:42       ` Bryan Henderson
  2 siblings, 1 reply; 29+ messages in thread
From: Valdis.Kletnieks @ 2008-01-22  4:17 UTC (permalink / raw)
  To: David Chinner
  Cc: Valerie Henson, linux-fsdevel, linux-ext4, linux-kernel,
	Theodore Ts'o, Andreas Dilger, Ric Wheeler

[-- Attachment #1: Type: text/plain, Size: 494 bytes --]

On Tue, 22 Jan 2008 14:38:30 +1100, David Chinner said:

> Perhaps instead of swapping immediately, a SIGLOWMEM could be sent
> to a processes that aren't masking the signal followed by a short
> grace period to allow the processes to free up some memory before
> swapping out pages from that process?

AIX had SIGDANGER some 15 years ago.  Admittedly, that was sent when
the system was about to hit OOM, not when it was about to start swapping.

I suspect both approaches have their merits...

[-- Attachment #2: Type: application/pgp-signature, Size: 226 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-22  4:17       ` Valdis.Kletnieks
@ 2008-01-22  7:00         ` Andreas Dilger
  2008-01-22 13:05           ` Alan Cox
  2008-01-22 14:40           ` Theodore Tso
  0 siblings, 2 replies; 29+ messages in thread
From: Andreas Dilger @ 2008-01-22  7:00 UTC (permalink / raw)
  To: Valdis.Kletnieks
  Cc: David Chinner, Valerie Henson, linux-fsdevel, linux-ext4,
	linux-kernel, Theodore Ts'o, Andreas Dilger, Ric Wheeler

On Jan 21, 2008  23:17 -0500, Valdis.Kletnieks@vt.edu wrote:
> On Tue, 22 Jan 2008 14:38:30 +1100, David Chinner said:
> > Perhaps instead of swapping immediately, a SIGLOWMEM could be sent
> > to a processes that aren't masking the signal followed by a short
> > grace period to allow the processes to free up some memory before
> > swapping out pages from that process?
> 
> AIX had SIGDANGER some 15 years ago.  Admittedly, that was sent when
> the system was about to hit OOM, not when it was about to start swapping.

I'd tried to advocate SIGDANGER some years ago as well, but none of
the kernel maintainers were interested.  It definitely makes sense
to have some sort of mechanism like this.  At the time I first brought
it up it was in conjunction with Netscape using too much cache on some
system, but it would be just as useful for all kinds of other memory-
hungry applications.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-22  3:38     ` David Chinner
  2008-01-22  4:17       ` Valdis.Kletnieks
@ 2008-01-22  7:05       ` Andreas Dilger
  2008-01-22  8:16         ` David Chinner
  2008-01-22 17:42       ` Bryan Henderson
  2 siblings, 1 reply; 29+ messages in thread
From: Andreas Dilger @ 2008-01-22  7:05 UTC (permalink / raw)
  To: David Chinner
  Cc: Valerie Henson, linux-fsdevel, linux-ext4, linux-kernel,
	Theodore Ts'o, Andreas Dilger, Ric Wheeler

On Jan 22, 2008  14:38 +1100, David Chinner wrote:
> On Mon, Jan 21, 2008 at 04:00:41PM -0700, Andreas Dilger wrote:
> > I discussed this with Ted at one point also.  This is a generic problem,
> > not just for readahead, because "fsck" can run multiple e2fsck in parallel
> > and in case of many large filesystems on a single node this can cause
> > memory usage problems also.
> > 
> > What I was proposing is that "fsck.{fstype}" be modified to return an
> > estimated minimum amount of memory needed, and some "desired" amount of
> > memory (i.e. readahead) to fsck the filesystem, using some parameter like
> > "fsck.{fstype} --report-memory-needed /dev/XXX".  If this does not
> > return the output in the expected format, or returns an error then fsck
> > will assume some amount of memory based on the device size and continue
> > as it does today.
> 
> And while fsck is running, some other program runs that uses
> memory and blows your carefully calculated paramters to smithereens?

Well, fsck has a rather restricted working environment, because it is
run before most other processes start (i.e. single-user mode).  For fsck
initiated by an admin in other runlevels the admin would need to specify
the upper limit of memory usage.  My proposal was only for the single-user
fsck at boot time.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-22  7:05       ` Andreas Dilger
@ 2008-01-22  8:16         ` David Chinner
  0 siblings, 0 replies; 29+ messages in thread
From: David Chinner @ 2008-01-22  8:16 UTC (permalink / raw)
  To: David Chinner, Valerie Henson, linux-fsdevel, linux-ext4,
	linux-kernel, Theodore Ts'o, Andreas Dilger, Ric Wheeler

On Tue, Jan 22, 2008 at 12:05:11AM -0700, Andreas Dilger wrote:
> On Jan 22, 2008  14:38 +1100, David Chinner wrote:
> > On Mon, Jan 21, 2008 at 04:00:41PM -0700, Andreas Dilger wrote:
> > > I discussed this with Ted at one point also.  This is a generic problem,
> > > not just for readahead, because "fsck" can run multiple e2fsck in parallel
> > > and in case of many large filesystems on a single node this can cause
> > > memory usage problems also.
> > > 
> > > What I was proposing is that "fsck.{fstype}" be modified to return an
> > > estimated minimum amount of memory needed, and some "desired" amount of
> > > memory (i.e. readahead) to fsck the filesystem, using some parameter like
> > > "fsck.{fstype} --report-memory-needed /dev/XXX".  If this does not
> > > return the output in the expected format, or returns an error then fsck
> > > will assume some amount of memory based on the device size and continue
> > > as it does today.
> > 
> > And while fsck is running, some other program runs that uses
> > memory and blows your carefully calculated paramters to smithereens?
> 
> Well, fsck has a rather restricted working environment, because it is
> run before most other processes start (i.e. single-user mode).  For fsck
> initiated by an admin in other runlevels the admin would need to specify
> the upper limit of memory usage.  My proposal was only for the single-user
> fsck at boot time.

The simple case. ;)

Because XFS has shutdown features, it's not uncommon to hear about
people running xfs_repair on an otherwise live system. e.g. XFS
detects a corrupted block, shuts down the filesystem, the admin
unmounts it, runs xfs_repair, puts it back online. meanwhile, all
the other filesystems and users continue unaffected. In this use
case, getting feedback about memory usage is, IMO, very worthwhile.

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-22  7:00         ` Andreas Dilger
@ 2008-01-22 13:05           ` Alan Cox
  2008-01-22 14:40           ` Theodore Tso
  1 sibling, 0 replies; 29+ messages in thread
From: Alan Cox @ 2008-01-22 13:05 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Valdis.Kletnieks, David Chinner, Valerie Henson, linux-fsdevel,
	linux-ext4, linux-kernel, Theodore Ts'o, Andreas Dilger,
	Ric Wheeler

> I'd tried to advocate SIGDANGER some years ago as well, but none of
> the kernel maintainers were interested.  It definitely makes sense
> to have some sort of mechanism like this.  At the time I first brought
> it up it was in conjunction with Netscape using too much cache on some
> system, but it would be just as useful for all kinds of other memory-
> hungry applications.

There is an early thread for a /proc file which you can add to your
poll() set and it will wake people when memory is low. Very elegant and
if async support is added it will also give you the signal variant for
free.

Alan

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-22  7:00         ` Andreas Dilger
  2008-01-22 13:05           ` Alan Cox
@ 2008-01-22 14:40           ` Theodore Tso
  2008-01-22 14:57             ` Arnaldo Carvalho de Melo
  2008-01-28 19:30             ` Pavel Machek
  1 sibling, 2 replies; 29+ messages in thread
From: Theodore Tso @ 2008-01-22 14:40 UTC (permalink / raw)
  To: Valdis.Kletnieks, David Chinner, Valerie Henson, linux-fsdevel,
	linux-ext4, linux-kernel, Andreas Dilger, Ric Wheeler

On Tue, Jan 22, 2008 at 12:00:50AM -0700, Andreas Dilger wrote:
> > AIX had SIGDANGER some 15 years ago.  Admittedly, that was sent when
> > the system was about to hit OOM, not when it was about to start swapping.
> 
> I'd tried to advocate SIGDANGER some years ago as well, but none of
> the kernel maintainers were interested.  It definitely makes sense
> to have some sort of mechanism like this.  At the time I first brought
> it up it was in conjunction with Netscape using too much cache on some
> system, but it would be just as useful for all kinds of other memory-
> hungry applications.

It's been discussed before, but I suspect the main reason why it was
never done is no one submitted a patch.  Also, the problem is actually
a pretty complex one.  There are a couple of different stages where
you might want to send an alert to processes:

    * Data is starting to get ejected from page/buffer cache
    * System is starting to swap
    * System is starting to really struggle to find memory
    * System is starting an out-of-memory killer

AIX's SIGDANGER really did the last two, where the OOM killer would
tend to avoid processes that had a SIGDANGER handler in favor of
processes that were SIGDANGER unaware.

Then there is the additional complexity in Linux that you have
multiple zones of memory, which at least on the historically more
popular x86 was highly, highly important.  You could say that whenever
there is sufficient memory pressure in any zone that you start
ejecting data from caches or start to swap that you start sending the
signals --- but on x86 systems with lowmem, that could happen quite
frequently, and since a user process has no idea whether its resources
are in lowmem or highmem, there's not much you can do about this.

Hopefully this is less of an issue today, since the 2.6 VM is much
more better behaved, and people are gradually moving over to x86_64
anyway.  (Sorry SGI and Intel, unfortunately they're not moving over
to the Itanic :-).   So maybe this would be better received now.

Bringing us back to the main topic at hand, one of the tradeoffs in
Val's current approach is that by relying on the kernel's buffer
cache, we don't have to worry about locking and coherency at the
userspace level.  OTOH, we give up low-level control about when memory
gets thrown out, and it also means that simply getting notified when
the system starts to swap isn't good enough.  We need to know much
earlier, when the system starts ejecting data from the buffer and page
caches.

Does this matter?  Well, there are a couple of use cases:

     * The restricted boot environment
     * The background "once a month" take a snapshot and check
     * The oh-my-gosh we-lost-a-filesystem -- repair it while the 
       IMAP server is still on-line serving data from the other 
       mounted filesystems.

It's the last case where things get really tricky....

     	      	   	 	    - Ted

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-22 14:40           ` Theodore Tso
@ 2008-01-22 14:57             ` Arnaldo Carvalho de Melo
  2008-01-28 19:30             ` Pavel Machek
  1 sibling, 0 replies; 29+ messages in thread
From: Arnaldo Carvalho de Melo @ 2008-01-22 14:57 UTC (permalink / raw)
  To: Theodore Tso, Valdis.Kletnieks, David Chinner, Valerie Henson,
	linux-fsdevel, linux-ext4, linux-kernel, Andreas Dilger,
	Ric Wheeler, marcelo

Em Tue, Jan 22, 2008 at 09:40:52AM -0500, Theodore Tso escreveu:
> On Tue, Jan 22, 2008 at 12:00:50AM -0700, Andreas Dilger wrote:
> > > AIX had SIGDANGER some 15 years ago.  Admittedly, that was sent when
> > > the system was about to hit OOM, not when it was about to start swapping.
> > 
> > I'd tried to advocate SIGDANGER some years ago as well, but none of
> > the kernel maintainers were interested.  It definitely makes sense
> > to have some sort of mechanism like this.  At the time I first brought
> > it up it was in conjunction with Netscape using too much cache on some
> > system, but it would be just as useful for all kinds of other memory-
> > hungry applications.
> 
> It's been discussed before, but I suspect the main reason why it was
> never done is no one submitted a patch.  Also, the problem is actually
> a pretty complex one.  There are a couple of different stages where
> you might want to send an alert to processes:

Isn't Marcelo, Riel and some other people working on memory
notifications?

- Arnaldo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-22  3:38     ` David Chinner
  2008-01-22  4:17       ` Valdis.Kletnieks
  2008-01-22  7:05       ` Andreas Dilger
@ 2008-01-22 17:42       ` Bryan Henderson
  2 siblings, 0 replies; 29+ messages in thread
From: Bryan Henderson @ 2008-01-22 17:42 UTC (permalink / raw)
  To: David Chinner
  Cc: Andreas Dilger, linux-ext4, linux-fsdevel, linux-kernel,
	Ric Wheeler, Theodore Ts'o, Valerie Henson

>I think there is a clear need for applications to be able to
>register a callback from the kernel to indicate that the machine as
>a whole is running out of memory and that the application should
>trim it's caches to reduce memory utilisation.
>
>Perhaps instead of swapping immediately, a SIGLOWMEM could be sent ...

The problem with that approach is that the Fsck process doesn't know how 
its need for memory compares with other process' need for memory.  How 
much memory should it give up?  Maybe it should just quit altogether if 
other processes are in danger of deadlocking.  Or maybe it's best for it 
to keep all its memory and let some other frivolous process give up its 
memory instead.

It's the OS's job to have a view of the entire system and make resource 
allocation decisions.

If it's just a matter of the application choosing a better page frame to 
vacate than what the kernel would have taken, (which is more a matter of 
self-interest than resource allocation), then Fsck can do that more 
directly by just monitoring its own page fault rate.  If it's high, then 
it's using more real memory than the kernel thinks it's entitled to and it 
can reduce its memory footprint to improve its speed.  It can even check 
whether an access to readahead data caused a page fault; if so, it knows 
reading ahead is actually making things worse and therefore reduce 
readahead until the page faults stop happening.

--
Bryan Henderson                     IBM Almaden Research Center
San Jose CA                         Filesystems


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-22 14:40           ` Theodore Tso
  2008-01-22 14:57             ` Arnaldo Carvalho de Melo
@ 2008-01-28 19:30             ` Pavel Machek
  2008-01-28 19:56               ` Theodore Tso
  1 sibling, 1 reply; 29+ messages in thread
From: Pavel Machek @ 2008-01-28 19:30 UTC (permalink / raw)
  To: Theodore Tso, Valdis.Kletnieks, David Chinner, Valerie Henson,
	linux-fsdevel, linux-ext4, linux-kernel, Andreas Dilger,
	Ric Wheeler

Hi!

> It's been discussed before, but I suspect the main reason why it was
> never done is no one submitted a patch.  Also, the problem is actually
> a pretty complex one.  There are a couple of different stages where
> you might want to send an alert to processes:
> 
>     * Data is starting to get ejected from page/buffer cache
>     * System is starting to swap
>     * System is starting to really struggle to find memory
>     * System is starting an out-of-memory killer
> 
> AIX's SIGDANGER really did the last two, where the OOM killer would
> tend to avoid processes that had a SIGDANGER handler in favor of
> processes that were SIGDANGER unaware.
> 
> Then there is the additional complexity in Linux that you have
> multiple zones of memory, which at least on the historically more
> popular x86 was highly, highly important.  You could say that whenever
> there is sufficient memory pressure in any zone that you start
> ejecting data from caches or start to swap that you start sending the
> signals --- but on x86 systems with lowmem, that could happen quite
> frequently, and since a user process has no idea whether its resources
> are in lowmem or highmem, there's not much you can do about this.

As user pages are always in highmem, this should be easy to decide:
only send SIGDANGER when highmem is full. (Yes, there are
inodes/dentries/file descriptors in lowmem, but I doubt apps will
respond to SIGDANGER by closing files).
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-28 19:30             ` Pavel Machek
@ 2008-01-28 19:56               ` Theodore Tso
  2008-01-28 20:01                 ` Pavel Machek
  2008-01-29  8:29                 ` david
  0 siblings, 2 replies; 29+ messages in thread
From: Theodore Tso @ 2008-01-28 19:56 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Valdis.Kletnieks, David Chinner, Valerie Henson, linux-fsdevel,
	linux-ext4, linux-kernel, Andreas Dilger, Ric Wheeler

On Mon, Jan 28, 2008 at 07:30:05PM +0000, Pavel Machek wrote:
> 
> As user pages are always in highmem, this should be easy to decide:
> only send SIGDANGER when highmem is full. (Yes, there are
> inodes/dentries/file descriptors in lowmem, but I doubt apps will
> respond to SIGDANGER by closing files).

Good point; for a system with at least (say) 2GB of memory, that
definitely makes sense.  For a system with less than 768 megs of
memory (how quaint, but it wasn't that long ago this was a lot of
memory :-), there wouldn't *be* any memory in highmem at all....

	       		     	 	- Ted

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-28 19:56               ` Theodore Tso
@ 2008-01-28 20:01                 ` Pavel Machek
  2008-02-03 13:51                   ` KOSAKI Motohiro
  2008-01-29  8:29                 ` david
  1 sibling, 1 reply; 29+ messages in thread
From: Pavel Machek @ 2008-01-28 20:01 UTC (permalink / raw)
  To: Theodore Tso, Valdis.Kletnieks, David Chinner, Valerie Henson,
	linux-fsdevel, linux-ext4, linux-kernel, Andreas Dilger,
	Ric Wheeler

On Mon 2008-01-28 14:56:33, Theodore Tso wrote:
> On Mon, Jan 28, 2008 at 07:30:05PM +0000, Pavel Machek wrote:
> > 
> > As user pages are always in highmem, this should be easy to decide:
> > only send SIGDANGER when highmem is full. (Yes, there are
> > inodes/dentries/file descriptors in lowmem, but I doubt apps will
> > respond to SIGDANGER by closing files).
> 
> Good point; for a system with at least (say) 2GB of memory, that
> definitely makes sense.  For a system with less than 768 megs of
> memory (how quaint, but it wasn't that long ago this was a lot of
> memory :-), there wouldn't *be* any memory in highmem at all....

Ok, so it is 'send SIGDANGER when all zones are low', because user
allocations can go from all zones (unless you have something really
exotic, I'm not sure if that is true on huge NUMA  machines & similar).

							Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-28 19:56               ` Theodore Tso
  2008-01-28 20:01                 ` Pavel Machek
@ 2008-01-29  8:29                 ` david
  1 sibling, 0 replies; 29+ messages in thread
From: david @ 2008-01-29  8:29 UTC (permalink / raw)
  To: Theodore Tso
  Cc: Pavel Machek, Valdis.Kletnieks, David Chinner, Valerie Henson,
	linux-fsdevel, linux-ext4, linux-kernel, Andreas Dilger,
	Ric Wheeler

On Mon, 28 Jan 2008, Theodore Tso wrote:

> On Mon, Jan 28, 2008 at 07:30:05PM +0000, Pavel Machek wrote:
>>
>> As user pages are always in highmem, this should be easy to decide:
>> only send SIGDANGER when highmem is full. (Yes, there are
>> inodes/dentries/file descriptors in lowmem, but I doubt apps will
>> respond to SIGDANGER by closing files).
>
> Good point; for a system with at least (say) 2GB of memory, that
> definitely makes sense.  For a system with less than 768 megs of
> memory (how quaint, but it wasn't that long ago this was a lot of
> memory :-), there wouldn't *be* any memory in highmem at all....

not to mention machines with 1G of ram (900M lowmem, 128M highmem)

David Lang

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-28 20:01                 ` Pavel Machek
@ 2008-02-03 13:51                   ` KOSAKI Motohiro
  0 siblings, 0 replies; 29+ messages in thread
From: KOSAKI Motohiro @ 2008-02-03 13:51 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Theodore Tso, Valdis.Kletnieks, David Chinner, Valerie Henson,
	linux-fsdevel, linux-ext4, linux-kernel, Andreas Dilger,
	Ric Wheeler, kosaki.motohiro

Hi Pavel

> > > As user pages are always in highmem, this should be easy to decide:
> > > only send SIGDANGER when highmem is full. (Yes, there are
> > > inodes/dentries/file descriptors in lowmem, but I doubt apps will
> > > respond to SIGDANGER by closing files).
> >
> > Good point; for a system with at least (say) 2GB of memory, that
> > definitely makes sense.  For a system with less than 768 megs of
> > memory (how quaint, but it wasn't that long ago this was a lot of
> > memory :-), there wouldn't *be* any memory in highmem at all....
>
> Ok, so it is 'send SIGDANGER when all zones are low', because user
> allocations can go from all zones (unless you have something really
> exotic, I'm not sure if that is true on huge NUMA  machines & similar).

thank you good point out.

to be honest, the zone awareness of current mem_notify is premature.
I think we need enhancement rss statistics to per zone rss.
but not implemented yet ;-)

and, unfortunately I have no highmem machine.
the mem_notify is not so tested on highmem machine.

if you help to test, I am very happy!
Thanks.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-26  1:55 ` Bryan Henderson
@ 2008-01-26 13:21   ` Theodore Tso
  0 siblings, 0 replies; 29+ messages in thread
From: Theodore Tso @ 2008-01-26 13:21 UTC (permalink / raw)
  To: Bryan Henderson
  Cc: Bodo Eggert, Andreas Dilger, Andreas Dilger, Alan Cox,
	Adrian Bunk, David Chinner, linux-ext4, linux-fsdevel,
	linux-kernel, Ric Wheeler, Valerie Henson, Valdis.Kletnieks

On Fri, Jan 25, 2008 at 05:55:51PM -0800, Bryan Henderson wrote:
> I was surprised to see AIX do late allocation by default, because IBM's 
> traditional style is bulletproof systems.  A system where a process can be 
> killed at unpredictable times because of resource demands of unrelated 
> processes doesn't really fit that style.
> 
> It's really a fairly unusual application that benefits from late 
> allocation: one that creates a lot more virtual memory than it ever 
> touches.  For example, a sparse array.  Or am I missing something?

I guess it depends on how far you try to do "bulletproof".  OSF/1 used
to use "bulletproof" as its default --- and I had to turn it off on
tsx-11.mit.edu (the first North American ftp server for Linux :-),
because the difference was something like 50 ftp daemons versus over
500 on the same server.  It reserved VM space for the text segement of
every single process, since at least in theory, it's possible for
every single text page to get modified using ptrace if (for example) a
debugger were to set a break point on every single page of every
single text segement of every single ftp daemon.

You can also see potential problems for Java programs.  Suppose you
had some gigantic Java Application (say, Lotus Notes, or Websphere
Application Server) which is taking up many, many, MANY gigabytes of
VM space.  Now suppose the Java application needs to fork and exec
some trivial helper program.  For that tiny instant, between the fork
and exec, the VM requirements in "bulletproof" mode would double,
since while 99.9999% of the time programs will immediately discard the
VM upon the exec, there is always the possibility that the child
process will touch every single data page, forcing a copy on write,
and never do the exec.

There are of course different levels of "bulletproof" between the
extremes of "totally bulletproof" and "late binding" from an
algorithmic standpoint.  For example, you could ignore the needed
pages caused by ptrace(); more challenging would be to how to handle
the fork/exec semantics, although there could be kludges such as
strongly encouraging applications to use an old-fashed BSD-style
vfork() to guarantee that the child couldn't double VM requirements
between the vfork() and exec().  I certainly can't say for sure what
the AIX designers had in mind, and why they didn't choose one of the
more intermediate design choices.  

However, it is fair to say that "100% bulletproof" can require
reserving far more VM resources than you might first expect.  Even a
company which is highly incented to sell large amounts of hardware,
such as Digital, might not have wanted their OS to be only able to
support an embarassingly small number of simultaneous ftpd
connections.  I know this for sure because the OSF/1 documentation,
when discussing their VM tuning knobs, specifically talked about the
scenario that I ran into with tsx-11.mit.edu.

Regards,

						- Ted

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-24 23:40                 ` Theodore Tso
  2008-01-25  0:25                   ` Zan Lynx
@ 2008-01-26 12:32                   ` KOSAKI Motohiro
  1 sibling, 0 replies; 29+ messages in thread
From: KOSAKI Motohiro @ 2008-01-26 12:32 UTC (permalink / raw)
  To: Theodore Tso, Adrian Bunk, Bodo Eggert, Alan Cox, Andreas Dilger,
	Valdis.Kletnieks, David Chinner, Valerie Henson, linux-fsdevel,
	linux-ext4, linux-kernel, Andreas Dilger, Ric Wheeler

> > And from a performance point of view letting applications voluntarily
> > free some memory is better even than starting to swap.
>
> Absolutely.

the mem_notify patch can realize "just before starting swapping" notification :)

to be honest, I don't know fs guys requirement.
if lacking feature of fs guys needed, I implement it with presure if
you tell me it.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-26  0:55                       ` Zan Lynx
@ 2008-01-26 11:56                         ` KOSAKI Motohiro
  0 siblings, 0 replies; 29+ messages in thread
From: KOSAKI Motohiro @ 2008-01-26 11:56 UTC (permalink / raw)
  To: Zan Lynx, Andreas Dilger
  Cc: Theodore Tso, Adrian Bunk, Bodo Eggert, Alan Cox,
	Valdis.Kletnieks, David Chinner, Valerie Henson, linux-fsdevel,
	linux-ext4, linux-kernel, Ric Wheeler

> The commentary on the mem_notify threads claimed that the signal is
> easily provided by setting up the file handle for SIGIO.

BTW:
Of cource, you can receive any signal instead SIGIO by use fcntl(F_SETSIG)  :-)

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
       [not found] <alpine.LSU.0.999.0801252338460.26260@be1.lrz>
@ 2008-01-26  1:55 ` Bryan Henderson
  2008-01-26 13:21   ` Theodore Tso
  0 siblings, 1 reply; 29+ messages in thread
From: Bryan Henderson @ 2008-01-26  1:55 UTC (permalink / raw)
  To: Bodo Eggert
  Cc: Bodo Eggert, Andreas Dilger, Andreas Dilger, Alan Cox,
	Adrian Bunk, David Chinner, linux-ext4, linux-fsdevel,
	linux-kernel, Ric Wheeler, Theodore Tso, Valerie Henson,
	Valdis.Kletnieks

>> Incidentally, some context for the AIX approach to the OOM problem: a 
>> process may exclude itself from OOM vulnerability altogether.  It 
places 
>> itself in "early allocation" mode, which means at the time it creates 
>> virtual memory, it reserves enough backing store for the worst case. 
The 
>> memory manager does not send such a process the SIGDANGER signal or 
>> terminate it when it runs out of paging space.  Before c. 2000, this 
was 
>> the only mode.  Now the default is late allocation mode, which is 
similar 
>> to Linux.
>
>This is an interesting approach. It feels like some programs might be 
>interested in choosing this mode instead of risking OOM. 

It's the way virtual memory always worked when it was first invented.  The 
system not only reserved space to back every page of virtual memory; it 
assigned the particular blocks for it.  Late allocation was a later 
innovation, and I believe its main goal was to make it possible to use the 
cheaper disk drives for paging instead of drums.  Late allocation gives 
you better locality on disk, so the seeking doesn't eat you alive (drums 
don't seek).  Even then, I assume (but am not sure) that the system at 
least reserved the space in an account somewhere so at pageout time there 
was guaranteed to be a place to which to page out.  Overcommitting page 
space to save on disk space was a later idea.

I was surprised to see AIX do late allocation by default, because IBM's 
traditional style is bulletproof systems.  A system where a process can be 
killed at unpredictable times because of resource demands of unrelated 
processes doesn't really fit that style.

It's really a fairly unusual application that benefits from late 
allocation: one that creates a lot more virtual memory than it ever 
touches.  For example, a sparse array.  Or am I missing something?

--
Bryan Henderson                     IBM Almaden Research Center
San Jose CA                         Filesystems


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-25 11:09                     ` Andreas Dilger
@ 2008-01-26  0:55                       ` Zan Lynx
  2008-01-26 11:56                         ` KOSAKI Motohiro
  0 siblings, 1 reply; 29+ messages in thread
From: Zan Lynx @ 2008-01-26  0:55 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Theodore Tso, Adrian Bunk, Bodo Eggert, Alan Cox,
	Valdis.Kletnieks, David Chinner, Valerie Henson, linux-fsdevel,
	linux-ext4, linux-kernel, Ric Wheeler

[-- Attachment #1: Type: text/plain, Size: 1145 bytes --]


On Fri, 2008-01-25 at 04:09 -0700, Andreas Dilger wrote:
> On Jan 24, 2008  17:25 -0700, Zan Lynx wrote:
> > Have y'all been following the /dev/mem_notify patches?
> > http://article.gmane.org/gmane.linux.kernel/628653
> 
> Having the notification be via poll() is a very restrictive processing
> model.  Having the notification be via a signal means that any kind of
> process (and not just those that are event loop driven) can register
> a callback at some arbitrary point in the code and be notified.  I
> don't object to the poll() interface, but it would be good to have a
> signal mechanism also.

The commentary on the mem_notify threads claimed that the signal is
easily provided by setting up the file handle for SIGIO.

Yeah.  Here it is...copied from email written by KOSAKI Motohiro:

implement FASYNC capability to /dev/mem_notify.

<usage example>
        fd = open("/dev/mem_notify", O_RDONLY);

        fcntl(fd, F_SETOWN, getpid());

        flags = fcntl(fd, F_GETFL);
        fcntl(fd, F_SETFL, flags|FASYNC);  /* when low memory, receive SIGIO */
</usage example>
-- 
Zan Lynx <zlynx@acm.org>

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-25  0:25                   ` Zan Lynx
@ 2008-01-25 11:09                     ` Andreas Dilger
  2008-01-26  0:55                       ` Zan Lynx
  0 siblings, 1 reply; 29+ messages in thread
From: Andreas Dilger @ 2008-01-25 11:09 UTC (permalink / raw)
  To: Zan Lynx
  Cc: Theodore Tso, Adrian Bunk, Bodo Eggert, Alan Cox,
	Valdis.Kletnieks, David Chinner, Valerie Henson, linux-fsdevel,
	linux-ext4, linux-kernel, Ric Wheeler

On Jan 24, 2008  17:25 -0700, Zan Lynx wrote:
> Have y'all been following the /dev/mem_notify patches?
> http://article.gmane.org/gmane.linux.kernel/628653

Having the notification be via poll() is a very restrictive processing
model.  Having the notification be via a signal means that any kind of
process (and not just those that are event loop driven) can register
a callback at some arbitrary point in the code and be notified.  I
don't object to the poll() interface, but it would be good to have a
signal mechanism also.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-24 23:40                 ` Theodore Tso
@ 2008-01-25  0:25                   ` Zan Lynx
  2008-01-25 11:09                     ` Andreas Dilger
  2008-01-26 12:32                   ` KOSAKI Motohiro
  1 sibling, 1 reply; 29+ messages in thread
From: Zan Lynx @ 2008-01-25  0:25 UTC (permalink / raw)
  To: Theodore Tso
  Cc: Adrian Bunk, Bodo Eggert, Alan Cox, Andreas Dilger,
	Valdis.Kletnieks, David Chinner, Valerie Henson, linux-fsdevel,
	linux-ext4, linux-kernel, Andreas Dilger, Ric Wheeler

[-- Attachment #1: Type: text/plain, Size: 1079 bytes --]


On Thu, 2008-01-24 at 18:40 -0500, Theodore Tso wrote:
> On Fri, Jan 25, 2008 at 01:08:09AM +0200, Adrian Bunk wrote:
> > In practice, there is a small number of programs that are both the
> > common memory hogs and should be able to reduce their memory consumption
> > by 10% or 20% without big problems when requested (e.g. Java VMs,
> > Firefox and databases come into my mind).
> 
> I agree, it's only a few processes where this makes sense.  But for
> those that do, it would be useful if they could register with the
> kernel that would like to know, (just before the system starts
> ejecting cached data, just before swapping, etc.) and at what
> frequency.  And presumably, if the kernel notices that a process is
> responding to such requests with memory actually getting released back
> to the system, that process could get "rewarded" by having the OOM
> killer less likely to target that particular thread.

Have y'all been following the /dev/mem_notify patches?
http://article.gmane.org/gmane.linux.kernel/628653

-- 
Zan Lynx <zlynx@acm.org>

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-24 23:08               ` Adrian Bunk
@ 2008-01-24 23:40                 ` Theodore Tso
  2008-01-25  0:25                   ` Zan Lynx
  2008-01-26 12:32                   ` KOSAKI Motohiro
  0 siblings, 2 replies; 29+ messages in thread
From: Theodore Tso @ 2008-01-24 23:40 UTC (permalink / raw)
  To: Adrian Bunk
  Cc: Bodo Eggert, Alan Cox, Andreas Dilger, Valdis.Kletnieks,
	David Chinner, Valerie Henson, linux-fsdevel, linux-ext4,
	linux-kernel, Andreas Dilger, Ric Wheeler

On Fri, Jan 25, 2008 at 01:08:09AM +0200, Adrian Bunk wrote:
> In practice, there is a small number of programs that are both the
> common memory hogs and should be able to reduce their memory consumption
> by 10% or 20% without big problems when requested (e.g. Java VMs,
> Firefox and databases come into my mind).

I agree, it's only a few processes where this makes sense.  But for
those that do, it would be useful if they could register with the
kernel that would like to know, (just before the system starts
ejecting cached data, just before swapping, etc.) and at what
frequency.  And presumably, if the kernel notices that a process is
responding to such requests with memory actually getting released back
to the system, that process could get "rewarded" by having the OOM
killer less likely to target that particular thread.

AIX basically did this with SIGDANGER (the signal is ignored by
default), except there wasn't the ability for the process to tell the
kernel at what level of memory pressure before it should start getting
notified, and there was no way for the kernel to tell how bad the
memory pressure actually was.  On the other hand, it was a relatively
simple design.

In practice very few processes would indeed pay attention to
SIGDANGER, so I think you're quite right there.

> And from a performance point of view letting applications voluntarily 
> free some memory is better even than starting to swap.

Absolutely.

						- Ted

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-24 17:32             ` Bodo Eggert
  2008-01-24 22:07               ` Andreas Dilger
@ 2008-01-24 23:08               ` Adrian Bunk
  2008-01-24 23:40                 ` Theodore Tso
  1 sibling, 1 reply; 29+ messages in thread
From: Adrian Bunk @ 2008-01-24 23:08 UTC (permalink / raw)
  To: Bodo Eggert
  Cc: Alan Cox, Andreas Dilger, Valdis.Kletnieks, David Chinner,
	Valerie Henson, linux-fsdevel, linux-ext4, linux-kernel,
	Theodore Ts'o, Andreas Dilger, Ric Wheeler

On Thu, Jan 24, 2008 at 06:32:15PM +0100, Bodo Eggert wrote:
> Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
> 
> >> I'd tried to advocate SIGDANGER some years ago as well, but none of
> >> the kernel maintainers were interested.  It definitely makes sense
> >> to have some sort of mechanism like this.  At the time I first brought
> >> it up it was in conjunction with Netscape using too much cache on some
> >> system, but it would be just as useful for all kinds of other memory-
> >> hungry applications.
> > 
> > There is an early thread for a /proc file which you can add to your
> > poll() set and it will wake people when memory is low. Very elegant and
> > if async support is added it will also give you the signal variant for
> > free.
> 
> IMO you'll need a userspace daemon. The kernel does only know about the
> amount of memory available / recommended for a system (or container),
> while the user knows which program's cache is most precious today.
> 
> (Off cause the userspace daemon will in turn need the /proc file.)
> 
> I think a single, system-wide signal is the second-to worst solution: All
> applications (or the wrong one, if you select one) would free their caches
> and start to crawl, and either stay in this state or slowly increase their
> caches again until they get signaled again. And the signal would either
> come too early or too late. The userspace daemon could collect the weighted
> demand of memory from all applications and tell them how much to use.

I don't think that's something that would require finetuning on a
per-application basis - the kernel should tell all applications once to
reduce memory consumption and write a fat warning to the logs (which
will on well-maintained systems be mailed to the admin).

Your "and tell them how much to use" wouldn't work for most applications 
- e.g. I've worked the last weeks with a computer with 512 MB RAM and no 
Swap, which means usually only 200 MB of free RAM. I've gotten quite 
used to git aborting with "fatal: Out of memory, malloc failed" when 
200 MB weren't enough for git, and I don't think there is any reasonable 
way for git to reduce the memory usage while continuing to run.

In practice, there is a small number of programs that are both the
common memory hogs and should be able to reduce their memory consumption
by 10% or 20% without big problems when requested (e.g. Java VMs,
Firefox and databases come into my mind).

And from a performance point of view letting applications voluntarily 
free some memory is better even than starting to swap.

cu
Adrian

-- 

       "Is there not promise of rain?" Ling Tan asked suddenly out
        of the darkness. There had been need of rain for many days.
       "Only a promise," Lao Er said.
                                       Pearl S. Buck - Dragon Seed


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
  2008-01-24 17:32             ` Bodo Eggert
@ 2008-01-24 22:07               ` Andreas Dilger
  2008-01-24 23:08               ` Adrian Bunk
  1 sibling, 0 replies; 29+ messages in thread
From: Andreas Dilger @ 2008-01-24 22:07 UTC (permalink / raw)
  To: Bodo Eggert
  Cc: Alan Cox, Valdis.Kletnieks, David Chinner, Valerie Henson,
	linux-fsdevel, linux-ext4, linux-kernel, Theodore Ts'o,
	Andreas Dilger, Ric Wheeler

On Jan 24, 2008  18:32 +0100, Bodo Eggert wrote:
> I think a single, system-wide signal is the second-to worst solution: All
> applications (or the wrong one, if you select one) would free their caches
> and start to crawl, and either stay in this state or slowly increase their
> caches again until they get signaled again. And the signal would either
> come too early or too late. The userspace daemon could collect the weighted
> demand of memory from all applications and tell them how much to use.

Well, sending a few signals (maybe to the top 5 processes in the OOM killer
list) is still a LOT better than OOM-killing them without warning...  That
way important system processes could be taught to understand SIGDANGER and
maybe do something about it instead of being killed, and if Firefox and
other memory hungry processes flush some of their cache it is not fatal.

I wouldn't think that SIGDANGER means "free all of your cache", since the
memory usage clearly wasn't a problem a few seconds previously, so as
an application writer I'd code it as "flush the oldest 10% of my cache"
or similar, and the kernel could send SIGDANGER again (or kill the real
offender) if the memory usage again becomes an issue.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC] Parallelize IO for e2fsck
       [not found]           ` <9Orda-3ub-45@gated-at.bofh.it>
@ 2008-01-24 17:32             ` Bodo Eggert
  2008-01-24 22:07               ` Andreas Dilger
  2008-01-24 23:08               ` Adrian Bunk
  0 siblings, 2 replies; 29+ messages in thread
From: Bodo Eggert @ 2008-01-24 17:32 UTC (permalink / raw)
  To: Alan Cox, Andreas Dilger, Valdis.Kletnieks, David Chinner,
	Valerie Henson, linux-fsdevel, linux-ext4, linux-kernel,
	Theodore Ts'o, Andreas Dilger, Ric Wheeler

Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:

>> I'd tried to advocate SIGDANGER some years ago as well, but none of
>> the kernel maintainers were interested.  It definitely makes sense
>> to have some sort of mechanism like this.  At the time I first brought
>> it up it was in conjunction with Netscape using too much cache on some
>> system, but it would be just as useful for all kinds of other memory-
>> hungry applications.
> 
> There is an early thread for a /proc file which you can add to your
> poll() set and it will wake people when memory is low. Very elegant and
> if async support is added it will also give you the signal variant for
> free.

IMO you'll need a userspace daemon. The kernel does only know about the
amount of memory available / recommended for a system (or container),
while the user knows which program's cache is most precious today.

(Off cause the userspace daemon will in turn need the /proc file.)

I think a single, system-wide signal is the second-to worst solution: All
applications (or the wrong one, if you select one) would free their caches
and start to crawl, and either stay in this state or slowly increase their
caches again until they get signaled again. And the signal would either
come too early or too late. The userspace daemon could collect the weighted
demand of memory from all applications and tell them how much to use.


^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2008-02-03 13:51 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <70b6f0bf0801161322k2740a8dch6a0d6e6e112cd2d0@mail.gmail.com>
2008-01-16 21:30 ` [RFC] Parallelize IO for e2fsck Valerie Henson
2008-01-18  1:15   ` David Chinner
2008-01-18  1:43     ` Valerie Henson
2008-01-21 23:00   ` Andreas Dilger
2008-01-22  3:38     ` David Chinner
2008-01-22  4:17       ` Valdis.Kletnieks
2008-01-22  7:00         ` Andreas Dilger
2008-01-22 13:05           ` Alan Cox
2008-01-22 14:40           ` Theodore Tso
2008-01-22 14:57             ` Arnaldo Carvalho de Melo
2008-01-28 19:30             ` Pavel Machek
2008-01-28 19:56               ` Theodore Tso
2008-01-28 20:01                 ` Pavel Machek
2008-02-03 13:51                   ` KOSAKI Motohiro
2008-01-29  8:29                 ` david
2008-01-22  7:05       ` Andreas Dilger
2008-01-22  8:16         ` David Chinner
2008-01-22 17:42       ` Bryan Henderson
     [not found] <9Mo9w-7Ws-25@gated-at.bofh.it>
     [not found] ` <9Mo9w-7Ws-23@gated-at.bofh.it>
     [not found]   ` <9OdWm-7uN-25@gated-at.bofh.it>
     [not found]     ` <9Oi9A-5EJ-3@gated-at.bofh.it>
     [not found]       ` <9OiMg-6IC-1@gated-at.bofh.it>
     [not found]         ` <9OlqL-2xG-3@gated-at.bofh.it>
     [not found]           ` <9Orda-3ub-45@gated-at.bofh.it>
2008-01-24 17:32             ` Bodo Eggert
2008-01-24 22:07               ` Andreas Dilger
2008-01-24 23:08               ` Adrian Bunk
2008-01-24 23:40                 ` Theodore Tso
2008-01-25  0:25                   ` Zan Lynx
2008-01-25 11:09                     ` Andreas Dilger
2008-01-26  0:55                       ` Zan Lynx
2008-01-26 11:56                         ` KOSAKI Motohiro
2008-01-26 12:32                   ` KOSAKI Motohiro
     [not found] <alpine.LSU.0.999.0801252338460.26260@be1.lrz>
2008-01-26  1:55 ` Bryan Henderson
2008-01-26 13:21   ` Theodore Tso

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).