LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Dan Williams <dan.j.williams@intel.com>
To: neilb@suse.de, linux-raid@vger.kernel.org
Cc: akpm@osdl.org, linux-kernel@vger.kernel.org, christopher.leech@intel.com
Subject: [PATCH 04/19] raid5: move compute block operations to a workqueue
Date: Mon, 11 Sep 2006 16:17:56 -0700	[thread overview]
Message-ID: <20060911231756.4737.91223.stgit@dwillia2-linux.ch.intel.com> (raw)
In-Reply-To: <1158015632.4241.31.camel@dwillia2-linux.ch.intel.com>

From: Dan Williams <dan.j.williams@intel.com>

Enable handle_stripe5 to pass off compute block operations to
raid5_do_soft_block_ops, formerly handled by compute_block.

Here are a few notes about the new flags R5_ComputeReq and
STRIPE_OP_COMPUTE_Recover_pd:

Previously, when handle_stripe5 found a block that needed to be computed it
updated it in the same step.  Now that these operations are separated
(across multiple calls to handle_stripe5), an R5_ComputeReq flag is needed
to tell other parts of handle_stripe5 to treat the block under computation
as if it were up to date.  The order of events in the work queue ensures
that the block is indeed up to date before performing further operations.

STRIPE_OP_COMPUTE_Recover_pd was added to track when the parity block is being
computed due to a failed parity check.  This allows the code in
handle_stripe5 that produces requests for check_parity and compute_block
operations to be separate from the code that consumes the result.

Changelog:
* count blocks under computation as uptodate
* removed handle_compute_operations5.  All logic moved into handle_stripe5
so that we do not need to go through the initiation logic to end the
operation.
* since the write operations mark blocks !uptodate we hold off the code to
compute/read blocks until they complete.
* new compute block operations and reads are held off while a compute is in
flight
* do not compute a block while a check parity operation is pending, and do
not start a new check parity operation while a compute operation is pending
* STRIPE_OP_COMPUTE_Recover_pd holds off the clearing of the STRIPE_OP_COMPUTE
state.  This allows the transition to be handled by the check parity logic
that writes recomputed parity to disk.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---

 drivers/md/raid5.c |  153 ++++++++++++++++++++++++++++++++++++----------------
 1 files changed, 107 insertions(+), 46 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 24ed4d8..0c39203 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1300,7 +1300,8 @@ static int handle_write_operations5(stru
 		}
 	} else {
 		/* enter stage 1 of read modify write operation */
-		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
+		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
+			test_bit(R5_ComputeReq, &sh->dev[pd_idx].flags)));
 
 		set_bit(STRIPE_OP_RMW, &sh->state);
 		set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state);
@@ -1314,7 +1315,8 @@ static int handle_write_operations5(stru
 			 * so we distinguish these blocks by the RMWReq bit
 			 */
 			if (dev->towrite &&
-			    test_bit(R5_UPTODATE, &dev->flags)) {
+			    (test_bit(R5_UPTODATE, &dev->flags) ||
+			    test_bit(R5_ComputeReq, &dev->flags))) {
 				set_bit(R5_RMWReq, &dev->flags);
 				set_bit(R5_LOCKED, &dev->flags);
 				clear_bit(R5_UPTODATE, &dev->flags);
@@ -1748,7 +1750,7 @@ static void handle_stripe5(struct stripe
 	int i;
 	int syncing, expanding, expanded;
 	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-	int non_overwrite=0, write_complete=0;
+	int compute=0, non_overwrite=0, write_complete=0;
 	int failed_num=0;
 	struct r5dev *dev;
 
@@ -1799,7 +1801,7 @@ static void handle_stripe5(struct stripe
 		/* now count some things */
 		if (test_bit(R5_LOCKED, &dev->flags)) locked++;
 		if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
-
+		if (test_bit(R5_ComputeReq, &dev->flags)) BUG_ON(++compute > 1);
 		
 		if (dev->toread) to_read++;
 		if (dev->towrite) {
@@ -1955,40 +1957,83 @@ static void handle_stripe5(struct stripe
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
 	 */
-	if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) {
-		for (i=disks; i--;) {
-			dev = &sh->dev[i];
-			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-			    (dev->toread ||
-			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
-			     syncing ||
-			     expanding ||
-			     (failed && (sh->dev[failed_num].toread ||
-					 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
-				    )
-				) {
-				/* we would like to get this block, possibly
-				 * by computing it, but we might not be able to
+	if (to_read || non_overwrite || (syncing && (uptodate + compute < disks)) || expanding ||
+		test_bit(STRIPE_OP_COMPUTE, &sh->state)) {
+		/* Finish any pending compute operations.  Parity recovery implies
+		 * a write-back which is handled later on in this routine
+		 */
+		if (test_bit(STRIPE_OP_COMPUTE, &sh->state) &&
+			test_bit(STRIPE_OP_COMPUTE_Done, &sh->ops.state) &&
+			!test_bit(STRIPE_OP_COMPUTE_Recover_pd, &sh->ops.state)) {
+			clear_bit(STRIPE_OP_COMPUTE, &sh->state);
+			clear_bit(STRIPE_OP_COMPUTE_Done, &sh->ops.state);
+		}
+		
+		/* blocks being written are temporarily !UPTODATE */
+		if (!test_bit(STRIPE_OP_COMPUTE, &sh->state) &&
+			!test_bit(STRIPE_OP_RCW, &sh->state) &&
+			!test_bit(STRIPE_OP_RMW, &sh->state)) {
+			for (i=disks; i--;) {
+				dev = &sh->dev[i];
+
+				/* don't schedule compute operations or reads on
+				 * the parity block while a check is in flight
 				 */
-				if (uptodate == disks-1) {
-					PRINTK("Computing block %d\n", i);
-					compute_block(sh, i);
-					uptodate++;
-				} else if (test_bit(R5_Insync, &dev->flags)) {
-					set_bit(R5_LOCKED, &dev->flags);
-					set_bit(R5_Wantread, &dev->flags);
+				if ((i == sh->pd_idx) && test_bit(STRIPE_OP_CHECK, &sh->state))
+					continue;
+
+				if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
+				     (dev->toread ||
+				     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
+				     syncing ||
+				     expanding ||
+				     (failed && (sh->dev[failed_num].toread ||
+						 (sh->dev[failed_num].towrite &&
+						 	!test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
+					    )
+					) {
+					/* 1/ We would like to get this block, possibly
+					 * by computing it, but we might not be able to.
+					 *
+					 * 2/ Since parity check operations make the parity
+					 * block !uptodate it will need to be refreshed
+					 * before any compute operations on data disks are
+					 * scheduled.
+					 *
+					 * 3/ We hold off parity block re-reads until check
+					 * operations have quiesced.
+					 */
+					if ((uptodate == disks-1) && !test_bit(STRIPE_OP_CHECK, &sh->state)) {
+						set_bit(STRIPE_OP_COMPUTE, &sh->state);
+						set_bit(STRIPE_OP_COMPUTE_Prep, &sh->ops.state);
+						set_bit(R5_ComputeReq, &dev->flags);
+						sh->ops.pending++;
+						/* Careful: from this point on 'uptodate' is in the eye of the
+						 * workqueue which services 'compute' operations before writes.
+						 * R5_ComputeReq flags blocks that will be R5_UPTODATE
+						 * in the work queue.
+						 */
+						uptodate++;
+					} else if ((uptodate < disks-1) && test_bit(R5_Insync, &dev->flags)) {
+						/* Note: we hold off compute operations while checks are in flight,
+						 * but we still prefer 'compute' over 'read' hence we only read if
+						 * (uptodate < disks-1)
+						 */
+						set_bit(R5_LOCKED, &dev->flags);
+						set_bit(R5_Wantread, &dev->flags);
 #if 0
-					/* if I am just reading this block and we don't have
-					   a failed drive, or any pending writes then sidestep the cache */
-					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
-					    ! syncing && !failed && !to_write) {
-						sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
-						sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
-					}
+						/* if I am just reading this block and we don't have
+						   a failed drive, or any pending writes then sidestep the cache */
+						if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
+						    ! syncing && !failed && !to_write) {
+							sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
+							sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
+						}
 #endif
-					locked++;
-					PRINTK("Reading block %d (sync=%d)\n", 
-						i, syncing);
+						locked++;
+						PRINTK("Reading block %d (sync=%d)\n", 
+							i, syncing);
+					}
 				}
 			}
 		}
@@ -2055,7 +2100,7 @@ #if 0
 || sh->bh_page[i]!=bh->b_page
 #endif
 				    ) &&
-			    !test_bit(R5_UPTODATE, &dev->flags)) {
+			    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_ComputeReq, &dev->flags))) {
 				if (test_bit(R5_Insync, &dev->flags)
 /*				    && !(!mddev->insync && i == sh->pd_idx) */
 					)
@@ -2069,7 +2114,7 @@ #if 0
 || sh->bh_page[i] != bh->b_page
 #endif
 				    ) &&
-			    !test_bit(R5_UPTODATE, &dev->flags)) {
+			    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_ComputeReq, &dev->flags))) {
 				if (test_bit(R5_Insync, &dev->flags)) rcw++;
 				else rcw += 2*disks;
 			}
@@ -2082,7 +2127,8 @@ #endif
 			for (i=disks; i--;) {
 				dev = &sh->dev[i];
 				if ((dev->towrite || i == sh->pd_idx) &&
-				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
+				    !test_bit(R5_LOCKED, &dev->flags) &&
+				    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_ComputeReq, &dev->flags)) &&
 				    test_bit(R5_Insync, &dev->flags)) {
 					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 					{
@@ -2101,7 +2147,8 @@ #endif
 			for (i=disks; i--;) {
 				dev = &sh->dev[i];
 				if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
-				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
+				    !test_bit(R5_LOCKED, &dev->flags) &&
+				    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_ComputeReq, &dev->flags)) &&
 				    test_bit(R5_Insync, &dev->flags)) {
 					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 					{
@@ -2127,16 +2174,19 @@ #endif
 	 * 2/ Hold off parity checks while parity dependent operations are in flight
 	 *    (RCW and RMW are protected by 'locked')
 	 */
-	if ((syncing && locked == 0 &&
-	    !test_bit(STRIPE_INSYNC, &sh->state)) ||
-	    	test_bit(STRIPE_OP_CHECK, &sh->state)) {
+	if ((syncing && locked == 0 && !test_bit(STRIPE_OP_COMPUTE, &sh->state) &&
+		!test_bit(STRIPE_INSYNC, &sh->state)) ||
+	    	test_bit(STRIPE_OP_CHECK, &sh->state) ||
+	    	test_bit(STRIPE_OP_COMPUTE_Recover_pd, &sh->ops.state)) {
 
 		set_bit(STRIPE_HANDLE, &sh->state);
 		/* Take one of the following actions:
 		 * 1/ start a check parity operation if (uptodate == disks)
 		 * 2/ finish a check parity operation and act on the result
+		 * 3/ skip to the writeback section if we previously 
+		 *    initiated a recovery operation
 		 */
-		if (failed == 0) {
+		if (failed == 0 && !test_bit(STRIPE_OP_COMPUTE_Recover_pd, &sh->ops.state)) {
 			if (!test_bit(STRIPE_OP_CHECK, &sh->state)) {
 				BUG_ON(uptodate != disks);
 				set_bit(STRIPE_OP_CHECK, &sh->state);
@@ -2157,18 +2207,29 @@ #endif
 						/* don't try to repair!! */
 						set_bit(STRIPE_INSYNC, &sh->state);
 					else {
-						compute_block(sh, sh->pd_idx);
+						set_bit(STRIPE_OP_COMPUTE, &sh->state);
+						set_bit(STRIPE_OP_COMPUTE_Recover_pd, &sh->ops.state);
+						set_bit(STRIPE_OP_COMPUTE_Prep, &sh->ops.state);
+						set_bit(R5_ComputeReq, &sh->dev[sh->pd_idx].flags);
+						sh->ops.pending++;
 						uptodate++;
 					}
 				}
 			}
 		}
+		if (test_bit(STRIPE_OP_COMPUTE_Done, &sh->ops.state) &&
+			test_bit(STRIPE_OP_COMPUTE_Recover_pd, &sh->ops.state)) {
+			clear_bit(STRIPE_OP_COMPUTE, &sh->state);
+			clear_bit(STRIPE_OP_COMPUTE_Done, &sh->ops.state);
+			clear_bit(STRIPE_OP_COMPUTE_Recover_pd, &sh->ops.state);
+		}
 
-		/* Wait for check parity operations to complete
+		/* Wait for check parity and compute block operations to complete
 		 * before write-back
 		 */
 		if (!test_bit(STRIPE_INSYNC, &sh->state) &&
-			!test_bit(STRIPE_OP_CHECK, &sh->state)) {
+			!test_bit(STRIPE_OP_CHECK, &sh->state) &&
+			!test_bit(STRIPE_OP_COMPUTE, &sh->state)) {
 
 			/* either failed parity check, or recovery is happening */
 			if (failed==0)

  parent reply	other threads:[~2006-09-11 23:18 UTC|newest]

Thread overview: 55+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-09-11 23:00 [PATCH 00/19] Hardware Accelerated MD RAID5: Introduction Dan Williams
2006-09-11 23:17 ` [PATCH 01/19] raid5: raid5_do_soft_block_ops Dan Williams
2006-09-11 23:34   ` Jeff Garzik
2006-09-11 23:17 ` [PATCH 02/19] raid5: move write operations to a workqueue Dan Williams
2006-09-11 23:36   ` Jeff Garzik
2006-09-11 23:17 ` [PATCH 03/19] raid5: move check parity " Dan Williams
2006-09-11 23:17 ` Dan Williams [this message]
2006-09-11 23:18 ` [PATCH 05/19] raid5: move read completion copies " Dan Williams
2006-09-11 23:18 ` [PATCH 06/19] raid5: move the reconstruct write expansion operation " Dan Williams
2006-09-11 23:18 ` [PATCH 07/19] raid5: remove compute_block and compute_parity5 Dan Williams
2006-09-11 23:18 ` [PATCH 08/19] dmaengine: enable multiple clients and operations Dan Williams
2006-09-11 23:44   ` Jeff Garzik
2006-09-12  0:14     ` Dan Williams
2006-09-12  0:52       ` Roland Dreier
2006-09-12  6:18         ` Dan Williams
2006-09-12  9:15           ` Evgeniy Polyakov
2006-09-13  4:04           ` Jeff Garzik
2006-09-15 16:38     ` Olof Johansson
2006-09-15 19:44       ` [PATCH] dmaengine: clean up and abstract function types (was Re: [PATCH 08/19] dmaengine: enable multiple clients and operations) Olof Johansson
2006-09-15 20:02         ` [PATCH] [v2] " Olof Johansson
2006-09-18 22:56         ` [PATCH] " Dan Williams
2006-09-19  1:05           ` Olof Johansson
2006-09-19 11:20             ` Alan Cox
2006-09-19 16:32               ` Olof Johansson
2006-09-11 23:18 ` [PATCH 09/19] dmaengine: reduce backend address permutations Dan Williams
2006-09-15 14:46   ` Olof Johansson
2006-09-11 23:18 ` [PATCH 10/19] dmaengine: expose per channel dma mapping characteristics to clients Dan Williams
2006-09-11 23:18 ` [PATCH 11/19] dmaengine: add memset as an asynchronous dma operation Dan Williams
2006-09-11 23:50   ` Jeff Garzik
2006-09-11 23:18 ` [PATCH 12/19] dmaengine: dma_async_memcpy_err for DMA engines that do not support memcpy Dan Williams
2006-09-11 23:51   ` Jeff Garzik
2006-09-11 23:18 ` [PATCH 13/19] dmaengine: add support for dma xor zero sum operations Dan Williams
2006-09-11 23:18 ` [PATCH 14/19] dmaengine: add dma_sync_wait Dan Williams
2006-09-11 23:52   ` Jeff Garzik
2006-09-11 23:18 ` [PATCH 15/19] dmaengine: raid5 dma client Dan Williams
2006-09-11 23:54   ` Jeff Garzik
2006-09-11 23:19 ` [PATCH 16/19] dmaengine: Driver for the Intel IOP 32x, 33x, and 13xx RAID engines Dan Williams
2006-09-15 14:57   ` Olof Johansson
2006-09-11 23:19 ` [PATCH 17/19] iop3xx: define IOP3XX_REG_ADDR[32|16|8] and clean up DMA/AAU defs Dan Williams
2006-09-11 23:55   ` Jeff Garzik
2006-09-11 23:19 ` [PATCH 18/19] iop3xx: Give Linux control over PCI (ATU) initialization Dan Williams
2006-09-11 23:56   ` Jeff Garzik
2006-09-11 23:19 ` [PATCH 19/19] iop3xx: IOP 32x and 33x support for the iop-adma driver Dan Williams
2006-09-11 23:38 ` [PATCH 00/19] Hardware Accelerated MD RAID5: Introduction Jeff Garzik
2006-09-11 23:53   ` Dan Williams
2006-09-12  2:41     ` Jeff Garzik
2006-09-12  5:47       ` Dan Williams
2006-09-13  4:05         ` Jeff Garzik
2006-09-13  7:15 ` Jakob Oestergaard
2006-09-13 19:17   ` Dan Williams
2006-09-14  7:42     ` Jakob Oestergaard
2006-10-11  1:46       ` Dan Williams
2006-10-08 22:18 ` Neil Brown
2006-10-10 18:23   ` Dan Williams
2006-10-11  2:44     ` Neil Brown

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060911231756.4737.91223.stgit@dwillia2-linux.ch.intel.com \
    --to=dan.j.williams@intel.com \
    --cc=akpm@osdl.org \
    --cc=christopher.leech@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-raid@vger.kernel.org \
    --cc=neilb@suse.de \
    --subject='Re: [PATCH 04/19] raid5: move compute block operations to a workqueue' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).