LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 1/2] md: dm-verity: aggregate crypto API calls
@ 2018-03-25 18:41 Yael Chemla
  2018-03-25 18:41 ` [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks Yael Chemla
  2018-03-26  6:31 ` [PATCH 1/2] md: dm-verity: aggregate crypto API calls Gilad Ben-Yossef
  0 siblings, 2 replies; 14+ messages in thread
From: Yael Chemla @ 2018-03-25 18:41 UTC (permalink / raw)
  To: Alasdair Kergon, Mike Snitzer, dm-devel, linux-kernel,
	ofir.drang, Yael Chemla
  Cc: Yael Chemla

 Current implementation makes multiple crypto API calls per single block,
 forcing underlying crypto tfm implementation to "stop & go",
 leading to under utilization of HW engines.
 To fix it unify calls to crypto init/update/final into a digest call
 with a single sg which contains multiple buffers.
 This is also needed as an enabler for the next patch in the series.

 
Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>
---
 drivers/md/dm-verity-target.c | 220 ++++++++++++++++++++++++------------------
 1 file changed, 127 insertions(+), 93 deletions(-)

diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index aedb822..a281b83 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -35,10 +35,18 @@
 
 #define DM_VERITY_OPTS_MAX		(2 + DM_VERITY_OPTS_FEC)
 
+/* only two elements in static scatter list: salt and data */
+#define SG_FIXED_ITEMS	2
+
 static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
 
 module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
 
+enum salt_location {
+	START_SG,
+	END_SG
+};
+
 struct dm_verity_prefetch_work {
 	struct work_struct work;
 	struct dm_verity *v;
@@ -92,82 +100,68 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
 	return block >> (level * v->hash_per_block_bits);
 }
 
-static int verity_hash_update(struct dm_verity *v, struct ahash_request *req,
-				const u8 *data, size_t len,
-				struct crypto_wait *wait)
-{
-	struct scatterlist sg;
-
-	sg_init_one(&sg, data, len);
-	ahash_request_set_crypt(req, &sg, NULL, len);
-
-	return crypto_wait_req(crypto_ahash_update(req), wait);
-}
-
 /*
- * Wrapper for crypto_ahash_init, which handles verity salting.
+ * verity_is_salt_required - check if according to verity version and
+ * verity salt's size if there's a need to insert a salt.
+ * note: salt goes last for 0th version and first for all others
+ * @where - START_SG - before buffer / END_SG - after buffer
  */
-static int verity_hash_init(struct dm_verity *v, struct ahash_request *req,
-				struct crypto_wait *wait)
+static inline bool verity_is_salt_required(struct dm_verity *v,
+		enum salt_location where)
 {
-	int r;
+	/* No salt, no problem */
+	if (likely(!v->salt_size))
+		return false;
 
-	ahash_request_set_tfm(req, v->tfm);
-	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
-					CRYPTO_TFM_REQ_MAY_BACKLOG,
-					crypto_req_done, (void *)wait);
-	crypto_init_wait(wait);
-
-	r = crypto_wait_req(crypto_ahash_init(req), wait);
-
-	if (unlikely(r < 0)) {
-		DMERR("crypto_ahash_init failed: %d", r);
-		return r;
-	}
-
-	if (likely(v->salt_size && (v->version >= 1)))
-		r = verity_hash_update(v, req, v->salt, v->salt_size, wait);
-
-	return r;
+	if (likely(v->version))
+		return (where == START_SG);
+	else
+		return (where == END_SG);
 }
 
-static int verity_hash_final(struct dm_verity *v, struct ahash_request *req,
-			     u8 *digest, struct crypto_wait *wait)
+/*
+ * verity_add_salt - add verity's salt into a scatterlist
+ * @nents - number of elements already inserted into sg
+ * @total_len - total number of items in scatterlist array
+ */
+static void verity_add_salt(struct dm_verity *v, struct scatterlist *sg,
+	unsigned int *nents, unsigned int *total_len)
 {
-	int r;
-
-	if (unlikely(v->salt_size && (!v->version))) {
-		r = verity_hash_update(v, req, v->salt, v->salt_size, wait);
-
-		if (r < 0) {
-			DMERR("verity_hash_final failed updating salt: %d", r);
-			goto out;
-		}
-	}
-
-	ahash_request_set_crypt(req, NULL, digest, 0);
-	r = crypto_wait_req(crypto_ahash_final(req), wait);
-out:
-	return r;
+	sg_set_buf(&sg[*nents], v->salt, v->salt_size);
+	(*nents)++;
+	(*total_len) += v->salt_size;
 }
 
 int verity_hash(struct dm_verity *v, struct ahash_request *req,
 		const u8 *data, size_t len, u8 *digest)
 {
-	int r;
+	int r, total_len = 0, indx = 0;
+	struct scatterlist sg[SG_FIXED_ITEMS];
 	struct crypto_wait wait;
 
-	r = verity_hash_init(v, req, &wait);
-	if (unlikely(r < 0))
-		goto out;
+	sg_init_table(sg, SG_FIXED_ITEMS);
+	ahash_request_set_tfm(req, v->tfm);
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+					CRYPTO_TFM_REQ_MAY_BACKLOG,
+					crypto_req_done, (void *)&wait);
+	if (verity_is_salt_required(v, START_SG))
+		verity_add_salt(v, sg, &indx, &total_len);
 
-	r = verity_hash_update(v, req, data, len, &wait);
-	if (unlikely(r < 0))
-		goto out;
+	sg_set_buf(&sg[indx], data, len);
+	indx++;
+	total_len += len;
+	if (verity_is_salt_required(v, END_SG))
+		verity_add_salt(v, sg, &indx, &total_len);
 
-	r = verity_hash_final(v, req, digest, &wait);
+	ahash_request_set_crypt(req, sg, digest, len + v->salt_size);
+	crypto_init_wait(&wait);
 
-out:
+	r = crypto_wait_req(crypto_ahash_digest(req), &wait);
+
+	if (unlikely(r < 0)) {
+		DMERR("crypto_ahash_digest failed: %d", r);
+		return r;
+	}
 	return r;
 }
 
@@ -347,44 +341,54 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
 /*
  * Calculates the digest for the given bio
  */
-int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io,
-			struct bvec_iter *iter, struct crypto_wait *wait)
+void verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io,
+			struct bvec_iter *iter, struct scatterlist *sg,
+			unsigned int *nents, unsigned int *total_len)
 {
 	unsigned int todo = 1 << v->data_dev_block_bits;
 	struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
-	struct scatterlist sg;
-	struct ahash_request *req = verity_io_hash_req(v, io);
 
 	do {
-		int r;
 		unsigned int len;
 		struct bio_vec bv = bio_iter_iovec(bio, *iter);
 
-		sg_init_table(&sg, 1);
-
 		len = bv.bv_len;
 
 		if (likely(len >= todo))
 			len = todo;
-		/*
-		 * Operating on a single page at a time looks suboptimal
-		 * until you consider the typical block size is 4,096B.
-		 * Going through this loops twice should be very rare.
-		 */
-		sg_set_page(&sg, bv.bv_page, len, bv.bv_offset);
-		ahash_request_set_crypt(req, &sg, NULL, len);
-		r = crypto_wait_req(crypto_ahash_update(req), wait);
-
-		if (unlikely(r < 0)) {
-			DMERR("verity_for_io_block crypto op failed: %d", r);
-			return r;
-		}
+		sg_set_page(&sg[*nents], bv.bv_page, len, bv.bv_offset);
 
 		bio_advance_iter(bio, iter, len);
 		todo -= len;
+		(*nents)++;
+		(*total_len) += len;
 	} while (todo);
+}
 
-	return 0;
+/* calculate how many buffers are required to accommodate bio_vec starting
+ * from iter
+ */
+unsigned int verity_calc_buffs_for_bv(struct dm_verity *v,
+	struct dm_verity_io *io, struct bvec_iter iter)
+{
+	unsigned int todo = 1 << v->data_dev_block_bits;
+	struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
+	unsigned int buff_count = 0;
+
+	do {
+		unsigned int len;
+		struct bio_vec bv = bio_iter_iovec(bio, iter);
+
+		len = bv.bv_len;
+		if (likely(len >= todo))
+			len = todo;
+
+		bio_advance_iter(bio, &iter, len);
+		todo -= len;
+		buff_count++;
+	} while (todo);
+
+	return buff_count;
 }
 
 /*
@@ -442,16 +446,30 @@ static int verity_verify_io(struct dm_verity_io *io)
 	struct bvec_iter start;
 	unsigned b;
 	struct crypto_wait wait;
+	struct scatterlist *sg;
+	int r;
 
 	for (b = 0; b < io->n_blocks; b++) {
-		int r;
+		unsigned int nents;
+		unsigned int total_len = 0;
+		unsigned int num_of_buffs = 0;
 		struct ahash_request *req = verity_io_hash_req(v, io);
 
+		/* an extra one for the salt buffer */
+		num_of_buffs = verity_calc_buffs_for_bv(v, io, io->iter) + 1;
+		WARN_ON(num_of_buffs < 1);
+
+		sg = kmalloc_array(num_of_buffs, sizeof(struct scatterlist),
+				     GFP_KERNEL);
+		if (!sg)
+			return -ENOMEM;
+		sg_init_table(sg, num_of_buffs);
+
 		r = verity_hash_for_block(v, io, io->block + b,
 					  verity_io_want_digest(v, io),
 					  &is_zero);
 		if (unlikely(r < 0))
-			return r;
+			goto err_memfree;
 
 		if (is_zero) {
 			/*
@@ -461,25 +479,37 @@ static int verity_verify_io(struct dm_verity_io *io)
 			r = verity_for_bv_block(v, io, &io->iter,
 						verity_bv_zero);
 			if (unlikely(r < 0))
-				return r;
+				goto err_memfree;
 
 			continue;
 		}
 
-		r = verity_hash_init(v, req, &wait);
-		if (unlikely(r < 0))
-			return r;
+		ahash_request_set_tfm(req, v->tfm);
+		ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+					CRYPTO_TFM_REQ_MAY_BACKLOG,
+					crypto_req_done, (void *)&wait);
+		nents = 0;
+		total_len = 0;
+		if (verity_is_salt_required(v, START_SG))
+			verity_add_salt(v, sg, &nents, &total_len);
 
 		start = io->iter;
-		r = verity_for_io_block(v, io, &io->iter, &wait);
-		if (unlikely(r < 0))
-			return r;
+		verity_for_io_block(v, io, &io->iter, sg, &nents, &total_len);
+		if (verity_is_salt_required(v, END_SG))
+			verity_add_salt(v, sg, &nents, &total_len);
+		/*
+		 * need to mark end of chain, since we might have allocated
+		 * more than we actually use
+		 */
+		sg_mark_end(&sg[nents-1]);
+		ahash_request_set_crypt(req, sg, verity_io_real_digest(v, io),
+				total_len);
+		crypto_init_wait(&wait);
+		r = crypto_wait_req(crypto_ahash_digest(req), &wait);
 
-		r = verity_hash_final(v, req, verity_io_real_digest(v, io),
-					&wait);
 		if (unlikely(r < 0))
-			return r;
-
+			goto err_memfree;
+		kfree(sg);
 		if (likely(memcmp(verity_io_real_digest(v, io),
 				  verity_io_want_digest(v, io), v->digest_size) == 0))
 			continue;
@@ -492,6 +522,10 @@ static int verity_verify_io(struct dm_verity_io *io)
 	}
 
 	return 0;
+
+err_memfree:
+	kfree(sg);
+	return r;
 }
 
 /*
-- 
2.7.4

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks
  2018-03-25 18:41 [PATCH 1/2] md: dm-verity: aggregate crypto API calls Yael Chemla
@ 2018-03-25 18:41 ` Yael Chemla
  2018-03-26  6:31   ` Gilad Ben-Yossef
                     ` (2 more replies)
  2018-03-26  6:31 ` [PATCH 1/2] md: dm-verity: aggregate crypto API calls Gilad Ben-Yossef
  1 sibling, 3 replies; 14+ messages in thread
From: Yael Chemla @ 2018-03-25 18:41 UTC (permalink / raw)
  To: Alasdair Kergon, Mike Snitzer, dm-devel, linux-kernel,
	ofir.drang, Yael Chemla
  Cc: Yael Chemla

 Allow parallel processing of bio blocks by moving to async. completion
 handling. This allows for better resource utilization of both HW and
 software based hash tfm and therefore better performance in many cases,
 depending on the specific tfm in use.
 
 Tested on ARM32 (zynq board) and ARM64 (Juno board).
 Time of cat command was measured on a filesystem with various file sizes.
 12% performance improvement when HW based hash was used (ccree driver).
 SW based hash showed less than 1% improvement.
 CPU utilization when HW based hash was used presented 10% less context
 switch, 4% less cycles and 7% less instructions. No difference in
 CPU utilization noticed with SW based hash.
 
Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>
---
 drivers/md/dm-verity-fec.c    |  10 +-
 drivers/md/dm-verity-fec.h    |   7 +-
 drivers/md/dm-verity-target.c | 215 +++++++++++++++++++++++++++++++-----------
 drivers/md/dm-verity.h        |   4 +-
 4 files changed, 173 insertions(+), 63 deletions(-)

diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index e13f908..bcea307 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -203,13 +203,12 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io,
  */
 static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
 			 u64 rsb, u64 target, unsigned block_offset,
-			 int *neras)
+			 int *neras, struct dm_verity_fec_io *fio)
 {
 	bool is_zero;
 	int i, j, target_index = -1;
 	struct dm_buffer *buf;
 	struct dm_bufio_client *bufio;
-	struct dm_verity_fec_io *fio = fec_io(io);
 	u64 block, ileaved;
 	u8 *bbuf, *rs_block;
 	u8 want_digest[v->digest_size];
@@ -265,7 +264,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
 
 		/* locate erasures if the block is on the data device */
 		if (bufio == v->fec->data_bufio &&
-		    verity_hash_for_block(v, io, block, want_digest,
+		    verity_hash_for_block(v, io, block, want_digest, fio,
 					  &is_zero) == 0) {
 			/* skip known zero blocks entirely */
 			if (is_zero)
@@ -374,7 +373,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
 		fec_init_bufs(v, fio);
 
 		r = fec_read_bufs(v, io, rsb, offset, pos,
-				  use_erasures ? &neras : NULL);
+				  use_erasures ? &neras : NULL, fio);
 		if (unlikely(r < 0))
 			return r;
 
@@ -419,10 +418,9 @@ static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data,
  */
 int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
 		      enum verity_block_type type, sector_t block, u8 *dest,
-		      struct bvec_iter *iter)
+		      struct bvec_iter *iter, struct dm_verity_fec_io *fio)
 {
 	int r;
-	struct dm_verity_fec_io *fio = fec_io(io);
 	u64 offset, res, rsb;
 
 	if (!verity_fec_is_enabled(v))
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index bb31ce8..46c2f47 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h
@@ -73,7 +73,9 @@ extern bool verity_fec_is_enabled(struct dm_verity *v);
 
 extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
 			     enum verity_block_type type, sector_t block,
-			     u8 *dest, struct bvec_iter *iter);
+			     u8 *dest, struct bvec_iter *iter,
+			     struct dm_verity_fec_io *fio);
+
 
 extern unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz,
 					char *result, unsigned maxlen);
@@ -104,7 +106,8 @@ static inline int verity_fec_decode(struct dm_verity *v,
 				    struct dm_verity_io *io,
 				    enum verity_block_type type,
 				    sector_t block, u8 *dest,
-				    struct bvec_iter *iter)
+				    struct bvec_iter *iter,
+				    struct dm_verity_fec_io *fio)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index a281b83..30512ee 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -38,7 +38,35 @@
 /* only two elements in static scatter list: salt and data */
 #define SG_FIXED_ITEMS	2
 
+struct dm_ver_io_data {
+	atomic_t expected_reqs;
+	atomic_t err;
+	int total_reqs;
+	struct dm_ver_req_data *reqdata_arr;
+	struct dm_verity_io *io;
+};
+
+#define MAX_DIGEST_SIZE 64 /* as big as sha512 digest size */
+
+struct dm_ver_req_data {
+	u8 want_digest[MAX_DIGEST_SIZE];
+	u8 real_digest[MAX_DIGEST_SIZE];
+	struct dm_verity_fec_io fec_io;
+	unsigned int iblock;	// block index in the request
+	unsigned int digest_size;
+	struct scatterlist *sg;
+	struct dm_ver_io_data *iodata;
+	/* req field is the last on purpose since it's not fixed in size and
+	 *  its size is calculated using ahash_request_alloc or an addition of
+	 *  the required size is done with +crypto_ahash_reqsize(v->tfm)
+	 */
+	struct ahash_request *req;
+};
+
 static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
+static void verity_finish_io(struct dm_verity_io *io, blk_status_t status);
+static void verity_release_req(struct dm_ver_io_data *iodata);
+
 
 module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
 
@@ -102,7 +130,7 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
 
 /*
  * verity_is_salt_required - check if according to verity version and
- * verity salt's size if there's a need to insert a salt.
+ * verity salt's size there's a need to insert a salt.
  * note: salt goes last for 0th version and first for all others
  * @where - START_SG - before buffer / END_SG - after buffer
  */
@@ -247,7 +275,7 @@ static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
  */
 static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
 			       sector_t block, int level, bool skip_unverified,
-			       u8 *want_digest)
+			       u8 *want_digest, struct dm_verity_fec_io *fec_io)
 {
 	struct dm_buffer *buf;
 	struct buffer_aux *aux;
@@ -281,7 +309,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
 			aux->hash_verified = 1;
 		else if (verity_fec_decode(v, io,
 					   DM_VERITY_BLOCK_TYPE_METADATA,
-					   hash_block, data, NULL) == 0)
+					   hash_block, data, NULL, fec_io) == 0)
 			aux->hash_verified = 1;
 		else if (verity_handle_err(v,
 					   DM_VERITY_BLOCK_TYPE_METADATA,
@@ -305,7 +333,9 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
  * of the hash tree if necessary.
  */
 int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
-			  sector_t block, u8 *digest, bool *is_zero)
+			  sector_t block, u8 *digest,
+			  struct dm_verity_fec_io *fec_io,
+			  bool *is_zero)
 {
 	int r = 0, i;
 
@@ -317,7 +347,7 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
 		 * function returns 1 and we fall back to whole
 		 * chain verification.
 		 */
-		r = verity_verify_level(v, io, block, 0, true, digest);
+		r = verity_verify_level(v, io, block, 0, true, digest, fec_io);
 		if (likely(r <= 0))
 			goto out;
 	}
@@ -325,7 +355,7 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
 	memcpy(digest, v->root_digest, v->digest_size);
 
 	for (i = v->levels - 1; i >= 0; i--) {
-		r = verity_verify_level(v, io, block, i, false, digest);
+		r = verity_verify_level(v, io, block, i, false, digest, fec_io);
 		if (unlikely(r))
 			goto out;
 	}
@@ -436,39 +466,118 @@ static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io,
 	return 0;
 }
 
+
+static void verity_cb_complete(struct dm_ver_io_data *iodata, int err)
+{
+	struct dm_verity_io *io = iodata->io;
+
+	/* save last error occurred */
+	if (err)
+		atomic_set(&iodata->err, err);
+	if (atomic_dec_and_test(&iodata->expected_reqs)) {
+		verity_release_req(iodata);
+		verity_finish_io(io, errno_to_blk_status(err));
+	}
+}
+
+static void __single_block_req_done(struct dm_ver_req_data *req_data,
+					int err, struct dm_verity_io *io)
+{
+	struct dm_verity *v = io->v;
+
+	if (err == -EINPROGRESS)
+		return;
+
+	WARN_ON((err != 0) || (req_data == NULL) || (req_data->iodata == NULL));
+	if ((err != 0) || (req_data == NULL) || (req_data->iodata == NULL))
+		goto complete;
+
+	kfree(req_data->sg);
+
+	if (likely(memcmp(req_data->real_digest,
+			req_data->want_digest, req_data->digest_size) == 0))
+		goto complete;
+	else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
+			io->block + req_data->iblock, NULL, &io->iter,
+			&req_data->fec_io) == 0){
+		goto complete;
+	} else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
+		req_data->iodata->io->block + req_data->iblock)){
+		err = -EIO;
+		goto complete;
+	}
+
+complete:
+	ahash_request_free(req_data->req);
+	verity_cb_complete(req_data->iodata, err);
+}
+
+static void single_block_req_done(struct crypto_async_request *req, int err)
+{
+	struct dm_ver_req_data *req_data = req->data;
+
+	__single_block_req_done(req_data, err, req_data->iodata->io);
+}
+
+static void verity_release_req(struct dm_ver_io_data *iodata)
+{
+	kfree(iodata->reqdata_arr);
+	kfree(iodata);
+}
 /*
  * Verify one "dm_verity_io" structure.
  */
-static int verity_verify_io(struct dm_verity_io *io)
+static void verity_verify_io(struct dm_verity_io *io)
 {
 	bool is_zero;
 	struct dm_verity *v = io->v;
-	struct bvec_iter start;
-	unsigned b;
-	struct crypto_wait wait;
-	struct scatterlist *sg;
-	int r;
+	unsigned int b = 0, blocks = 0;
+	struct dm_ver_io_data *iodata = NULL;
+	struct dm_ver_req_data *reqdata_arr = NULL;
+	struct scatterlist *sg = NULL;
+	int res;
+
+	iodata = kmalloc(sizeof(*iodata), GFP_KERNEL);
+	reqdata_arr = kmalloc_array(io->n_blocks,
+			sizeof(struct dm_ver_req_data), GFP_KERNEL);
+	if (unlikely((iodata == NULL) || (reqdata_arr == NULL))) {
+		WARN_ON((iodata == NULL) || (reqdata_arr == NULL));
+		goto err_memfree;
+	}
+	atomic_set(&iodata->expected_reqs, io->n_blocks);
+	iodata->reqdata_arr = reqdata_arr;
+	iodata->io = io;
+	iodata->total_reqs = blocks = io->n_blocks;
+
 
-	for (b = 0; b < io->n_blocks; b++) {
+	for (b = 0; b < blocks; b++) {
 		unsigned int nents;
 		unsigned int total_len = 0;
 		unsigned int num_of_buffs = 0;
-		struct ahash_request *req = verity_io_hash_req(v, io);
 
-		/* an extra one for the salt buffer */
+		reqdata_arr[b].req = ahash_request_alloc(v->tfm, GFP_KERNEL);
+
+		if (unlikely(reqdata_arr[b].req == NULL))
+			goto err_memfree;
+		/* +1 for the salt buffer */
+		ahash_request_set_tfm(reqdata_arr[b].req, v->tfm);
 		num_of_buffs = verity_calc_buffs_for_bv(v, io, io->iter) + 1;
 		WARN_ON(num_of_buffs < 1);
 
 		sg = kmalloc_array(num_of_buffs, sizeof(struct scatterlist),
 				     GFP_KERNEL);
-		if (!sg)
-			return -ENOMEM;
+		if (!sg) {
+			DMERR("%s kmalloc_array failed", __func__);
+			goto err_memfree;
+		}
+
 		sg_init_table(sg, num_of_buffs);
 
-		r = verity_hash_for_block(v, io, io->block + b,
-					  verity_io_want_digest(v, io),
-					  &is_zero);
-		if (unlikely(r < 0))
+		res = verity_hash_for_block(v, io, io->block + b,
+					    reqdata_arr[b].want_digest,
+					    &reqdata_arr[b].fec_io,
+					    &is_zero);
+		if (unlikely(res < 0))
 			goto err_memfree;
 
 		if (is_zero) {
@@ -476,56 +585,54 @@ static int verity_verify_io(struct dm_verity_io *io)
 			 * If we expect a zero block, don't validate, just
 			 * return zeros.
 			 */
-			r = verity_for_bv_block(v, io, &io->iter,
+			res = verity_for_bv_block(v, io, &io->iter,
 						verity_bv_zero);
-			if (unlikely(r < 0))
+			if (unlikely(res < 0))
 				goto err_memfree;
-
+			verity_cb_complete(reqdata_arr[b].iodata, res);
 			continue;
 		}
 
-		ahash_request_set_tfm(req, v->tfm);
-		ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
-					CRYPTO_TFM_REQ_MAY_BACKLOG,
-					crypto_req_done, (void *)&wait);
 		nents = 0;
 		total_len = 0;
 		if (verity_is_salt_required(v, START_SG))
 			verity_add_salt(v, sg, &nents, &total_len);
 
-		start = io->iter;
-		verity_for_io_block(v, io, &io->iter, sg, &nents, &total_len);
+		verity_for_io_block(v, io, &io->iter, sg, &nents,
+				&total_len);
 		if (verity_is_salt_required(v, END_SG))
 			verity_add_salt(v, sg, &nents, &total_len);
-		/*
-		 * need to mark end of chain, since we might have allocated
-		 * more than we actually use
+		reqdata_arr[b].iodata = iodata;
+		reqdata_arr[b].sg = sg;
+		reqdata_arr[b].digest_size = v->digest_size;
+		reqdata_arr[b].iblock = b;
+		/* need to mark end of chain,
+		 * since we might have allocated more than we actually use
 		 */
 		sg_mark_end(&sg[nents-1]);
-		ahash_request_set_crypt(req, sg, verity_io_real_digest(v, io),
-				total_len);
-		crypto_init_wait(&wait);
-		r = crypto_wait_req(crypto_ahash_digest(req), &wait);
 
-		if (unlikely(r < 0))
-			goto err_memfree;
-		kfree(sg);
-		if (likely(memcmp(verity_io_real_digest(v, io),
-				  verity_io_want_digest(v, io), v->digest_size) == 0))
-			continue;
-		else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
-					   io->block + b, NULL, &start) == 0)
-			continue;
-		else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
-					   io->block + b))
-			return -EIO;
-	}
+		ahash_request_set_tfm(reqdata_arr[b].req, v->tfm);
+		ahash_request_set_callback(reqdata_arr[b].req,
+			CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG,
+			single_block_req_done, (void *)&reqdata_arr[b]);
 
-	return 0;
+		ahash_request_set_crypt(reqdata_arr[b].req, sg,
+			reqdata_arr[b].real_digest, total_len);
+		res = crypto_ahash_digest(reqdata_arr[b].req);
+		/* digest completed already, callback won't be called. */
+		if (res == 0)
+			__single_block_req_done(&reqdata_arr[b], res, io);
+
+	}
+	return;
 
 err_memfree:
-	kfree(sg);
-	return r;
+	DMERR("%s: error occurred", __func__);
+	/* reduce expected requests by the number of
+	 * unsent requests, -1 accounting for the current block
+	 */
+	atomic_sub(blocks-b-1, &iodata->expected_reqs);
+	verity_cb_complete(iodata, -EIO);
 }
 
 /*
@@ -548,7 +655,7 @@ static void verity_work(struct work_struct *w)
 {
 	struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
 
-	verity_finish_io(io, errno_to_blk_status(verity_verify_io(io)));
+	verity_verify_io(io);
 }
 
 static void verity_end_io(struct bio *bio)
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index b675bc0..0e407f6 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -30,6 +30,7 @@ enum verity_block_type {
 };
 
 struct dm_verity_fec;
+struct dm_verity_fec_io;
 
 struct dm_verity {
 	struct dm_dev *data_dev;
@@ -124,6 +125,7 @@ extern int verity_hash(struct dm_verity *v, struct ahash_request *req,
 		       const u8 *data, size_t len, u8 *digest);
 
 extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
-				 sector_t block, u8 *digest, bool *is_zero);
+				 sector_t block, u8 *digest,
+				 struct dm_verity_fec_io *fio, bool *is_zero);
 
 #endif /* DM_VERITY_H */
-- 
2.7.4

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] md: dm-verity: aggregate crypto API calls
  2018-03-25 18:41 [PATCH 1/2] md: dm-verity: aggregate crypto API calls Yael Chemla
  2018-03-25 18:41 ` [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks Yael Chemla
@ 2018-03-26  6:31 ` Gilad Ben-Yossef
  1 sibling, 0 replies; 14+ messages in thread
From: Gilad Ben-Yossef @ 2018-03-26  6:31 UTC (permalink / raw)
  To: Yael Chemla
  Cc: Alasdair Kergon, Mike Snitzer, device-mapper development,
	Linux kernel mailing list, ofir.drang, Yael Chemla

On Sun, Mar 25, 2018 at 9:41 PM, Yael Chemla <yael.chemla@foss.arm.com> wrote:
>  Current implementation makes multiple crypto API calls per block
>  implementation makes multiple crypto API calls per single block,
>  forcing underlying crypto tfm implementation to "stop & go",
>  leading to under utilization of HW engines.
>  To fix it unify calls to crypto init/update/final into a digest call
>  with a single sg which contains multiple buffers.
>  This is also needed as an enabler for the next patch in the series.
>
>
> Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>
> ---
>  drivers/md/dm-verity-target.c | 220 ++++++++++++++++++++++++------------------
>  1 file changed, 127 insertions(+), 93 deletions(-)



Reviewed-by: Gilad Ben-Yossef <gilad@benyossef.com>



-- 
Gilad Ben-Yossef
Chief Coffee Drinker

"If you take a class in large-scale robotics, can you end up in a
situation where the homework eats your dog?"
 -- Jean-Baptiste Queru

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks
  2018-03-25 18:41 ` [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks Yael Chemla
@ 2018-03-26  6:31   ` Gilad Ben-Yossef
  2018-03-27  1:06   ` Mike Snitzer
  2018-03-27  6:55   ` [dm-devel] " Eric Biggers
  2 siblings, 0 replies; 14+ messages in thread
From: Gilad Ben-Yossef @ 2018-03-26  6:31 UTC (permalink / raw)
  To: Yael Chemla
  Cc: Alasdair Kergon, Mike Snitzer, device-mapper development,
	Linux kernel mailing list, ofir.drang, Yael Chemla

On Sun, Mar 25, 2018 at 9:41 PM, Yael Chemla <yael.chemla@foss.arm.com> wrote:
>  Allow parallel processing of bio blocks by moving to async. completion
>  handling. This allows for better resource utilization of both HW and
>  software based hash tfm and therefore better performance in many cases,
>  depending on the specific tfm in use.
>
>  Tested on ARM32 (zynq board) and ARM64 (Juno board).
>  Time of cat command was measured on a filesystem with various file sizes.
>  12% performance improvement when HW based hash was used (ccree driver).
>  SW based hash showed less than 1% improvement.
>  CPU utilization when HW based hash was used presented 10% less context
>  switch, 4% less cycles and 7% less instructions. No difference in
>  CPU utilization noticed with SW based hash.
>
> Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>
> ---
>  drivers/md/dm-verity-fec.c    |  10 +-
>  drivers/md/dm-verity-fec.h    |   7 +-
>  drivers/md/dm-verity-target.c | 215 +++++++++++++++++++++++++++++++-----------
>  drivers/md/dm-verity.h        |   4 +-
>  4 files changed, 173 insertions(+), 63 deletions(-)

Reviewed-by: Gilad Ben-Yossef <gilad@benyossef.com>

-- 
Gilad Ben-Yossef
Chief Coffee Drinker

"If you take a class in large-scale robotics, can you end up in a
situation where the homework eats your dog?"
 -- Jean-Baptiste Queru

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks
  2018-03-25 18:41 ` [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks Yael Chemla
  2018-03-26  6:31   ` Gilad Ben-Yossef
@ 2018-03-27  1:06   ` Mike Snitzer
  2018-03-27  6:52     ` Gilad Ben-Yossef
  2018-03-27  8:55     ` yael.chemla
  2018-03-27  6:55   ` [dm-devel] " Eric Biggers
  2 siblings, 2 replies; 14+ messages in thread
From: Mike Snitzer @ 2018-03-27  1:06 UTC (permalink / raw)
  To: Yael Chemla
  Cc: Alasdair Kergon, dm-devel, linux-kernel, ofir.drang, Yael Chemla

On Sun, Mar 25 2018 at  2:41pm -0400,
Yael Chemla <yael.chemla@foss.arm.com> wrote:

>  Allow parallel processing of bio blocks by moving to async. completion
>  handling. This allows for better resource utilization of both HW and
>  software based hash tfm and therefore better performance in many cases,
>  depending on the specific tfm in use.
>  
>  Tested on ARM32 (zynq board) and ARM64 (Juno board).
>  Time of cat command was measured on a filesystem with various file sizes.
>  12% performance improvement when HW based hash was used (ccree driver).
>  SW based hash showed less than 1% improvement.
>  CPU utilization when HW based hash was used presented 10% less context
>  switch, 4% less cycles and 7% less instructions. No difference in
>  CPU utilization noticed with SW based hash.
>  
> Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>

This one had various issues.  I've fixed most of what I saw and staged
in linux-next (purely for build test coverage purposes).  I may drop
this patch if others disagree with it (or my sg deallocation in the
error path question isn't answered).

I've staged the changes here (and in linux-next via 'for-next'):
https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/log/?h=dm-4.17

I switched all the new GFP_KERNEL uses to GFP_NOIO.  The fact that
you're doing allocations at all (per IO) is bad enough.  Using
GFP_KERNEL is a serious liability (risk of deadlock if dm-verity were to
be used for something like.. swap.. weird setup but possible).

But the gfp flags aside, the need for additional memory and the
expectation of scalable async parallel IO is potentially at odds with
changes like this (that I just staged, and had to rebase your 2 patches
ontop of):
https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=dm-4.17&id=a89f6a2cfec86fba7a115642ff082cb4e9450ea6

So I'm particulalry interested to hear from google folks to understand
if they are OK with your proposed verity async crypto API use.

Mike

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks
  2018-03-27  1:06   ` Mike Snitzer
@ 2018-03-27  6:52     ` Gilad Ben-Yossef
  2018-03-27  8:55     ` yael.chemla
  1 sibling, 0 replies; 14+ messages in thread
From: Gilad Ben-Yossef @ 2018-03-27  6:52 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Yael Chemla, Alasdair Kergon, device-mapper development,
	Linux kernel mailing list, ofir.drang, Yael Chemla

On Tue, Mar 27, 2018 at 4:06 AM, Mike Snitzer <snitzer@redhat.com> wrote:
> On Sun, Mar 25 2018 at  2:41pm -0400,
> Yael Chemla <yael.chemla@foss.arm.com> wrote:
>
>>  Allow parallel processing of bio blocks by moving to async. completion
>>  handling. This allows for better resource utilization of both HW and
>>  software based hash tfm and therefore better performance in many cases,
>>  depending on the specific tfm in use.
>>
>>  Tested on ARM32 (zynq board) and ARM64 (Juno board).
>>  Time of cat command was measured on a filesystem with various file sizes.
>>  12% performance improvement when HW based hash was used (ccree driver).
>>  SW based hash showed less than 1% improvement.
>>  CPU utilization when HW based hash was used presented 10% less context
>>  switch, 4% less cycles and 7% less instructions. No difference in
>>  CPU utilization noticed with SW based hash.
>>
>> Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>
>
> This one had various issues.  I've fixed most of what I saw and staged
> in linux-next (purely for build test coverage purposes).  I may drop
> this patch if others disagree with it (or my sg deallocation in the
> error path question isn't answered).
>
> I've staged the changes here (and in linux-next via 'for-next'):
> https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/log/?h=dm-4.17
>
> I switched all the new GFP_KERNEL uses to GFP_NOIO.  The fact that
> you're doing allocations at all (per IO) is bad enough.  Using
> GFP_KERNEL is a serious liability (risk of deadlock if dm-verity were to
> be used for something like.. swap.. weird setup but possible).

Out of my curiosity, since I thought whether or not this should use
GFP_NOIO during my review
but then answered to myself "Nah, dm-verity is read only, can't swap
to that" - how does one
use a read only DM-Verity to host swap partition/file? :-)


>
> But the gfp flags aside, the need for additional memory and the
> expectation of scalable async parallel IO is potentially at odds with
> changes like this (that I just staged, and had to rebase your 2 patches
> ontop of):
> https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=dm-4.17&id=a89f6a2cfec86fba7a115642ff082cb4e9450ea6
>
> So I'm particularly interested to hear from google folks to understand
> if they are OK with your proposed verity async crypto API use.

If by "scalable async parallel IO" you mean crypto HW then, for what
it's worth, my experience is that makers of devices with less
powerful CPUs
are the ones that tend to add them, so they stand to benefit the
most from this change.

Gilad

-- 
Gilad Ben-Yossef
Chief Coffee Drinker

"If you take a class in large-scale robotics, can you end up in a
situation where the homework eats your dog?"
 -- Jean-Baptiste Queru

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dm-devel] [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks
  2018-03-25 18:41 ` [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks Yael Chemla
  2018-03-26  6:31   ` Gilad Ben-Yossef
  2018-03-27  1:06   ` Mike Snitzer
@ 2018-03-27  6:55   ` Eric Biggers
  2018-03-27  8:05     ` Milan Broz
                       ` (2 more replies)
  2 siblings, 3 replies; 14+ messages in thread
From: Eric Biggers @ 2018-03-27  6:55 UTC (permalink / raw)
  To: Yael Chemla
  Cc: Alasdair Kergon, Mike Snitzer, dm-devel, linux-kernel,
	ofir.drang, Yael Chemla, linux-crypto, gilad

[+Cc linux-crypto]

Hi Yael,

On Sun, Mar 25, 2018 at 07:41:30PM +0100, Yael Chemla wrote:
>  Allow parallel processing of bio blocks by moving to async. completion
>  handling. This allows for better resource utilization of both HW and
>  software based hash tfm and therefore better performance in many cases,
>  depending on the specific tfm in use.
>  
>  Tested on ARM32 (zynq board) and ARM64 (Juno board).
>  Time of cat command was measured on a filesystem with various file sizes.
>  12% performance improvement when HW based hash was used (ccree driver).
>  SW based hash showed less than 1% improvement.
>  CPU utilization when HW based hash was used presented 10% less context
>  switch, 4% less cycles and 7% less instructions. No difference in
>  CPU utilization noticed with SW based hash.
>  
> Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>

Okay, I definitely would like to see dm-verity better support hardware crypto
accelerators, but these patches were painful to read.

There are lots of smaller bugs, but the high-level problem which you need to
address first is that on every bio you are always allocating all the extra
memory to hold a hash request and scatterlist for every data block.  This will
not only hurt performance when the hashing is done in software (I'm skeptical
that your performance numbers are representative of that case), but it will also
fall apart under memory pressure.  We are trying to get low-end Android devices
to start using dm-verity, and such devices often have only 1 GB or even only 512
MB of RAM, so memory allocations are at increased risk of failing.  In fact I'm
pretty sure you didn't do any proper stress testing of these patches, since the
first thing they do for every bio is try to allocate a physically contiguous
array that is nearly as long as the full bio data itself (n_blocks *
sizeof(struct dm_verity_req_data) = n_blocks * 3264, at least on a 64-bit
platform, mostly due to the 'struct dm_verity_fec_io'), so potentially up to
about 1 MB; that's going to fail a lot even on systems with gigabytes of RAM...

(You also need to verify that your new code is compatible with the forward error
correction feature, with the "ignore_zero_blocks" option, and with the new
"check_at_most_once" option.  From my reading of the code, all of those seemed
broken; the dm_verity_fec_io structures, for example, weren't even being
initialized...)

I think you need to take a close look at how dm-crypt handles async crypto
implementations, since it seems to do it properly without hurting the common
case where the crypto happens synchronously.  What it does, is it reserves space
in the per-bio data for a single cipher request.  Then, *only* if the cipher
implementation actually processes the request asynchronously (as indicated by
-EINPROGRESS being returned) is a new cipher request allocated dynamically,
using a mempool (not kmalloc, which is prone to fail).  Note that unlike your
patches it also properly handles the case where the hardware crypto queue is
full, as indicated by the cipher implementation returning -EBUSY; in that case,
dm-crypt waits to start another request until there is space in the queue.

I think it would be possible to adapt dm-crypt's solution to dm-verity.

Thanks,

Eric

> ---
>  drivers/md/dm-verity-fec.c    |  10 +-
>  drivers/md/dm-verity-fec.h    |   7 +-
>  drivers/md/dm-verity-target.c | 215 +++++++++++++++++++++++++++++++-----------
>  drivers/md/dm-verity.h        |   4 +-
>  4 files changed, 173 insertions(+), 63 deletions(-)
> 
> diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
> index e13f908..bcea307 100644
> --- a/drivers/md/dm-verity-fec.c
> +++ b/drivers/md/dm-verity-fec.c
> @@ -203,13 +203,12 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io,
>   */
>  static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
>  			 u64 rsb, u64 target, unsigned block_offset,
> -			 int *neras)
> +			 int *neras, struct dm_verity_fec_io *fio)
>  {
>  	bool is_zero;
>  	int i, j, target_index = -1;
>  	struct dm_buffer *buf;
>  	struct dm_bufio_client *bufio;
> -	struct dm_verity_fec_io *fio = fec_io(io);
>  	u64 block, ileaved;
>  	u8 *bbuf, *rs_block;
>  	u8 want_digest[v->digest_size];
> @@ -265,7 +264,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
>  
>  		/* locate erasures if the block is on the data device */
>  		if (bufio == v->fec->data_bufio &&
> -		    verity_hash_for_block(v, io, block, want_digest,
> +		    verity_hash_for_block(v, io, block, want_digest, fio,
>  					  &is_zero) == 0) {
>  			/* skip known zero blocks entirely */
>  			if (is_zero)
> @@ -374,7 +373,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
>  		fec_init_bufs(v, fio);
>  
>  		r = fec_read_bufs(v, io, rsb, offset, pos,
> -				  use_erasures ? &neras : NULL);
> +				  use_erasures ? &neras : NULL, fio);
>  		if (unlikely(r < 0))
>  			return r;
>  
> @@ -419,10 +418,9 @@ static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data,
>   */
>  int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
>  		      enum verity_block_type type, sector_t block, u8 *dest,
> -		      struct bvec_iter *iter)
> +		      struct bvec_iter *iter, struct dm_verity_fec_io *fio)
>  {
>  	int r;
> -	struct dm_verity_fec_io *fio = fec_io(io);
>  	u64 offset, res, rsb;
>  
>  	if (!verity_fec_is_enabled(v))
> diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
> index bb31ce8..46c2f47 100644
> --- a/drivers/md/dm-verity-fec.h
> +++ b/drivers/md/dm-verity-fec.h
> @@ -73,7 +73,9 @@ extern bool verity_fec_is_enabled(struct dm_verity *v);
>  
>  extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
>  			     enum verity_block_type type, sector_t block,
> -			     u8 *dest, struct bvec_iter *iter);
> +			     u8 *dest, struct bvec_iter *iter,
> +			     struct dm_verity_fec_io *fio);
> +
>  
>  extern unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz,
>  					char *result, unsigned maxlen);
> @@ -104,7 +106,8 @@ static inline int verity_fec_decode(struct dm_verity *v,
>  				    struct dm_verity_io *io,
>  				    enum verity_block_type type,
>  				    sector_t block, u8 *dest,
> -				    struct bvec_iter *iter)
> +				    struct bvec_iter *iter,
> +				    struct dm_verity_fec_io *fio)
>  {
>  	return -EOPNOTSUPP;
>  }
> diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
> index a281b83..30512ee 100644
> --- a/drivers/md/dm-verity-target.c
> +++ b/drivers/md/dm-verity-target.c
> @@ -38,7 +38,35 @@
>  /* only two elements in static scatter list: salt and data */
>  #define SG_FIXED_ITEMS	2
>  
> +struct dm_ver_io_data {
> +	atomic_t expected_reqs;
> +	atomic_t err;
> +	int total_reqs;
> +	struct dm_ver_req_data *reqdata_arr;
> +	struct dm_verity_io *io;
> +};
> +
> +#define MAX_DIGEST_SIZE 64 /* as big as sha512 digest size */
> +
> +struct dm_ver_req_data {
> +	u8 want_digest[MAX_DIGEST_SIZE];
> +	u8 real_digest[MAX_DIGEST_SIZE];
> +	struct dm_verity_fec_io fec_io;
> +	unsigned int iblock;	// block index in the request
> +	unsigned int digest_size;
> +	struct scatterlist *sg;
> +	struct dm_ver_io_data *iodata;
> +	/* req field is the last on purpose since it's not fixed in size and
> +	 *  its size if calucluated using ahash_request_alloc or an addition of
> +	 *  the required size is done with +crypto_ahash_reqsize(v->tfm)
> +	 */
> +	struct ahash_request *req;
> +};
> +
>  static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
> +static void verity_finish_io(struct dm_verity_io *io, blk_status_t status);
> +static void verity_release_req(struct dm_ver_io_data *iodata);
> +
>  
>  module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
>  
> @@ -102,7 +130,7 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
>  
>  /*
>   * verity_is_salt_required - check if according to verity version and
> - * verity salt's size if there's a need to insert a salt.
> + * verity salt's size there's a need to insert a salt.
>   * note: salt goes last for 0th version and first for all others
>   * @where - START_SG - before buffer / END_SG - after buffer
>   */
> @@ -247,7 +275,7 @@ static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
>   */
>  static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
>  			       sector_t block, int level, bool skip_unverified,
> -			       u8 *want_digest)
> +			       u8 *want_digest, struct dm_verity_fec_io *fec_io)
>  {
>  	struct dm_buffer *buf;
>  	struct buffer_aux *aux;
> @@ -281,7 +309,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
>  			aux->hash_verified = 1;
>  		else if (verity_fec_decode(v, io,
>  					   DM_VERITY_BLOCK_TYPE_METADATA,
> -					   hash_block, data, NULL) == 0)
> +					   hash_block, data, NULL, fec_io) == 0)
>  			aux->hash_verified = 1;
>  		else if (verity_handle_err(v,
>  					   DM_VERITY_BLOCK_TYPE_METADATA,
> @@ -305,7 +333,9 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
>   * of the hash tree if necessary.
>   */
>  int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
> -			  sector_t block, u8 *digest, bool *is_zero)
> +			  sector_t block, u8 *digest,
> +			  struct dm_verity_fec_io *fec_io,
> +			  bool *is_zero)
>  {
>  	int r = 0, i;
>  
> @@ -317,7 +347,7 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
>  		 * function returns 1 and we fall back to whole
>  		 * chain verification.
>  		 */
> -		r = verity_verify_level(v, io, block, 0, true, digest);
> +		r = verity_verify_level(v, io, block, 0, true, digest, fec_io);
>  		if (likely(r <= 0))
>  			goto out;
>  	}
> @@ -325,7 +355,7 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
>  	memcpy(digest, v->root_digest, v->digest_size);
>  
>  	for (i = v->levels - 1; i >= 0; i--) {
> -		r = verity_verify_level(v, io, block, i, false, digest);
> +		r = verity_verify_level(v, io, block, i, false, digest, fec_io);
>  		if (unlikely(r))
>  			goto out;
>  	}
> @@ -436,39 +466,118 @@ static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io,
>  	return 0;
>  }
>  
> +
> +static void verity_cb_complete(struct dm_ver_io_data *iodata, int err)
> +{
> +	struct dm_verity_io *io = iodata->io;
> +
> +	/* save last error occurred */
> +	if (err)
> +		atomic_set(&iodata->err, err);
> +	if (atomic_dec_and_test(&iodata->expected_reqs)) {
> +		verity_release_req(iodata);
> +		verity_finish_io(io, errno_to_blk_status(err));
> +	}
> +}
> +
> +static void __single_block_req_done(struct dm_ver_req_data *req_data,
> +					int err, struct dm_verity_io *io)
> +{
> +	struct dm_verity *v = io->v;
> +
> +	if (err == -EINPROGRESS)
> +		return;
> +
> +	WARN_ON((err != 0) || (req_data == NULL) || (req_data->iodata == NULL));
> +	if ((err != 0) || (req_data == NULL) || (req_data->iodata == NULL))
> +		goto complete;
> +
> +	kfree(req_data->sg);
> +
> +	if (likely(memcmp(req_data->real_digest,
> +			req_data->want_digest, req_data->digest_size) == 0))
> +		goto complete;
> +	else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
> +			io->block + req_data->iblock, NULL, &io->iter,
> +			&req_data->fec_io) == 0){
> +		goto complete;
> +	} else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
> +		req_data->iodata->io->block + req_data->iblock)){
> +		err = -EIO;
> +		goto complete;
> +	}
> +
> +complete:
> +	ahash_request_free(req_data->req);
> +	verity_cb_complete(req_data->iodata, err);
> +}
> +
> +static void single_block_req_done(struct crypto_async_request *req, int err)
> +{
> +	struct dm_ver_req_data *req_data = req->data;
> +
> +	__single_block_req_done(req_data, err, req_data->iodata->io);
> +}
> +
> +static void verity_release_req(struct dm_ver_io_data *iodata)
> +{
> +	kfree(iodata->reqdata_arr);
> +	kfree(iodata);
> +}
>  /*
>   * Verify one "dm_verity_io" structure.
>   */
> -static int verity_verify_io(struct dm_verity_io *io)
> +static void verity_verify_io(struct dm_verity_io *io)
>  {
>  	bool is_zero;
>  	struct dm_verity *v = io->v;
> -	struct bvec_iter start;
> -	unsigned b;
> -	struct crypto_wait wait;
> -	struct scatterlist *sg;
> -	int r;
> +	unsigned int b = 0, blocks = 0;
> +	struct dm_ver_io_data *iodata = NULL;
> +	struct dm_ver_req_data *reqdata_arr = NULL;
> +	struct scatterlist *sg = NULL;
> +	int res;
> +
> +	iodata = kmalloc(sizeof(*iodata), GFP_KERNEL);
> +	reqdata_arr = kmalloc_array(io->n_blocks,
> +			sizeof(struct dm_ver_req_data), GFP_KERNEL);
> +	if (unlikely((iodata == NULL) || (reqdata_arr == NULL))) {
> +		WARN_ON((iodata == NULL) || (reqdata_arr == NULL));
> +		goto err_memfree;
> +	}
> +	atomic_set(&iodata->expected_reqs, io->n_blocks);
> +	iodata->reqdata_arr = reqdata_arr;
> +	iodata->io = io;
> +	iodata->total_reqs = blocks = io->n_blocks;
> +
>  
> -	for (b = 0; b < io->n_blocks; b++) {
> +	for (b = 0; b < blocks; b++) {
>  		unsigned int nents;
>  		unsigned int total_len = 0;
>  		unsigned int num_of_buffs = 0;
> -		struct ahash_request *req = verity_io_hash_req(v, io);
>  
> -		/* an extra one for the salt buffer */
> +		reqdata_arr[b].req = ahash_request_alloc(v->tfm, GFP_KERNEL);
> +
> +		if (unlikely(reqdata_arr[b].req == NULL))
> +			goto err_memfree;
> +		/* +1 for the salt buffer */
> +		ahash_request_set_tfm(reqdata_arr[b].req, v->tfm);
>  		num_of_buffs = verity_calc_buffs_for_bv(v, io, io->iter) + 1;
>  		WARN_ON(num_of_buffs < 1);
>  
>  		sg = kmalloc_array(num_of_buffs, sizeof(struct scatterlist),
>  				     GFP_KERNEL);
> -		if (!sg)
> -			return -ENOMEM;
> +		if (!sg) {
> +			DMERR("%s kmalloc_array failed", __func__);
> +			goto err_memfree;
> +		}
> +
>  		sg_init_table(sg, num_of_buffs);
>  
> -		r = verity_hash_for_block(v, io, io->block + b,
> -					  verity_io_want_digest(v, io),
> -					  &is_zero);
> -		if (unlikely(r < 0))
> +		res = verity_hash_for_block(v, io, io->block + b,
> +					    reqdata_arr[b].want_digest,
> +					    &reqdata_arr[b].fec_io,
> +					    &is_zero);
> +		if (unlikely(res < 0))
>  			goto err_memfree;
>  
>  		if (is_zero) {
> @@ -476,56 +585,54 @@ static int verity_verify_io(struct dm_verity_io *io)
>  			 * If we expect a zero block, don't validate, just
>  			 * return zeros.
>  			 */
> -			r = verity_for_bv_block(v, io, &io->iter,
> +			res = verity_for_bv_block(v, io, &io->iter,
>  						verity_bv_zero);
> -			if (unlikely(r < 0))
> +			if (unlikely(res < 0))
>  				goto err_memfree;
> -
> +			verity_cb_complete(reqdata_arr[b].iodata, res);
>  			continue;
>  		}
>  
> -		ahash_request_set_tfm(req, v->tfm);
> -		ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
> -					CRYPTO_TFM_REQ_MAY_BACKLOG,
> -					crypto_req_done, (void *)&wait);
>  		nents = 0;
>  		total_len = 0;
>  		if (verity_is_salt_required(v, START_SG))
>  			verity_add_salt(v, sg, &nents, &total_len);
>  
> -		start = io->iter;
> -		verity_for_io_block(v, io, &io->iter, sg, &nents, &total_len);
> +		verity_for_io_block(v, io, &io->iter, sg, &nents,
> +				&total_len);
>  		if (verity_is_salt_required(v, END_SG))
>  			verity_add_salt(v, sg, &nents, &total_len);
> -		/*
> -		 * need to mark end of chain, since we might have allocated
> -		 * more than we actually use
> +		reqdata_arr[b].iodata = iodata;
> +		reqdata_arr[b].sg = sg;
> +		reqdata_arr[b].digest_size = v->digest_size;
> +		reqdata_arr[b].iblock = b;
> +		/* need to mark end of chain,
> +		 * since we might have allocated more than we actually use
>  		 */
>  		sg_mark_end(&sg[nents-1]);
> -		ahash_request_set_crypt(req, sg, verity_io_real_digest(v, io),
> -				total_len);
> -		crypto_init_wait(&wait);
> -		r = crypto_wait_req(crypto_ahash_digest(req), &wait);
>  
> -		if (unlikely(r < 0))
> -			goto err_memfree;
> -		kfree(sg);
> -		if (likely(memcmp(verity_io_real_digest(v, io),
> -				  verity_io_want_digest(v, io), v->digest_size) == 0))
> -			continue;
> -		else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
> -					   io->block + b, NULL, &start) == 0)
> -			continue;
> -		else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
> -					   io->block + b))
> -			return -EIO;
> -	}
> +		ahash_request_set_tfm(reqdata_arr[b].req, v->tfm);
> +		ahash_request_set_callback(reqdata_arr[b].req,
> +			CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG,
> +			single_block_req_done, (void *)&reqdata_arr[b]);
>  
> -	return 0;
> +		ahash_request_set_crypt(reqdata_arr[b].req, sg,
> +			reqdata_arr[b].real_digest, total_len);
> +		res = crypto_ahash_digest(reqdata_arr[b].req);
> +		/* digest completed already, callback won't be called. */
> +		if (res == 0)
> +			__single_block_req_done(&reqdata_arr[b], res, io);
> +
> +	}
> +	return;
>  
>  err_memfree:
> -	kfree(sg);
> -	return r;
> +	DMERR("%s: error occurred", __func__);
> +	/* reduce expected requests by the number of
> +	 * unsent requests, -1 accounting for the current block
> +	 */
> +	atomic_sub(blocks-b-1, &iodata->expected_reqs);
> +	verity_cb_complete(iodata, -EIO);
>  }
>  
>  /*
> @@ -548,7 +655,7 @@ static void verity_work(struct work_struct *w)
>  {
>  	struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
>  
> -	verity_finish_io(io, errno_to_blk_status(verity_verify_io(io)));
> +	verity_verify_io(io);
>  }
>  
>  static void verity_end_io(struct bio *bio)
> diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
> index b675bc0..0e407f6 100644
> --- a/drivers/md/dm-verity.h
> +++ b/drivers/md/dm-verity.h
> @@ -30,6 +30,7 @@ enum verity_block_type {
>  };
>  
>  struct dm_verity_fec;
> +struct dm_verity_fec_io;
>  
>  struct dm_verity {
>  	struct dm_dev *data_dev;
> @@ -124,6 +125,7 @@ extern int verity_hash(struct dm_verity *v, struct ahash_request *req,
>  		       const u8 *data, size_t len, u8 *digest);
>  
>  extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
> -				 sector_t block, u8 *digest, bool *is_zero);
> +				 sector_t block, u8 *digest,
> +				 struct dm_verity_fec_io *fio, bool *is_zero);
>  
>  #endif /* DM_VERITY_H */
> -- 
> 2.7.4
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dm-devel] [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks
  2018-03-27  6:55   ` [dm-devel] " Eric Biggers
@ 2018-03-27  8:05     ` Milan Broz
  2018-03-27  9:09       ` yael.chemla
  2018-03-27  8:50     ` yael.chemla
  2018-04-25 15:13     ` yael.chemla
  2 siblings, 1 reply; 14+ messages in thread
From: Milan Broz @ 2018-03-27  8:05 UTC (permalink / raw)
  To: Eric Biggers, Yael Chemla, Mike Snitzer
  Cc: Alasdair Kergon, dm-devel, linux-kernel, ofir.drang, Yael Chemla,
	linux-crypto, gilad

Mike and others,

did anyone even try to run veritysetup tests?

We have verity-compat-test in our testsuite, is has even basic FEC tests included.

We just added userspace verification of FEC RS codes to compare if kernel behaves the same.

I tried to apply three last dm-verity patches from your tree to Linus mainline.

It does even pass the *first* line of the test script and blocks the kernel forever...
(Running on 32bit Intel VM.)

*NACK* to the last two dm-verity patches.

(The "validate hashes once" is ok, despite I really do not like this approach...)

And comments from Eric are very valid as well, I think all this need to be fixed
before it can go to mainline.

Thanks,
Milan

On 03/27/2018 08:55 AM, Eric Biggers wrote:
> [+Cc linux-crypto]
> 
> Hi Yael,
> 
> On Sun, Mar 25, 2018 at 07:41:30PM +0100, Yael Chemla wrote:
>>  Allow parallel processing of bio blocks by moving to async. completion
>>  handling. This allows for better resource utilization of both HW and
>>  software based hash tfm and therefore better performance in many cases,
>>  depending on the specific tfm in use.
>>  
>>  Tested on ARM32 (zynq board) and ARM64 (Juno board).
>>  Time of cat command was measured on a filesystem with various file sizes.
>>  12% performance improvement when HW based hash was used (ccree driver).
>>  SW based hash showed less than 1% improvement.
>>  CPU utilization when HW based hash was used presented 10% less context
>>  switch, 4% less cycles and 7% less instructions. No difference in
>>  CPU utilization noticed with SW based hash.
>>  
>> Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>
> 
> Okay, I definitely would like to see dm-verity better support hardware crypto
> accelerators, but these patches were painful to read.
> 
> There are lots of smaller bugs, but the high-level problem which you need to
> address first is that on every bio you are always allocating all the extra
> memory to hold a hash request and scatterlist for every data block.  This will
> not only hurt performance when the hashing is done in software (I'm skeptical
> that your performance numbers are representative of that case), but it will also
> fall apart under memory pressure.  We are trying to get low-end Android devices
> to start using dm-verity, and such devices often have only 1 GB or even only 512
> MB of RAM, so memory allocations are at increased risk of failing.  In fact I'm
> pretty sure you didn't do any proper stress testing of these patches, since the
> first thing they do for every bio is try to allocate a physically contiguous
> array that is nearly as long as the full bio data itself (n_blocks *
> sizeof(struct dm_verity_req_data) = n_blocks * 3264, at least on a 64-bit
> platform, mostly due to the 'struct dm_verity_fec_io'), so potentially up to
> about 1 MB; that's going to fail a lot even on systems with gigabytes of RAM...
> 
> (You also need to verify that your new code is compatible with the forward error
> correction feature, with the "ignore_zero_blocks" option, and with the new
> "check_at_most_once" option.  From my reading of the code, all of those seemed
> broken; the dm_verity_fec_io structures, for example, weren't even being
> initialized...)
> 
> I think you need to take a close look at how dm-crypt handles async crypto
> implementations, since it seems to do it properly without hurting the common
> case where the crypto happens synchronously.  What it does, is it reserves space
> in the per-bio data for a single cipher request.  Then, *only* if the cipher
> implementation actually processes the request asynchronously (as indicated by
> -EINPROGRESS being returned) is a new cipher request allocated dynamically,
> using a mempool (not kmalloc, which is prone to fail).  Note that unlike your
> patches it also properly handles the case where the hardware crypto queue is
> full, as indicated by the cipher implementation returning -EBUSY; in that case,
> dm-crypt waits to start another request until there is space in the queue.
> 
> I think it would be possible to adapt dm-crypt's solution to dm-verity.
> 
> Thanks,
> 
> Eric

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [dm-devel] [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks
  2018-03-27  6:55   ` [dm-devel] " Eric Biggers
  2018-03-27  8:05     ` Milan Broz
@ 2018-03-27  8:50     ` yael.chemla
  2018-04-25 15:13     ` yael.chemla
  2 siblings, 0 replies; 14+ messages in thread
From: yael.chemla @ 2018-03-27  8:50 UTC (permalink / raw)
  To: 'Eric Biggers'
  Cc: 'Mike Snitzer', 'Alasdair Kergon',
	dm-devel, linux-kernel, ofir.drang, 'Yael Chemla',
	linux-crypto, gilad

Hi Eric,
Thanks for the detailed feedback, I'll have a look at how dm-crypt avoids dynamic allocation per-bio, and also do forward error correction tests.
Yael

-----Original Message-----
From: Eric Biggers <ebiggers3@gmail.com> 
Sent: Tuesday, 27 March 2018 9:55
To: Yael Chemla <yael.chemla@foss.arm.com>
Cc: Alasdair Kergon <agk@redhat.com>; Mike Snitzer <snitzer@redhat.com>; dm-devel@redhat.com; linux-kernel@vger.kernel.org; ofir.drang@gmail.com; Yael Chemla <yael.chemla@arm.com>; linux-crypto@vger.kernel.org; gilad@benyossef.com
Subject: Re: [dm-devel] [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks

[+Cc linux-crypto]

Hi Yael,

On Sun, Mar 25, 2018 at 07:41:30PM +0100, Yael Chemla wrote:
>  Allow parallel processing of bio blocks by moving to async. 
> completion  handling. This allows for better resource utilization of 
> both HW and  software based hash tfm and therefore better performance 
> in many cases,  depending on the specific tfm in use.
>  
>  Tested on ARM32 (zynq board) and ARM64 (Juno board).
>  Time of cat command was measured on a filesystem with various file sizes.
>  12% performance improvement when HW based hash was used (ccree driver).
>  SW based hash showed less than 1% improvement.
>  CPU utilization when HW based hash was used presented 10% less 
> context  switch, 4% less cycles and 7% less instructions. No 
> difference in  CPU utilization noticed with SW based hash.
>  
> Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>

Okay, I definitely would like to see dm-verity better support hardware crypto accelerators, but these patches were painful to read.

There are lots of smaller bugs, but the high-level problem which you need to address first is that on every bio you are always allocating all the extra memory to hold a hash request and scatterlist for every data block.  This will not only hurt performance when the hashing is done in software (I'm skeptical that your performance numbers are representative of that case), but it will also fall apart under memory pressure.  We are trying to get low-end Android devices to start using dm-verity, and such devices often have only 1 GB or even only 512 MB of RAM, so memory allocations are at increased risk of failing.  In fact I'm pretty sure you didn't do any proper stress testing of these patches, since the first thing they do for every bio is try to allocate a physically contiguous array that is nearly as long as the full bio data itself (n_blocks * sizeof(struct dm_verity_req_data) = n_blocks * 3264, at least on a 64-bit platform, mostly due to the 'struct dm_verity_fec_io'), so potentially up to about 1 MB; that's going to fail a lot even on systems with gigabytes of RAM...

(You also need to verify that your new code is compatible with the forward error correction feature, with the "ignore_zero_blocks" option, and with the new "check_at_most_once" option.  From my reading of the code, all of those seemed broken; the dm_verity_fec_io structures, for example, weren't even being
initialized...)

I think you need to take a close look at how dm-crypt handles async crypto implementations, since it seems to do it properly without hurting the common case where the crypto happens synchronously.  What it does, is it reserves space in the per-bio data for a single cipher request.  Then, *only* if the cipher implementation actually processes the request asynchronously (as indicated by -EINPROGRESS being returned) is a new cipher request allocated dynamically, using a mempool (not kmalloc, which is prone to fail).  Note that unlike your patches it also properly handles the case where the hardware crypto queue is full, as indicated by the cipher implementation returning -EBUSY; in that case, dm-crypt waits to start another request until there is space in the queue.

I think it would be possible to adapt dm-crypt's solution to dm-verity.

Thanks,

Eric

> ---
>  drivers/md/dm-verity-fec.c    |  10 +-
>  drivers/md/dm-verity-fec.h    |   7 +-
>  drivers/md/dm-verity-target.c | 215 +++++++++++++++++++++++++++++++-----------
>  drivers/md/dm-verity.h        |   4 +-
>  4 files changed, 173 insertions(+), 63 deletions(-)
> 
> diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c 
> index e13f908..bcea307 100644
> --- a/drivers/md/dm-verity-fec.c
> +++ b/drivers/md/dm-verity-fec.c
> @@ -203,13 +203,12 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io,
>   */
>  static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
>  			 u64 rsb, u64 target, unsigned block_offset,
> -			 int *neras)
> +			 int *neras, struct dm_verity_fec_io *fio)
>  {
>  	bool is_zero;
>  	int i, j, target_index = -1;
>  	struct dm_buffer *buf;
>  	struct dm_bufio_client *bufio;
> -	struct dm_verity_fec_io *fio = fec_io(io);
>  	u64 block, ileaved;
>  	u8 *bbuf, *rs_block;
>  	u8 want_digest[v->digest_size];
> @@ -265,7 +264,7 @@ static int fec_read_bufs(struct dm_verity *v, 
> struct dm_verity_io *io,
>  
>  		/* locate erasures if the block is on the data device */
>  		if (bufio == v->fec->data_bufio &&
> -		    verity_hash_for_block(v, io, block, want_digest,
> +		    verity_hash_for_block(v, io, block, want_digest, fio,
>  					  &is_zero) == 0) {
>  			/* skip known zero blocks entirely */
>  			if (is_zero)
> @@ -374,7 +373,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
>  		fec_init_bufs(v, fio);
>  
>  		r = fec_read_bufs(v, io, rsb, offset, pos,
> -				  use_erasures ? &neras : NULL);
> +				  use_erasures ? &neras : NULL, fio);
>  		if (unlikely(r < 0))
>  			return r;
>  
> @@ -419,10 +418,9 @@ static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data,
>   */
>  int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
>  		      enum verity_block_type type, sector_t block, u8 *dest,
> -		      struct bvec_iter *iter)
> +		      struct bvec_iter *iter, struct dm_verity_fec_io *fio)
>  {
>  	int r;
> -	struct dm_verity_fec_io *fio = fec_io(io);
>  	u64 offset, res, rsb;
>  
>  	if (!verity_fec_is_enabled(v))
> diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h 
> index bb31ce8..46c2f47 100644
> --- a/drivers/md/dm-verity-fec.h
> +++ b/drivers/md/dm-verity-fec.h
> @@ -73,7 +73,9 @@ extern bool verity_fec_is_enabled(struct dm_verity 
> *v);
>  
>  extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
>  			     enum verity_block_type type, sector_t block,
> -			     u8 *dest, struct bvec_iter *iter);
> +			     u8 *dest, struct bvec_iter *iter,
> +			     struct dm_verity_fec_io *fio);
> +
>  
>  extern unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz,
>  					char *result, unsigned maxlen);
> @@ -104,7 +106,8 @@ static inline int verity_fec_decode(struct dm_verity *v,
>  				    struct dm_verity_io *io,
>  				    enum verity_block_type type,
>  				    sector_t block, u8 *dest,
> -				    struct bvec_iter *iter)
> +				    struct bvec_iter *iter,
> +				    struct dm_verity_fec_io *fio)
>  {
>  	return -EOPNOTSUPP;
>  }
> diff --git a/drivers/md/dm-verity-target.c 
> b/drivers/md/dm-verity-target.c index a281b83..30512ee 100644
> --- a/drivers/md/dm-verity-target.c
> +++ b/drivers/md/dm-verity-target.c
> @@ -38,7 +38,35 @@
>  /* only two elements in static scatter list: salt and data */
>  #define SG_FIXED_ITEMS	2
>  
> +struct dm_ver_io_data {
> +	atomic_t expected_reqs;
> +	atomic_t err;
> +	int total_reqs;
> +	struct dm_ver_req_data *reqdata_arr;
> +	struct dm_verity_io *io;
> +};
> +
> +#define MAX_DIGEST_SIZE 64 /* as big as sha512 digest size */
> +
> +struct dm_ver_req_data {
> +	u8 want_digest[MAX_DIGEST_SIZE];
> +	u8 real_digest[MAX_DIGEST_SIZE];
> +	struct dm_verity_fec_io fec_io;
> +	unsigned int iblock;	// block index in the request
> +	unsigned int digest_size;
> +	struct scatterlist *sg;
> +	struct dm_ver_io_data *iodata;
> +	/* req field is the last on purpose since it's not fixed in size and
> +	 *  its size if calucluated using ahash_request_alloc or an addition of
> +	 *  the required size is done with +crypto_ahash_reqsize(v->tfm)
> +	 */
> +	struct ahash_request *req;
> +};
> +
>  static unsigned dm_verity_prefetch_cluster = 
> DM_VERITY_DEFAULT_PREFETCH_SIZE;
> +static void verity_finish_io(struct dm_verity_io *io, blk_status_t 
> +status); static void verity_release_req(struct dm_ver_io_data 
> +*iodata);
> +
>  
>  module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, 
> uint, S_IRUGO | S_IWUSR);
>  
> @@ -102,7 +130,7 @@ static sector_t verity_position_at_level(struct 
> dm_verity *v, sector_t block,
>  
>  /*
>   * verity_is_salt_required - check if according to verity version and
> - * verity salt's size if there's a need to insert a salt.
> + * verity salt's size there's a need to insert a salt.
>   * note: salt goes last for 0th version and first for all others
>   * @where - START_SG - before buffer / END_SG - after buffer
>   */
> @@ -247,7 +275,7 @@ static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
>   */
>  static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
>  			       sector_t block, int level, bool skip_unverified,
> -			       u8 *want_digest)
> +			       u8 *want_digest, struct dm_verity_fec_io *fec_io)
>  {
>  	struct dm_buffer *buf;
>  	struct buffer_aux *aux;
> @@ -281,7 +309,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
>  			aux->hash_verified = 1;
>  		else if (verity_fec_decode(v, io,
>  					   DM_VERITY_BLOCK_TYPE_METADATA,
> -					   hash_block, data, NULL) == 0)
> +					   hash_block, data, NULL, fec_io) == 0)
>  			aux->hash_verified = 1;
>  		else if (verity_handle_err(v,
>  					   DM_VERITY_BLOCK_TYPE_METADATA, @@ -305,7 +333,9 @@ static int 
> verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
>   * of the hash tree if necessary.
>   */
>  int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
> -			  sector_t block, u8 *digest, bool *is_zero)
> +			  sector_t block, u8 *digest,
> +			  struct dm_verity_fec_io *fec_io,
> +			  bool *is_zero)
>  {
>  	int r = 0, i;
>  
> @@ -317,7 +347,7 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
>  		 * function returns 1 and we fall back to whole
>  		 * chain verification.
>  		 */
> -		r = verity_verify_level(v, io, block, 0, true, digest);
> +		r = verity_verify_level(v, io, block, 0, true, digest, fec_io);
>  		if (likely(r <= 0))
>  			goto out;
>  	}
> @@ -325,7 +355,7 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
>  	memcpy(digest, v->root_digest, v->digest_size);
>  
>  	for (i = v->levels - 1; i >= 0; i--) {
> -		r = verity_verify_level(v, io, block, i, false, digest);
> +		r = verity_verify_level(v, io, block, i, false, digest, fec_io);
>  		if (unlikely(r))
>  			goto out;
>  	}
> @@ -436,39 +466,118 @@ static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io,
>  	return 0;
>  }
>  
> +
> +static void verity_cb_complete(struct dm_ver_io_data *iodata, int 
> +err) {
> +	struct dm_verity_io *io = iodata->io;
> +
> +	/* save last error occurred */
> +	if (err)
> +		atomic_set(&iodata->err, err);
> +	if (atomic_dec_and_test(&iodata->expected_reqs)) {
> +		verity_release_req(iodata);
> +		verity_finish_io(io, errno_to_blk_status(err));
> +	}
> +}
> +
> +static void __single_block_req_done(struct dm_ver_req_data *req_data,
> +					int err, struct dm_verity_io *io) {
> +	struct dm_verity *v = io->v;
> +
> +	if (err == -EINPROGRESS)
> +		return;
> +
> +	WARN_ON((err != 0) || (req_data == NULL) || (req_data->iodata == NULL));
> +	if ((err != 0) || (req_data == NULL) || (req_data->iodata == NULL))
> +		goto complete;
> +
> +	kfree(req_data->sg);
> +
> +	if (likely(memcmp(req_data->real_digest,
> +			req_data->want_digest, req_data->digest_size) == 0))
> +		goto complete;
> +	else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
> +			io->block + req_data->iblock, NULL, &io->iter,
> +			&req_data->fec_io) == 0){
> +		goto complete;
> +	} else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
> +		req_data->iodata->io->block + req_data->iblock)){
> +		err = -EIO;
> +		goto complete;
> +	}
> +
> +complete:
> +	ahash_request_free(req_data->req);
> +	verity_cb_complete(req_data->iodata, err); }
> +
> +static void single_block_req_done(struct crypto_async_request *req, 
> +int err) {
> +	struct dm_ver_req_data *req_data = req->data;
> +
> +	__single_block_req_done(req_data, err, req_data->iodata->io); }
> +
> +static void verity_release_req(struct dm_ver_io_data *iodata) {
> +	kfree(iodata->reqdata_arr);
> +	kfree(iodata);
> +}
>  /*
>   * Verify one "dm_verity_io" structure.
>   */
> -static int verity_verify_io(struct dm_verity_io *io)
> +static void verity_verify_io(struct dm_verity_io *io)
>  {
>  	bool is_zero;
>  	struct dm_verity *v = io->v;
> -	struct bvec_iter start;
> -	unsigned b;
> -	struct crypto_wait wait;
> -	struct scatterlist *sg;
> -	int r;
> +	unsigned int b = 0, blocks = 0;
> +	struct dm_ver_io_data *iodata = NULL;
> +	struct dm_ver_req_data *reqdata_arr = NULL;
> +	struct scatterlist *sg = NULL;
> +	int res;
> +
> +	iodata = kmalloc(sizeof(*iodata), GFP_KERNEL);
> +	reqdata_arr = kmalloc_array(io->n_blocks,
> +			sizeof(struct dm_ver_req_data), GFP_KERNEL);
> +	if (unlikely((iodata == NULL) || (reqdata_arr == NULL))) {
> +		WARN_ON((iodata == NULL) || (reqdata_arr == NULL));
> +		goto err_memfree;
> +	}
> +	atomic_set(&iodata->expected_reqs, io->n_blocks);
> +	iodata->reqdata_arr = reqdata_arr;
> +	iodata->io = io;
> +	iodata->total_reqs = blocks = io->n_blocks;
> +
>  
> -	for (b = 0; b < io->n_blocks; b++) {
> +	for (b = 0; b < blocks; b++) {
>  		unsigned int nents;
>  		unsigned int total_len = 0;
>  		unsigned int num_of_buffs = 0;
> -		struct ahash_request *req = verity_io_hash_req(v, io);
>  
> -		/* an extra one for the salt buffer */
> +		reqdata_arr[b].req = ahash_request_alloc(v->tfm, GFP_KERNEL);
> +
> +		if (unlikely(reqdata_arr[b].req == NULL))
> +			goto err_memfree;
> +		/* +1 for the salt buffer */
> +		ahash_request_set_tfm(reqdata_arr[b].req, v->tfm);
>  		num_of_buffs = verity_calc_buffs_for_bv(v, io, io->iter) + 1;
>  		WARN_ON(num_of_buffs < 1);
>  
>  		sg = kmalloc_array(num_of_buffs, sizeof(struct scatterlist),
>  				     GFP_KERNEL);
> -		if (!sg)
> -			return -ENOMEM;
> +		if (!sg) {
> +			DMERR("%s kmalloc_array failed", __func__);
> +			goto err_memfree;
> +		}
> +
>  		sg_init_table(sg, num_of_buffs);
>  
> -		r = verity_hash_for_block(v, io, io->block + b,
> -					  verity_io_want_digest(v, io),
> -					  &is_zero);
> -		if (unlikely(r < 0))
> +		res = verity_hash_for_block(v, io, io->block + b,
> +					    reqdata_arr[b].want_digest,
> +					    &reqdata_arr[b].fec_io,
> +					    &is_zero);
> +		if (unlikely(res < 0))
>  			goto err_memfree;
>  
>  		if (is_zero) {
> @@ -476,56 +585,54 @@ static int verity_verify_io(struct dm_verity_io *io)
>  			 * If we expect a zero block, don't validate, just
>  			 * return zeros.
>  			 */
> -			r = verity_for_bv_block(v, io, &io->iter,
> +			res = verity_for_bv_block(v, io, &io->iter,
>  						verity_bv_zero);
> -			if (unlikely(r < 0))
> +			if (unlikely(res < 0))
>  				goto err_memfree;
> -
> +			verity_cb_complete(reqdata_arr[b].iodata, res);
>  			continue;
>  		}
>  
> -		ahash_request_set_tfm(req, v->tfm);
> -		ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
> -					CRYPTO_TFM_REQ_MAY_BACKLOG,
> -					crypto_req_done, (void *)&wait);
>  		nents = 0;
>  		total_len = 0;
>  		if (verity_is_salt_required(v, START_SG))
>  			verity_add_salt(v, sg, &nents, &total_len);
>  
> -		start = io->iter;
> -		verity_for_io_block(v, io, &io->iter, sg, &nents, &total_len);
> +		verity_for_io_block(v, io, &io->iter, sg, &nents,
> +				&total_len);
>  		if (verity_is_salt_required(v, END_SG))
>  			verity_add_salt(v, sg, &nents, &total_len);
> -		/*
> -		 * need to mark end of chain, since we might have allocated
> -		 * more than we actually use
> +		reqdata_arr[b].iodata = iodata;
> +		reqdata_arr[b].sg = sg;
> +		reqdata_arr[b].digest_size = v->digest_size;
> +		reqdata_arr[b].iblock = b;
> +		/* need to mark end of chain,
> +		 * since we might have allocated more than we actually use
>  		 */
>  		sg_mark_end(&sg[nents-1]);
> -		ahash_request_set_crypt(req, sg, verity_io_real_digest(v, io),
> -				total_len);
> -		crypto_init_wait(&wait);
> -		r = crypto_wait_req(crypto_ahash_digest(req), &wait);
>  
> -		if (unlikely(r < 0))
> -			goto err_memfree;
> -		kfree(sg);
> -		if (likely(memcmp(verity_io_real_digest(v, io),
> -				  verity_io_want_digest(v, io), v->digest_size) == 0))
> -			continue;
> -		else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
> -					   io->block + b, NULL, &start) == 0)
> -			continue;
> -		else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
> -					   io->block + b))
> -			return -EIO;
> -	}
> +		ahash_request_set_tfm(reqdata_arr[b].req, v->tfm);
> +		ahash_request_set_callback(reqdata_arr[b].req,
> +			CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG,
> +			single_block_req_done, (void *)&reqdata_arr[b]);
>  
> -	return 0;
> +		ahash_request_set_crypt(reqdata_arr[b].req, sg,
> +			reqdata_arr[b].real_digest, total_len);
> +		res = crypto_ahash_digest(reqdata_arr[b].req);
> +		/* digest completed already, callback won't be called. */
> +		if (res == 0)
> +			__single_block_req_done(&reqdata_arr[b], res, io);
> +
> +	}
> +	return;
>  
>  err_memfree:
> -	kfree(sg);
> -	return r;
> +	DMERR("%s: error occurred", __func__);
> +	/* reduce expected requests by the number of
> +	 * unsent requests, -1 accounting for the current block
> +	 */
> +	atomic_sub(blocks-b-1, &iodata->expected_reqs);
> +	verity_cb_complete(iodata, -EIO);
>  }
>  
>  /*
> @@ -548,7 +655,7 @@ static void verity_work(struct work_struct *w)  {
>  	struct dm_verity_io *io = container_of(w, struct dm_verity_io, 
> work);
>  
> -	verity_finish_io(io, errno_to_blk_status(verity_verify_io(io)));
> +	verity_verify_io(io);
>  }
>  
>  static void verity_end_io(struct bio *bio) diff --git 
> a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 
> b675bc0..0e407f6 100644
> --- a/drivers/md/dm-verity.h
> +++ b/drivers/md/dm-verity.h
> @@ -30,6 +30,7 @@ enum verity_block_type {  };
>  
>  struct dm_verity_fec;
> +struct dm_verity_fec_io;
>  
>  struct dm_verity {
>  	struct dm_dev *data_dev;
> @@ -124,6 +125,7 @@ extern int verity_hash(struct dm_verity *v, struct ahash_request *req,
>  		       const u8 *data, size_t len, u8 *digest);
>  
>  extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
> -				 sector_t block, u8 *digest, bool *is_zero);
> +				 sector_t block, u8 *digest,
> +				 struct dm_verity_fec_io *fio, bool *is_zero);
>  
>  #endif /* DM_VERITY_H */
> --
> 2.7.4
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks
  2018-03-27  1:06   ` Mike Snitzer
  2018-03-27  6:52     ` Gilad Ben-Yossef
@ 2018-03-27  8:55     ` yael.chemla
  2018-03-27 13:16       ` Mike Snitzer
  1 sibling, 1 reply; 14+ messages in thread
From: yael.chemla @ 2018-03-27  8:55 UTC (permalink / raw)
  To: 'Mike Snitzer'
  Cc: 'Alasdair Kergon',
	dm-devel, linux-kernel, ofir.drang, 'Yael Chemla',
	'Eric Biggers'

Hi Mike
I need to rewrite these patches according to issues you and Eric Biggers mentioned.
please drop this v1 patch.
Thank you,
Yael

-----Original Message-----
From: Mike Snitzer <snitzer@redhat.com> 
Sent: Tuesday, 27 March 2018 4:07
To: Yael Chemla <yael.chemla@foss.arm.com>
Cc: Alasdair Kergon <agk@redhat.com>; dm-devel@redhat.com; linux-kernel@vger.kernel.org; ofir.drang@gmail.com; Yael Chemla <yael.chemla@arm.com>
Subject: Re: [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks

On Sun, Mar 25 2018 at  2:41pm -0400,
Yael Chemla <yael.chemla@foss.arm.com> wrote:

>  Allow parallel processing of bio blocks by moving to async. 
> completion  handling. This allows for better resource utilization of 
> both HW and  software based hash tfm and therefore better performance 
> in many cases,  depending on the specific tfm in use.
>  
>  Tested on ARM32 (zynq board) and ARM64 (Juno board).
>  Time of cat command was measured on a filesystem with various file sizes.
>  12% performance improvement when HW based hash was used (ccree driver).
>  SW based hash showed less than 1% improvement.
>  CPU utilization when HW based hash was used presented 10% less 
> context  switch, 4% less cycles and 7% less instructions. No 
> difference in  CPU utilization noticed with SW based hash.
>  
> Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>

This one had various issues.  I've fixed most of what I saw and staged in linux-next (purely for build test coverage purposes).  I may drop this patch if others disagree with it (or my sg deallocation in the error path question isn't answered).

I've staged the changes here (and in linux-next via 'for-next'):
https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/log/?h=dm-4.17

I switched all the new GFP_KERNEL uses to GFP_NOIO.  The fact that you're doing allocations at all (per IO) is bad enough.  Using GFP_KERNEL is a serious liability (risk of deadlock if dm-verity were to be used for something like.. swap.. weird setup but possible).

But the gfp flags aside, the need for additional memory and the expectation of scalable async parallel IO is potentially at odds with changes like this (that I just staged, and had to rebase your 2 patches ontop of):
https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=dm-4.17&id=a89f6a2cfec86fba7a115642ff082cb4e9450ea6

So I'm particularly interested to hear from google folks to understand if they are OK with your proposed verity async crypto API use.

Mike

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [dm-devel] [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks
  2018-03-27  8:05     ` Milan Broz
@ 2018-03-27  9:09       ` yael.chemla
  0 siblings, 0 replies; 14+ messages in thread
From: yael.chemla @ 2018-03-27  9:09 UTC (permalink / raw)
  To: 'Milan Broz', 'Eric Biggers', 'Mike Snitzer'
  Cc: 'Alasdair Kergon',
	dm-devel, linux-kernel, ofir.drang, 'Yael Chemla',
	linux-crypto, gilad

Hi Milan,
I will run veritysetup test on next version of these patches and contact you about verity-compat-test testsuits.
Thank you,
Yael

-----Original Message-----
From: Milan Broz <gmazyland@gmail.com> 
Sent: Tuesday, 27 March 2018 11:05
To: Eric Biggers <ebiggers3@gmail.com>; Yael Chemla <yael.chemla@foss.arm.com>; Mike Snitzer <snitzer@redhat.com>
Cc: Alasdair Kergon <agk@redhat.com>; dm-devel@redhat.com; linux-kernel@vger.kernel.org; ofir.drang@gmail.com; Yael Chemla <yael.chemla@arm.com>; linux-crypto@vger.kernel.org; gilad@benyossef.com
Subject: Re: [dm-devel] [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks

Mike and others,

did anyone even try to run veritysetup tests?

We have verity-compat-test in our testsuite, is has even basic FEC tests included.

We just added userspace verification of FEC RS codes to compare if kernel behaves the same.

I tried to apply three last dm-verity patches from your tree to Linus mainline.

It does not even pass the *first* line of the test script and blocks the kernel forever...
(Running on 32bit Intel VM.)

*NACK* to the last two dm-verity patches.

(The "validate hashes once" is ok, despite I really do not like this approach...)

And comments from Eric are very valid as well, I think all this need to be fixed before it can go to mainline.

Thanks,
Milan

On 03/27/2018 08:55 AM, Eric Biggers wrote:
> [+Cc linux-crypto]
> 
> Hi Yael,
> 
> On Sun, Mar 25, 2018 at 07:41:30PM +0100, Yael Chemla wrote:
>>  Allow parallel processing of bio blocks by moving to async. 
>> completion  handling. This allows for better resource utilization of 
>> both HW and  software based hash tfm and therefore better performance 
>> in many cases,  depending on the specific tfm in use.
>>  
>>  Tested on ARM32 (zynq board) and ARM64 (Juno board).
>>  Time of cat command was measured on a filesystem with various file sizes.
>>  12% performance improvement when HW based hash was used (ccree driver).
>>  SW based hash showed less than 1% improvement.
>>  CPU utilization when HW based hash was used presented 10% less 
>> context  switch, 4% less cycles and 7% less instructions. No 
>> difference in  CPU utilization noticed with SW based hash.
>>  
>> Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>
> 
> Okay, I definitely would like to see dm-verity better support hardware 
> crypto accelerators, but these patches were painful to read.
> 
> There are lots of smaller bugs, but the high-level problem which you 
> need to address first is that on every bio you are always allocating 
> all the extra memory to hold a hash request and scatterlist for every 
> data block.  This will not only hurt performance when the hashing is 
> done in software (I'm skeptical that your performance numbers are 
> representative of that case), but it will also fall apart under memory 
> pressure.  We are trying to get low-end Android devices to start using 
> dm-verity, and such devices often have only 1 GB or even only 512 MB 
> of RAM, so memory allocations are at increased risk of failing.  In 
> fact I'm pretty sure you didn't do any proper stress testing of these 
> patches, since the first thing they do for every bio is try to 
> allocate a physically contiguous array that is nearly as long as the 
> full bio data itself (n_blocks * sizeof(struct dm_verity_req_data) = 
> n_blocks * 3264, at least on a 64-bit platform, mostly due to the 'struct dm_verity_fec_io'), so potentially up to about 1 MB; that's going to fail a lot even on systems with gigabytes of RAM...
> 
> (You also need to verify that your new code is compatible with the 
> forward error correction feature, with the "ignore_zero_blocks" 
> option, and with the new "check_at_most_once" option.  From my reading 
> of the code, all of those seemed broken; the dm_verity_fec_io 
> structures, for example, weren't even being
> initialized...)
> 
> I think you need to take a close look at how dm-crypt handles async 
> crypto implementations, since it seems to do it properly without 
> hurting the common case where the crypto happens synchronously.  What 
> it does, is it reserves space in the per-bio data for a single cipher 
> request.  Then, *only* if the cipher implementation actually processes 
> the request asynchronously (as indicated by -EINPROGRESS being 
> returned) is a new cipher request allocated dynamically, using a 
> mempool (not kmalloc, which is prone to fail).  Note that unlike your 
> patches it also properly handles the case where the hardware crypto 
> queue is full, as indicated by the cipher implementation returning -EBUSY; in that case, dm-crypt waits to start another request until there is space in the queue.
> 
> I think it would be possible to adapt dm-crypt's solution to dm-verity.
> 
> Thanks,
> 
> Eric

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks
  2018-03-27  8:55     ` yael.chemla
@ 2018-03-27 13:16       ` Mike Snitzer
  2018-03-27 21:27         ` yael.chemla
  0 siblings, 1 reply; 14+ messages in thread
From: Mike Snitzer @ 2018-03-27 13:16 UTC (permalink / raw)
  To: yael.chemla
  Cc: 'Alasdair Kergon',
	dm-devel, linux-kernel, ofir.drang, 'Yael Chemla',
	'Eric Biggers'

On Tue, Mar 27 2018 at  4:55am -0400,
yael.chemla@foss.arm.com <yael.chemla@foss.arm.com> wrote:

> Hi Mike
> I need to rewrite these patches according to issues you and Eric Biggers mentioned.
> please drop this v1 patch.

They've been dropped.  BUT please do note that the patches I pushed to
linux-dm.git were rebased ontop of the 'check_at_most_once' patch.

I never did get an answer about how the sg array is free'd in certain
error paths (see "FIXME:" in the 2nd patch).

Also, I fixed some issues I saw in error paths, and lots of formatting.

I'll be pretty frustrated if you submit v2 that is blind to the kinds of
changes I made.

I'll send you a private copy of the patches just so you have them for
your reference.

Thanks,
Mike


> -----Original Message-----
> From: Mike Snitzer <snitzer@redhat.com> 
> Sent: Tuesday, 27 March 2018 4:07
> To: Yael Chemla <yael.chemla@foss.arm.com>
> Cc: Alasdair Kergon <agk@redhat.com>; dm-devel@redhat.com; linux-kernel@vger.kernel.org; ofir.drang@gmail.com; Yael Chemla <yael.chemla@arm.com>
> Subject: Re: [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks
> 
> On Sun, Mar 25 2018 at  2:41pm -0400,
> Yael Chemla <yael.chemla@foss.arm.com> wrote:
> 
> >  Allow parallel processing of bio blocks by moving to async. 
> > completion  handling. This allows for better resource utilization of 
> > both HW and  software based hash tfm and therefore better performance 
> > in many cases,  depending on the specific tfm in use.
> >  
> >  Tested on ARM32 (zynq board) and ARM64 (Juno board).
> >  Time of cat command was measured on a filesystem with various file sizes.
> >  12% performance improvement when HW based hash was used (ccree driver).
> >  SW based hash showed less than 1% improvement.
> >  CPU utilization when HW based hash was used presented 10% less 
> > context  switch, 4% less cycles and 7% less instructions. No 
> > difference in  CPU utilization noticed with SW based hash.
> >  
> > Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>
> 
> This one had various issues.  I've fixed most of what I saw and staged in linux-next (purely for build test coverage purposes).  I may drop this patch if others disagree with it (or my sg deallocation in the error path question isn't answered).
> 
> I've staged the changes here (and in linux-next via 'for-next'):
> https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/log/?h=dm-4.17
> 
> I switched all the new GFP_KERNEL uses to GFP_NOIO.  The fact that you're doing allocations at all (per IO) is bad enough.  Using GFP_KERNEL is a serious liability (risk of deadlock if dm-verity were to be used for something like.. swap.. weird setup but possible).
> 
> But the gfp flags aside, the need for additional memory and the expectation of scalable async parallel IO is potentially at odds with changes like this (that I just staged, and had to rebase your 2 patches ontop of):
> https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=dm-4.17&id=a89f6a2cfec86fba7a115642ff082cb4e9450ea6
> 
> So I'm particularly interested to hear from google folks to understand if they are OK with your proposed verity async crypto API use.
> 
> Mike
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks
  2018-03-27 13:16       ` Mike Snitzer
@ 2018-03-27 21:27         ` yael.chemla
  0 siblings, 0 replies; 14+ messages in thread
From: yael.chemla @ 2018-03-27 21:27 UTC (permalink / raw)
  To: 'Mike Snitzer'
  Cc: 'Alasdair Kergon',
	dm-devel, linux-kernel, ofir.drang, 'Yael Chemla',
	'Eric Biggers'



> -----Original Message-----
> From: Mike Snitzer <snitzer@redhat.com>
> Sent: Tuesday, 27 March 2018 16:17
> To: yael.chemla@foss.arm.com
> Cc: 'Alasdair Kergon' <agk@redhat.com>; dm-devel@redhat.com; linux-
> kernel@vger.kernel.org; ofir.drang@gmail.com; 'Yael Chemla'
> <yael.chemla@arm.com>; 'Eric Biggers' <ebiggers3@gmail.com>
> Subject: Re: [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks
> 
> On Tue, Mar 27 2018 at  4:55am -0400,
> yael.chemla@foss.arm.com <yael.chemla@foss.arm.com> wrote:
> 
> > Hi Mike
> > I need to rewrite these patches according to issues you and Eric Biggers
> mentioned.
> > please drop this v1 patch.
> 
> They've been dropped.  BUT please do note that the patches I pushed to
> linux-dm.git were rebased ontop of the 'check_at_most_once' patch.

Thank you so much for many style, formatting and other issues fixes and also for
integration of 'check_at_most_once' patch, it saved me several review iterations.

> 
> I never did get an answer about how the sg array is free'd in certain error
> paths (see "FIXME:" in the 2nd patch).
> 

Regarding free of sg in two error paths, you were correct.
I fixed it by placing several error labels to differentiate each handling.
I also noted that reqdata_arr[b].req was not released properly, this is also fixed.
following is a diff of my fix based on your modifications.
(I can send it in a patch format, but it doesn't include a fix for Eric Biggers comments)


@@ -573,10 +573,9 @@ static void verity_verify_io(struct dm_verity_io *io)
                        verity_bv_skip_block(v, io, &io->iter);
                        continue;
                }
-
                reqdata_arr[b].req = ahash_request_alloc(v->tfm, GFP_NOIO);
                if (unlikely(reqdata_arr[b].req == NULL))
-                       goto err_memfree;
+                       goto err_mem_req;
                ahash_request_set_tfm(reqdata_arr[b].req, v->tfm);
 
                /* +1 for the salt buffer */
@@ -586,7 +585,7 @@ static void verity_verify_io(struct dm_verity_io *io)
                                   GFP_NOIO);
                if (!sg) {
                        DMERR_LIMIT("%s: kmalloc_array failed", __func__);
-                       goto err_memfree;
+                       goto err_mem_sg;
                }
                sg_init_table(sg, num_of_buffs);
                // FIXME: if we 'err_memfree' (or continue;) below how does this sg get kfree()'d?
@@ -595,7 +594,7 @@ static void verity_verify_io(struct dm_verity_io *io)
                                          reqdata_arr[b].want_digest,
                                          &reqdata_arr[b].fec_io, &is_zero);
                if (unlikely(r < 0))
-                       goto err_memfree;
+                       goto err_mem;
 
                if (is_zero) {
                        /*
@@ -605,7 +604,7 @@ static void verity_verify_io(struct dm_verity_io *io)
                        r = verity_for_bv_block(v, io, &io->iter,
                                                verity_bv_zero);
                        if (unlikely(r < 0))
-                               goto err_memfree;
+                               goto err_mem;
                        verity_cb_complete(iodata, r);
                        continue;
                }
@@ -644,7 +643,11 @@ static void verity_verify_io(struct dm_verity_io *io)
        }
        return;
 
-err_memfree:
+err_mem:
+       kfree(sg);
+err_mem_sg:
+       ahash_request_free(reqdata_arr[b].req);
+err_mem_req:
        /*
         * reduce expected requests by the number of unsent
         * requests, -1 accounting for the current block
        atomic_sub(blocks - b - 1, &iodata->expected_reqs);
        verity_cb_complete(iodata, -EIO);

> Also, I fixed some issues I saw in error paths, and lots of formatting.
> 
> I'll be pretty frustrated if you submit v2 that is blind to the kinds of changes I
> made.
> 

I took your modifications and working upon it. 

> I'll send you a private copy of the patches just so you have them for your
> reference.
> 
> Thanks,
> Mike
> 
> 
> > -----Original Message-----
> > From: Mike Snitzer <snitzer@redhat.com>
> > Sent: Tuesday, 27 March 2018 4:07
> > To: Yael Chemla <yael.chemla@foss.arm.com>
> > Cc: Alasdair Kergon <agk@redhat.com>; dm-devel@redhat.com;
> > linux-kernel@vger.kernel.org; ofir.drang@gmail.com; Yael Chemla
> > <yael.chemla@arm.com>
> > Subject: Re: [PATCH 2/2] md: dm-verity: allow parallel processing of
> > bio blocks
> >
> > On Sun, Mar 25 2018 at  2:41pm -0400,
> > Yael Chemla <yael.chemla@foss.arm.com> wrote:
> >
> > >  Allow parallel processing of bio blocks by moving to async.
> > > completion  handling. This allows for better resource utilization of
> > > both HW and  software based hash tfm and therefore better
> > > performance in many cases,  depending on the specific tfm in use.
> > >
> > >  Tested on ARM32 (zynq board) and ARM64 (Juno board).
> > >  Time of cat command was measured on a filesystem with various file sizes.
> > >  12% performance improvement when HW based hash was used (ccree
> driver).
> > >  SW based hash showed less than 1% improvement.
> > >  CPU utilization when HW based hash was used presented 10% less
> > > context  switch, 4% less cycles and 7% less instructions. No
> > > difference in  CPU utilization noticed with SW based hash.
> > >
> > > Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>
> >
> > This one had various issues.  I've fixed most of what I saw and staged in
> linux-next (purely for build test coverage purposes).  I may drop this patch if
> others disagree with it (or my sg deallocation in the error path question isn't
> answered).
> >
> > I've staged the changes here (and in linux-next via 'for-next'):
> > https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
> > .git/log/?h=dm-4.17
> >
> > I switched all the new GFP_KERNEL uses to GFP_NOIO.  The fact that you're
> doing allocations at all (per IO) is bad enough.  Using GFP_KERNEL is a serious
> liability (risk of deadlock if dm-verity were to be used for something like..
> swap.. weird setup but possible).
> >
> > But the gfp flags aside, the need for additional memory and the expectation
> of scalable async parallel IO is potentially at odds with changes like this (that I
> just staged, and had to rebase your 2 patches ontop of):
> > https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
> > .git/commit/?h=dm-4.17&id=a89f6a2cfec86fba7a115642ff082cb4e9450ea6
> >
> > So I'm particularly interested to hear from google folks to understand if they
> are OK with your proposed verity async crypto API use.
> >
> > Mike
> >

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [dm-devel] [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks
  2018-03-27  6:55   ` [dm-devel] " Eric Biggers
  2018-03-27  8:05     ` Milan Broz
  2018-03-27  8:50     ` yael.chemla
@ 2018-04-25 15:13     ` yael.chemla
  2 siblings, 0 replies; 14+ messages in thread
From: yael.chemla @ 2018-04-25 15:13 UTC (permalink / raw)
  To: 'Eric Biggers'
  Cc: 'Alasdair Kergon', 'Mike Snitzer',
	dm-devel, linux-kernel, ofir.drang, 'Yael Chemla',
	linux-crypto, gilad



> -----Original Message-----
> From: Eric Biggers <ebiggers3@gmail.com>
> Sent: Tuesday, 27 March 2018 9:55
> To: Yael Chemla <yael.chemla@foss.arm.com>
> Cc: Alasdair Kergon <agk@redhat.com>; Mike Snitzer <snitzer@redhat.com>;
> dm-devel@redhat.com; linux-kernel@vger.kernel.org; ofir.drang@gmail.com;
> Yael Chemla <yael.chemla@arm.com>; linux-crypto@vger.kernel.org;
> gilad@benyossef.com
> Subject: Re: [dm-devel] [PATCH 2/2] md: dm-verity: allow parallel processing
> of bio blocks
> 
> [+Cc linux-crypto]
> 
> Hi Yael,
> 
> On Sun, Mar 25, 2018 at 07:41:30PM +0100, Yael Chemla wrote:
> >  Allow parallel processing of bio blocks by moving to async.
> > completion  handling. This allows for better resource utilization of
> > both HW and  software based hash tfm and therefore better performance
> > in many cases,  depending on the specific tfm in use.
> >
> >  Tested on ARM32 (zynq board) and ARM64 (Juno board).
> >  Time of cat command was measured on a filesystem with various file sizes.
> >  12% performance improvement when HW based hash was used (ccree
> driver).
> >  SW based hash showed less than 1% improvement.
> >  CPU utilization when HW based hash was used presented 10% less
> > context  switch, 4% less cycles and 7% less instructions. No
> > difference in  CPU utilization noticed with SW based hash.
> >
> > Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>
> 
> Okay, I definitely would like to see dm-verity better support hardware crypto
> accelerators, but these patches were painful to read.
> 
> There are lots of smaller bugs, but the high-level problem which you need to
> address first is that on every bio you are always allocating all the extra
> memory to hold a hash request and scatterlist for every data block.  This will

I have a question regarding scatterlist memory:
I noticed that all blocks in dm-verity end up using two buffers: one for data and the other for the salt.
I'm using function similar to verity_for_io_block to iterate and find the number of buffers,
in my case data_dev_block_bits =12, todo=4096, thus the do while will iterate only once.
I assume that since the loop exists, there are cases where it will iterate more than once.
I'm trying to figure out which cases will require more than one buffer of data per block? 
In dm_crypt there is limitation of static 4 scatterlist elements per in/out (see struct dm_crypt_request).
Is there an upper bound regarding number of buffers per block in dm-verity?  
I need this for the implementation of  mempool per scatterlist buffers.
Thanks,
Yael

> not only hurt performance when the hashing is done in software (I'm
> skeptical that your performance numbers are representative of that case), but
> it will also fall apart under memory pressure.  We are trying to get low-end
> Android devices to start using dm-verity, and such devices often have only 1
> GB or even only 512 MB of RAM, so memory allocations are at increased risk
> of failing.  In fact I'm pretty sure you didn't do any proper stress testing of
> these patches, since the first thing they do for every bio is try to allocate a
> physically contiguous array that is nearly as long as the full bio data itself
> (n_blocks * sizeof(struct dm_verity_req_data) = n_blocks * 3264, at least on a
> 64-bit platform, mostly due to the 'struct dm_verity_fec_io'), so potentially
> up to about 1 MB; that's going to fail a lot even on systems with gigabytes of
> RAM...
> 
> (You also need to verify that your new code is compatible with the forward
> error correction feature, with the "ignore_zero_blocks" option, and with the
> new "check_at_most_once" option.  From my reading of the code, all of
> those seemed broken; the dm_verity_fec_io structures, for example, weren't
> even being
> initialized...)
> 
> I think you need to take a close look at how dm-crypt handles async crypto
> implementations, since it seems to do it properly without hurting the
> common case where the crypto happens synchronously.  What it does, is it
> reserves space in the per-bio data for a single cipher request.  Then, *only* if
> the cipher implementation actually processes the request asynchronously (as
> indicated by -EINPROGRESS being returned) is a new cipher request allocated
> dynamically, using a mempool (not kmalloc, which is prone to fail).  Note that
> unlike your patches it also properly handles the case where the hardware
> crypto queue is full, as indicated by the cipher implementation returning -
> EBUSY; in that case, dm-crypt waits to start another request until there is
> space in the queue.
> 
> I think it would be possible to adapt dm-crypt's solution to dm-verity.
> 
> Thanks,
> 
> Eric
> 
> > ---
> >  drivers/md/dm-verity-fec.c    |  10 +-
> >  drivers/md/dm-verity-fec.h    |   7 +-
> >  drivers/md/dm-verity-target.c | 215 +++++++++++++++++++++++++++++++--
> ---------
> >  drivers/md/dm-verity.h        |   4 +-
> >  4 files changed, 173 insertions(+), 63 deletions(-)
> >
> > diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
> > index e13f908..bcea307 100644
> > --- a/drivers/md/dm-verity-fec.c
> > +++ b/drivers/md/dm-verity-fec.c
> > @@ -203,13 +203,12 @@ static int fec_is_erasure(struct dm_verity *v,
> struct dm_verity_io *io,
> >   */
> >  static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
> >  			 u64 rsb, u64 target, unsigned block_offset,
> > -			 int *neras)
> > +			 int *neras, struct dm_verity_fec_io *fio)
> >  {
> >  	bool is_zero;
> >  	int i, j, target_index = -1;
> >  	struct dm_buffer *buf;
> >  	struct dm_bufio_client *bufio;
> > -	struct dm_verity_fec_io *fio = fec_io(io);
> >  	u64 block, ileaved;
> >  	u8 *bbuf, *rs_block;
> >  	u8 want_digest[v->digest_size];
> > @@ -265,7 +264,7 @@ static int fec_read_bufs(struct dm_verity *v,
> > struct dm_verity_io *io,
> >
> >  		/* locate erasures if the block is on the data device */
> >  		if (bufio == v->fec->data_bufio &&
> > -		    verity_hash_for_block(v, io, block, want_digest,
> > +		    verity_hash_for_block(v, io, block, want_digest, fio,
> >  					  &is_zero) == 0) {
> >  			/* skip known zero blocks entirely */
> >  			if (is_zero)
> > @@ -374,7 +373,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct
> dm_verity_io *io,
> >  		fec_init_bufs(v, fio);
> >
> >  		r = fec_read_bufs(v, io, rsb, offset, pos,
> > -				  use_erasures ? &neras : NULL);
> > +				  use_erasures ? &neras : NULL, fio);
> >  		if (unlikely(r < 0))
> >  			return r;
> >
> > @@ -419,10 +418,9 @@ static int fec_bv_copy(struct dm_verity *v, struct
> dm_verity_io *io, u8 *data,
> >   */
> >  int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
> >  		      enum verity_block_type type, sector_t block, u8 *dest,
> > -		      struct bvec_iter *iter)
> > +		      struct bvec_iter *iter, struct dm_verity_fec_io *fio)
> >  {
> >  	int r;
> > -	struct dm_verity_fec_io *fio = fec_io(io);
> >  	u64 offset, res, rsb;
> >
> >  	if (!verity_fec_is_enabled(v))
> > diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
> > index bb31ce8..46c2f47 100644
> > --- a/drivers/md/dm-verity-fec.h
> > +++ b/drivers/md/dm-verity-fec.h
> > @@ -73,7 +73,9 @@ extern bool verity_fec_is_enabled(struct dm_verity
> > *v);
> >
> >  extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
> >  			     enum verity_block_type type, sector_t block,
> > -			     u8 *dest, struct bvec_iter *iter);
> > +			     u8 *dest, struct bvec_iter *iter,
> > +			     struct dm_verity_fec_io *fio);
> > +
> >
> >  extern unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz,
> >  					char *result, unsigned maxlen);
> > @@ -104,7 +106,8 @@ static inline int verity_fec_decode(struct dm_verity
> *v,
> >  				    struct dm_verity_io *io,
> >  				    enum verity_block_type type,
> >  				    sector_t block, u8 *dest,
> > -				    struct bvec_iter *iter)
> > +				    struct bvec_iter *iter,
> > +				    struct dm_verity_fec_io *fio)
> >  {
> >  	return -EOPNOTSUPP;
> >  }
> > diff --git a/drivers/md/dm-verity-target.c
> > b/drivers/md/dm-verity-target.c index a281b83..30512ee 100644
> > --- a/drivers/md/dm-verity-target.c
> > +++ b/drivers/md/dm-verity-target.c
> > @@ -38,7 +38,35 @@
> >  /* only two elements in static scatter list: salt and data */
> >  #define SG_FIXED_ITEMS	2
> >
> > +struct dm_ver_io_data {
> > +	atomic_t expected_reqs;
> > +	atomic_t err;
> > +	int total_reqs;
> > +	struct dm_ver_req_data *reqdata_arr;
> > +	struct dm_verity_io *io;
> > +};
> > +
> > +#define MAX_DIGEST_SIZE 64 /* as big as sha512 digest size */
> > +
> > +struct dm_ver_req_data {
> > +	u8 want_digest[MAX_DIGEST_SIZE];
> > +	u8 real_digest[MAX_DIGEST_SIZE];
> > +	struct dm_verity_fec_io fec_io;
> > +	unsigned int iblock;	// block index in the request
> > +	unsigned int digest_size;
> > +	struct scatterlist *sg;
> > +	struct dm_ver_io_data *iodata;
> > +	/* req field is the last on purpose since it's not fixed in size and
> > +	 *  its size if calucluated using ahash_request_alloc or an addition of
> > +	 *  the required size is done with +crypto_ahash_reqsize(v->tfm)
> > +	 */
> > +	struct ahash_request *req;
> > +};
> > +
> >  static unsigned dm_verity_prefetch_cluster =
> > DM_VERITY_DEFAULT_PREFETCH_SIZE;
> > +static void verity_finish_io(struct dm_verity_io *io, blk_status_t
> > +status); static void verity_release_req(struct dm_ver_io_data
> > +*iodata);
> > +
> >
> >  module_param_named(prefetch_cluster, dm_verity_prefetch_cluster,
> > uint, S_IRUGO | S_IWUSR);
> >
> > @@ -102,7 +130,7 @@ static sector_t verity_position_at_level(struct
> > dm_verity *v, sector_t block,
> >
> >  /*
> >   * verity_is_salt_required - check if according to verity version and
> > - * verity salt's size if there's a need to insert a salt.
> > + * verity salt's size there's a need to insert a salt.
> >   * note: salt goes last for 0th version and first for all others
> >   * @where - START_SG - before buffer / END_SG - after buffer
> >   */
> > @@ -247,7 +275,7 @@ static int verity_handle_err(struct dm_verity *v,
> enum verity_block_type type,
> >   */
> >  static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
> >  			       sector_t block, int level, bool skip_unverified,
> > -			       u8 *want_digest)
> > +			       u8 *want_digest, struct dm_verity_fec_io *fec_io)
> >  {
> >  	struct dm_buffer *buf;
> >  	struct buffer_aux *aux;
> > @@ -281,7 +309,7 @@ static int verity_verify_level(struct dm_verity *v,
> struct dm_verity_io *io,
> >  			aux->hash_verified = 1;
> >  		else if (verity_fec_decode(v, io,
> >
> DM_VERITY_BLOCK_TYPE_METADATA,
> > -					   hash_block, data, NULL) == 0)
> > +					   hash_block, data, NULL, fec_io) == 0)
> >  			aux->hash_verified = 1;
> >  		else if (verity_handle_err(v,
> >
> DM_VERITY_BLOCK_TYPE_METADATA, @@ -305,7 +333,9 @@ static int
> > verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
> >   * of the hash tree if necessary.
> >   */
> >  int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
> > -			  sector_t block, u8 *digest, bool *is_zero)
> > +			  sector_t block, u8 *digest,
> > +			  struct dm_verity_fec_io *fec_io,
> > +			  bool *is_zero)
> >  {
> >  	int r = 0, i;
> >
> > @@ -317,7 +347,7 @@ int verity_hash_for_block(struct dm_verity *v, struct
> dm_verity_io *io,
> >  		 * function returns 1 and we fall back to whole
> >  		 * chain verification.
> >  		 */
> > -		r = verity_verify_level(v, io, block, 0, true, digest);
> > +		r = verity_verify_level(v, io, block, 0, true, digest, fec_io);
> >  		if (likely(r <= 0))
> >  			goto out;
> >  	}
> > @@ -325,7 +355,7 @@ int verity_hash_for_block(struct dm_verity *v, struct
> dm_verity_io *io,
> >  	memcpy(digest, v->root_digest, v->digest_size);
> >
> >  	for (i = v->levels - 1; i >= 0; i--) {
> > -		r = verity_verify_level(v, io, block, i, false, digest);
> > +		r = verity_verify_level(v, io, block, i, false, digest, fec_io);
> >  		if (unlikely(r))
> >  			goto out;
> >  	}
> > @@ -436,39 +466,118 @@ static int verity_bv_zero(struct dm_verity *v,
> struct dm_verity_io *io,
> >  	return 0;
> >  }
> >
> > +
> > +static void verity_cb_complete(struct dm_ver_io_data *iodata, int
> > +err) {
> > +	struct dm_verity_io *io = iodata->io;
> > +
> > +	/* save last error occurred */
> > +	if (err)
> > +		atomic_set(&iodata->err, err);
> > +	if (atomic_dec_and_test(&iodata->expected_reqs)) {
> > +		verity_release_req(iodata);
> > +		verity_finish_io(io, errno_to_blk_status(err));
> > +	}
> > +}
> > +
> > +static void __single_block_req_done(struct dm_ver_req_data *req_data,
> > +					int err, struct dm_verity_io *io) {
> > +	struct dm_verity *v = io->v;
> > +
> > +	if (err == -EINPROGRESS)
> > +		return;
> > +
> > +	WARN_ON((err != 0) || (req_data == NULL) || (req_data->iodata ==
> NULL));
> > +	if ((err != 0) || (req_data == NULL) || (req_data->iodata == NULL))
> > +		goto complete;
> > +
> > +	kfree(req_data->sg);
> > +
> > +	if (likely(memcmp(req_data->real_digest,
> > +			req_data->want_digest, req_data->digest_size) == 0))
> > +		goto complete;
> > +	else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
> > +			io->block + req_data->iblock, NULL, &io->iter,
> > +			&req_data->fec_io) == 0){
> > +		goto complete;
> > +	} else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
> > +		req_data->iodata->io->block + req_data->iblock)){
> > +		err = -EIO;
> > +		goto complete;
> > +	}
> > +
> > +complete:
> > +	ahash_request_free(req_data->req);
> > +	verity_cb_complete(req_data->iodata, err); }
> > +
> > +static void single_block_req_done(struct crypto_async_request *req,
> > +int err) {
> > +	struct dm_ver_req_data *req_data = req->data;
> > +
> > +	__single_block_req_done(req_data, err, req_data->iodata->io); }
> > +
> > +static void verity_release_req(struct dm_ver_io_data *iodata) {
> > +	kfree(iodata->reqdata_arr);
> > +	kfree(iodata);
> > +}
> >  /*
> >   * Verify one "dm_verity_io" structure.
> >   */
> > -static int verity_verify_io(struct dm_verity_io *io)
> > +static void verity_verify_io(struct dm_verity_io *io)
> >  {
> >  	bool is_zero;
> >  	struct dm_verity *v = io->v;
> > -	struct bvec_iter start;
> > -	unsigned b;
> > -	struct crypto_wait wait;
> > -	struct scatterlist *sg;
> > -	int r;
> > +	unsigned int b = 0, blocks = 0;
> > +	struct dm_ver_io_data *iodata = NULL;
> > +	struct dm_ver_req_data *reqdata_arr = NULL;
> > +	struct scatterlist *sg = NULL;
> > +	int res;
> > +
> > +	iodata = kmalloc(sizeof(*iodata), GFP_KERNEL);
> > +	reqdata_arr = kmalloc_array(io->n_blocks,
> > +			sizeof(struct dm_ver_req_data), GFP_KERNEL);
> > +	if (unlikely((iodata == NULL) || (reqdata_arr == NULL))) {
> > +		WARN_ON((iodata == NULL) || (reqdata_arr == NULL));
> > +		goto err_memfree;
> > +	}
> > +	atomic_set(&iodata->expected_reqs, io->n_blocks);
> > +	iodata->reqdata_arr = reqdata_arr;
> > +	iodata->io = io;
> > +	iodata->total_reqs = blocks = io->n_blocks;
> > +
> >
> > -	for (b = 0; b < io->n_blocks; b++) {
> > +	for (b = 0; b < blocks; b++) {
> >  		unsigned int nents;
> >  		unsigned int total_len = 0;
> >  		unsigned int num_of_buffs = 0;
> > -		struct ahash_request *req = verity_io_hash_req(v, io);
> >
> > -		/* an extra one for the salt buffer */
> > +		reqdata_arr[b].req = ahash_request_alloc(v->tfm,
> GFP_KERNEL);
> > +
> > +		if (unlikely(reqdata_arr[b].req == NULL))
> > +			goto err_memfree;
> > +		/* +1 for the salt buffer */
> > +		ahash_request_set_tfm(reqdata_arr[b].req, v->tfm);
> >  		num_of_buffs = verity_calc_buffs_for_bv(v, io, io->iter) + 1;
> >  		WARN_ON(num_of_buffs < 1);
> >
> >  		sg = kmalloc_array(num_of_buffs, sizeof(struct scatterlist),
> >  				     GFP_KERNEL);
> > -		if (!sg)
> > -			return -ENOMEM;
> > +		if (!sg) {
> > +			DMERR("%s kmalloc_array failed", __func__);
> > +			goto err_memfree;
> > +		}
> > +
> >  		sg_init_table(sg, num_of_buffs);
> >
> > -		r = verity_hash_for_block(v, io, io->block + b,
> > -					  verity_io_want_digest(v, io),
> > -					  &is_zero);
> > -		if (unlikely(r < 0))
> > +		res = verity_hash_for_block(v, io, io->block + b,
> > +					    reqdata_arr[b].want_digest,
> > +					    &reqdata_arr[b].fec_io,
> > +					    &is_zero);
> > +		if (unlikely(res < 0))
> >  			goto err_memfree;
> >
> >  		if (is_zero) {
> > @@ -476,56 +585,54 @@ static int verity_verify_io(struct dm_verity_io *io)
> >  			 * If we expect a zero block, don't validate, just
> >  			 * return zeros.
> >  			 */
> > -			r = verity_for_bv_block(v, io, &io->iter,
> > +			res = verity_for_bv_block(v, io, &io->iter,
> >  						verity_bv_zero);
> > -			if (unlikely(r < 0))
> > +			if (unlikely(res < 0))
> >  				goto err_memfree;
> > -
> > +			verity_cb_complete(reqdata_arr[b].iodata, res);
> >  			continue;
> >  		}
> >
> > -		ahash_request_set_tfm(req, v->tfm);
> > -		ahash_request_set_callback(req,
> CRYPTO_TFM_REQ_MAY_SLEEP |
> > -					CRYPTO_TFM_REQ_MAY_BACKLOG,
> > -					crypto_req_done, (void *)&wait);
> >  		nents = 0;
> >  		total_len = 0;
> >  		if (verity_is_salt_required(v, START_SG))
> >  			verity_add_salt(v, sg, &nents, &total_len);
> >
> > -		start = io->iter;
> > -		verity_for_io_block(v, io, &io->iter, sg, &nents, &total_len);
> > +		verity_for_io_block(v, io, &io->iter, sg, &nents,
> > +				&total_len);
> >  		if (verity_is_salt_required(v, END_SG))
> >  			verity_add_salt(v, sg, &nents, &total_len);
> > -		/*
> > -		 * need to mark end of chain, since we might have allocated
> > -		 * more than we actually use
> > +		reqdata_arr[b].iodata = iodata;
> > +		reqdata_arr[b].sg = sg;
> > +		reqdata_arr[b].digest_size = v->digest_size;
> > +		reqdata_arr[b].iblock = b;
> > +		/* need to mark end of chain,
> > +		 * since we might have allocated more than we actually use
> >  		 */
> >  		sg_mark_end(&sg[nents-1]);
> > -		ahash_request_set_crypt(req, sg, verity_io_real_digest(v, io),
> > -				total_len);
> > -		crypto_init_wait(&wait);
> > -		r = crypto_wait_req(crypto_ahash_digest(req), &wait);
> >
> > -		if (unlikely(r < 0))
> > -			goto err_memfree;
> > -		kfree(sg);
> > -		if (likely(memcmp(verity_io_real_digest(v, io),
> > -				  verity_io_want_digest(v, io), v->digest_size)
> == 0))
> > -			continue;
> > -		else if (verity_fec_decode(v, io,
> DM_VERITY_BLOCK_TYPE_DATA,
> > -					   io->block + b, NULL, &start) == 0)
> > -			continue;
> > -		else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
> > -					   io->block + b))
> > -			return -EIO;
> > -	}
> > +		ahash_request_set_tfm(reqdata_arr[b].req, v->tfm);
> > +		ahash_request_set_callback(reqdata_arr[b].req,
> > +			CRYPTO_TFM_REQ_MAY_SLEEP |
> CRYPTO_TFM_REQ_MAY_BACKLOG,
> > +			single_block_req_done, (void *)&reqdata_arr[b]);
> >
> > -	return 0;
> > +		ahash_request_set_crypt(reqdata_arr[b].req, sg,
> > +			reqdata_arr[b].real_digest, total_len);
> > +		res = crypto_ahash_digest(reqdata_arr[b].req);
> > +		/* digest completed already, callback won't be called. */
> > +		if (res == 0)
> > +			__single_block_req_done(&reqdata_arr[b], res, io);
> > +
> > +	}
> > +	return;
> >
> >  err_memfree:
> > -	kfree(sg);
> > -	return r;
> > +	DMERR("%s: error occurred", __func__);
> > +	/* reduce expected requests by the number of
> > +	 * unsent requests, -1 accounting for the current block
> > +	 */
> > +	atomic_sub(blocks-b-1, &iodata->expected_reqs);
> > +	verity_cb_complete(iodata, -EIO);
> >  }
> >
> >  /*
> > @@ -548,7 +655,7 @@ static void verity_work(struct work_struct *w)  {
> >  	struct dm_verity_io *io = container_of(w, struct dm_verity_io,
> > work);
> >
> > -	verity_finish_io(io, errno_to_blk_status(verity_verify_io(io)));
> > +	verity_verify_io(io);
> >  }
> >
> >  static void verity_end_io(struct bio *bio) diff --git
> > a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index
> > b675bc0..0e407f6 100644
> > --- a/drivers/md/dm-verity.h
> > +++ b/drivers/md/dm-verity.h
> > @@ -30,6 +30,7 @@ enum verity_block_type {  };
> >
> >  struct dm_verity_fec;
> > +struct dm_verity_fec_io;
> >
> >  struct dm_verity {
> >  	struct dm_dev *data_dev;
> > @@ -124,6 +125,7 @@ extern int verity_hash(struct dm_verity *v, struct
> ahash_request *req,
> >  		       const u8 *data, size_t len, u8 *digest);
> >
> >  extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io
> *io,
> > -				 sector_t block, u8 *digest, bool *is_zero);
> > +				 sector_t block, u8 *digest,
> > +				 struct dm_verity_fec_io *fio, bool *is_zero);
> >
> >  #endif /* DM_VERITY_H */
> > --
> > 2.7.4
> >
> > --
> > dm-devel mailing list
> > dm-devel@redhat.com
> > https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2018-04-25 15:13 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-25 18:41 [PATCH 1/2] md: dm-verity: aggregate crypto API calls Yael Chemla
2018-03-25 18:41 ` [PATCH 2/2] md: dm-verity: allow parallel processing of bio blocks Yael Chemla
2018-03-26  6:31   ` Gilad Ben-Yossef
2018-03-27  1:06   ` Mike Snitzer
2018-03-27  6:52     ` Gilad Ben-Yossef
2018-03-27  8:55     ` yael.chemla
2018-03-27 13:16       ` Mike Snitzer
2018-03-27 21:27         ` yael.chemla
2018-03-27  6:55   ` [dm-devel] " Eric Biggers
2018-03-27  8:05     ` Milan Broz
2018-03-27  9:09       ` yael.chemla
2018-03-27  8:50     ` yael.chemla
2018-04-25 15:13     ` yael.chemla
2018-03-26  6:31 ` [PATCH 1/2] md: dm-verity: aggregate crypto API calls Gilad Ben-Yossef

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).