Netdev Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH rdma-next 0/5] mlx5 MR cache enhancements
@ 2021-06-22 12:08 Leon Romanovsky
  2021-06-22 12:08 ` [PATCH mlx5-next 1/5] RDMA/mlx5: Replace struct mlx5_core_mkey by u32 key Leon Romanovsky
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Leon Romanovsky @ 2021-06-22 12:08 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, Aharon Landau, Jakub Kicinski, Jason Wang,
	linux-rdma, Michael S. Tsirkin, netdev, Saeed Mahameed,
	Shay Drory, virtualization

From: Leon Romanovsky <leonro@nvidia.com>

Hi,

This series from Aharon changes how the memory region (MR) cache logic
works in the mlx5 driver. The first 3 patches are global conversions of
internal structures to hold mkeys, while the rest of the patches
change the cache logic in the RDMA subsystem.

Thanks

Aharon Landau (5):
  RDMA/mlx5: Replace struct mlx5_core_mkey by u32 key
  RDMA/mlx5: Move struct mlx5_core_mkey to mlx5_ib
  RDMA/mlx5: Change the cache to hold mkeys instead of MRs
  RDMA/mlx5: Change the cache structure to an rbtree
  RDMA/mlx5: Delay the deregistration of a non-cache mkey

 drivers/infiniband/hw/mlx5/devx.c             |   2 +-
 drivers/infiniband/hw/mlx5/main.c             |   4 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h          |  99 ++-
 drivers/infiniband/hw/mlx5/mr.c               | 795 ++++++++++++------
 drivers/infiniband/hw/mlx5/odp.c              |  56 +-
 .../mellanox/mlx5/core/diag/fw_tracer.c       |   2 +-
 .../mellanox/mlx5/core/diag/fw_tracer.h       |   2 +-
 .../mellanox/mlx5/core/diag/rsc_dump.c        |   6 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |   2 +-
 .../net/ethernet/mellanox/mlx5/core/en/ptp.c  |   2 +-
 .../net/ethernet/mellanox/mlx5/core/en/trap.c |   2 +-
 .../ethernet/mellanox/mlx5/core/en_common.c   |   3 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  11 +-
 .../ethernet/mellanox/mlx5/core/fpga/conn.c   |   8 +-
 .../ethernet/mellanox/mlx5/core/fpga/core.h   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/mr.c  |  26 +-
 .../mellanox/mlx5/core/steering/dr_icm_pool.c |   9 +-
 .../mellanox/mlx5/core/steering/dr_send.c     |   9 +-
 .../mellanox/mlx5/core/steering/dr_types.h    |   2 +-
 drivers/vdpa/mlx5/core/mlx5_vdpa.h            |   8 +-
 drivers/vdpa/mlx5/core/mr.c                   |   6 +-
 drivers/vdpa/mlx5/core/resources.c            |  13 +-
 drivers/vdpa/mlx5/net/mlx5_vnet.c             |   2 +-
 include/linux/mlx5/driver.h                   |  30 +-
 24 files changed, 697 insertions(+), 404 deletions(-)

-- 
2.31.1


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH mlx5-next 1/5] RDMA/mlx5: Replace struct mlx5_core_mkey by u32 key
  2021-06-22 12:08 [PATCH rdma-next 0/5] mlx5 MR cache enhancements Leon Romanovsky
@ 2021-06-22 12:08 ` Leon Romanovsky
  2021-07-29 15:28   ` Jason Gunthorpe
  2021-07-29 18:08   ` Jason Gunthorpe
  2021-06-22 12:08 ` [PATCH mlx5-next 2/5] RDMA/mlx5: Move struct mlx5_core_mkey to mlx5_ib Leon Romanovsky
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 12+ messages in thread
From: Leon Romanovsky @ 2021-06-22 12:08 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Aharon Landau, Jakub Kicinski, Jason Wang, linux-kernel,
	linux-rdma, Michael S. Tsirkin, netdev, Saeed Mahameed,
	Shay Drory, virtualization

From: Aharon Landau <aharonl@nvidia.com>

In mlx5_core and vdpa there is no use of mlx5_core_mkey members except for
the key itself. As preparation for moving mlx5_core_mkey to mlx5_ib, the
occurrences of struct mlx5_core_mkey in all modules except for mlx5_ib are
replaced by a u32 key.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mr.c               | 42 +++++++++++++------
 drivers/infiniband/hw/mlx5/odp.c              |  2 +-
 .../mellanox/mlx5/core/diag/fw_tracer.c       |  2 +-
 .../mellanox/mlx5/core/diag/fw_tracer.h       |  2 +-
 .../mellanox/mlx5/core/diag/rsc_dump.c        |  6 +--
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  2 +-
 .../net/ethernet/mellanox/mlx5/core/en/ptp.c  |  2 +-
 .../net/ethernet/mellanox/mlx5/core/en/trap.c |  2 +-
 .../ethernet/mellanox/mlx5/core/en_common.c   |  3 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 11 +++--
 .../ethernet/mellanox/mlx5/core/fpga/conn.c   |  8 ++--
 .../ethernet/mellanox/mlx5/core/fpga/core.h   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/mr.c  | 26 ++++--------
 .../mellanox/mlx5/core/steering/dr_icm_pool.c |  9 ++--
 .../mellanox/mlx5/core/steering/dr_send.c     |  9 ++--
 .../mellanox/mlx5/core/steering/dr_types.h    |  2 +-
 drivers/vdpa/mlx5/core/mlx5_vdpa.h            |  8 ++--
 drivers/vdpa/mlx5/core/mr.c                   |  6 +--
 drivers/vdpa/mlx5/core/resources.c            | 13 ++----
 drivers/vdpa/mlx5/net/mlx5_vnet.c             |  2 +-
 include/linux/mlx5/driver.h                   | 14 +++----
 21 files changed, 85 insertions(+), 88 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 03dc6c22843f..ae0472d92801 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -89,24 +89,39 @@ static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
 	MLX5_SET64(mkc, mkc, start_addr, start_addr);
 }
 
-static void
-assign_mkey_variant(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
-		    u32 *in)
+static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
 {
 	u8 key = atomic_inc_return(&dev->mkey_var);
 	void *mkc;
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 	MLX5_SET(mkc, mkc, mkey_7_0, key);
-	mkey->key = key;
+	*mkey = key;
+}
+
+static void set_mkey_fields(void *mkc, struct mlx5_core_mkey *mkey)
+{
+	mkey->iova = MLX5_GET64(mkc, mkc, start_addr);
+	mkey->size = MLX5_GET64(mkc, mkc, len);
+	mkey->pd = MLX5_GET(mkc, mkc, pd);
+	init_waitqueue_head(&mkey->wait);
 }
 
 static int
 mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
 		    u32 *in, int inlen)
 {
-	assign_mkey_variant(dev, mkey, in);
-	return mlx5_core_create_mkey(dev->mdev, mkey, in, inlen);
+	int err;
+	void *mkc;
+
+	assign_mkey_variant(dev, &mkey->key, in);
+	err = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
+	if (err)
+		return err;
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	set_mkey_fields(mkc, mkey);
+	return 0;
 }
 
 static int
@@ -117,7 +132,7 @@ mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
 		       struct mlx5_async_work *context)
 {
 	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
-	assign_mkey_variant(dev, mkey, in);
+	assign_mkey_variant(dev, &mkey->key, in);
 	return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
 				create_mkey_callback, context);
 }
@@ -134,7 +149,7 @@ static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
 	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
 
-	return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
+	return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey.key);
 }
 
 static void create_mkey_callback(int status, struct mlx5_async_work *context)
@@ -261,9 +276,10 @@ static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
 		goto free_in;
 	}
 
-	err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey, in, inlen);
+	err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen);
 	if (err)
 		goto free_mr;
+	set_mkey_fields(mkc, &mr->mmkey);
 
 	mr->mmkey.type = MLX5_MKEY_MR;
 	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
@@ -291,7 +307,7 @@ static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
 	ent->available_mrs--;
 	ent->total_mrs--;
 	spin_unlock_irq(&ent->lock);
-	mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey);
+	mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey.key);
 	kfree(mr);
 	spin_lock_irq(&ent->lock);
 }
@@ -659,7 +675,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
 		ent->available_mrs--;
 		ent->total_mrs--;
 		spin_unlock_irq(&ent->lock);
-		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
+		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey.key);
 	}
 
 	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
@@ -2342,7 +2358,7 @@ int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
 	return 0;
 
 free_mkey:
-	mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
+	mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey.key);
 free:
 	kfree(in);
 	return err;
@@ -2361,7 +2377,7 @@ int mlx5_ib_dealloc_mw(struct ib_mw *mw)
 		 */
 		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
 
-	return mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
+	return mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey.key);
 }
 
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 74dbbf968405..b9f06c4d40ca 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -909,7 +909,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 		pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
 						       bsf0_klm0_pas_mtt0_1);
 
-		ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen);
+		ret = mlx5_core_query_mkey(dev->mdev, &mmkey->key, out, outlen);
 		if (ret)
 			goto end;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
index 01a1d02dcf15..e95f6003abb5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
@@ -745,7 +745,7 @@ static int mlx5_fw_tracer_set_mtrc_conf(struct mlx5_fw_tracer *tracer)
 	MLX5_SET(mtrc_conf, in, trace_mode, TRACE_TO_MEMORY);
 	MLX5_SET(mtrc_conf, in, log_trace_buffer_size,
 		 ilog2(TRACER_BUFFER_PAGE_NUM));
-	MLX5_SET(mtrc_conf, in, trace_mkey, tracer->buff.mkey.key);
+	MLX5_SET(mtrc_conf, in, trace_mkey, tracer->buff.mkey);
 
 	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
 				   MLX5_REG_MTRC_CONF, 0, 1);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h
index 97252a85d65e..4762b55b0b0e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h
@@ -89,7 +89,7 @@ struct mlx5_fw_tracer {
 		void *log_buf;
 		dma_addr_t dma;
 		u32 size;
-		struct mlx5_core_mkey mkey;
+		u32 mkey;
 		u32 consumer_index;
 	} buff;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c
index ed4fb79b4db7..269d685e194f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c
@@ -30,7 +30,7 @@ static const char *const mlx5_rsc_sgmt_name[] = {
 
 struct mlx5_rsc_dump {
 	u32 pdn;
-	struct mlx5_core_mkey mkey;
+	u32 mkey;
 	u16 fw_segment_type[MLX5_SGMT_TYPE_NUM];
 };
 
@@ -89,7 +89,7 @@ static int mlx5_rsc_dump_trigger(struct mlx5_core_dev *dev, struct mlx5_rsc_dump
 		return -ENOMEM;
 
 	in_seq_num = MLX5_GET(resource_dump, cmd->cmd, seq_num);
-	MLX5_SET(resource_dump, cmd->cmd, mkey, rsc_dump->mkey.key);
+	MLX5_SET(resource_dump, cmd->cmd, mkey, rsc_dump->mkey);
 	MLX5_SET64(resource_dump, cmd->cmd, address, dma);
 
 	err = mlx5_core_access_reg(dev, cmd->cmd, sizeof(cmd->cmd), cmd->cmd,
@@ -202,7 +202,7 @@ static int mlx5_rsc_dump_menu(struct mlx5_core_dev *dev)
 }
 
 static int mlx5_rsc_dump_create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
-				     struct mlx5_core_mkey *mkey)
+				     u32 *mkey)
 {
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	void *mkc;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index b636d63358d2..0420a90f84cd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -658,7 +658,7 @@ struct mlx5e_rq {
 	u8                     wq_type;
 	u32                    rqn;
 	struct mlx5_core_dev  *mdev;
-	struct mlx5_core_mkey  umr_mkey;
+	u32		       umr_mkey;
 	struct mlx5e_dma_info  wqe_overflow;
 
 	/* XDP read-mostly */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
index d907c1acd4d5..fefe66189800 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
@@ -677,7 +677,7 @@ int mlx5e_ptp_open(struct mlx5e_priv *priv, struct mlx5e_params *params,
 	c->tstamp   = &priv->tstamp;
 	c->pdev     = mlx5_core_dma_dev(priv->mdev);
 	c->netdev   = priv->netdev;
-	c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey.key);
+	c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey);
 	c->num_tc   = params->num_tc;
 	c->stats    = &priv->ptp_stats.ch;
 	c->lag_port = lag_port;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c
index 86ab4e864fe6..f8cd9d579556 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c
@@ -148,7 +148,7 @@ static struct mlx5e_trap *mlx5e_open_trap(struct mlx5e_priv *priv)
 	t->tstamp   = &priv->tstamp;
 	t->pdev     = mlx5_core_dma_dev(priv->mdev);
 	t->netdev   = priv->netdev;
-	t->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey.key);
+	t->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey);
 	t->stats    = &priv->trap_stats.ch;
 
 	netif_napi_add(netdev, &t->napi, mlx5e_trap_napi_poll, 64);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
index 8c166ee56d8b..bb52ca5e0d0f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
@@ -73,8 +73,7 @@ void mlx5e_mkey_set_relaxed_ordering(struct mlx5_core_dev *mdev, void *mkc)
 	MLX5_SET(mkc, mkc, relaxed_ordering_write, ro_pci_enable && ro_write);
 }
 
-static int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
-			     struct mlx5_core_mkey *mkey)
+static int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey)
 {
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	void *mkc;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bca832cdc4cb..153b7a081c8f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -229,9 +229,8 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int node)
 	return 0;
 }
 
-static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
-				 u64 npages, u8 page_shift,
-				 struct mlx5_core_mkey *umr_mkey,
+static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev, u64 npages,
+				 u8 page_shift, u32 *umr_mkey,
 				 dma_addr_t filler_addr)
 {
 	struct mlx5_mtt *mtt;
@@ -451,7 +450,7 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
 		err = mlx5e_create_rq_umr_mkey(mdev, rq);
 		if (err)
 			goto err_rq_drop_page;
-		rq->mkey_be = cpu_to_be32(rq->umr_mkey.key);
+		rq->mkey_be = cpu_to_be32(rq->umr_mkey);
 
 		err = mlx5e_rq_alloc_mpwqe_info(rq, node);
 		if (err)
@@ -483,7 +482,7 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
 		if (err)
 			goto err_rq_frags;
 
-		rq->mkey_be = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey.key);
+		rq->mkey_be = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey);
 	}
 
 	if (xsk) {
@@ -1993,7 +1992,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 	c->cpu      = cpu;
 	c->pdev     = mlx5_core_dma_dev(priv->mdev);
 	c->netdev   = priv->netdev;
-	c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey.key);
+	c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey);
 	c->num_tc   = params->num_tc;
 	c->xdp      = !!params->xdp_prog;
 	c->stats    = &priv->channel_stats[ix].ch;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index bd66ab2af5b5..6f78716ff321 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -115,7 +115,7 @@ static int mlx5_fpga_conn_post_recv(struct mlx5_fpga_conn *conn,
 	ix = conn->qp.rq.pc & (conn->qp.rq.size - 1);
 	data = mlx5_wq_cyc_get_wqe(&conn->qp.wq.rq, ix);
 	data->byte_count = cpu_to_be32(buf->sg[0].size);
-	data->lkey = cpu_to_be32(conn->fdev->conn_res.mkey.key);
+	data->lkey = cpu_to_be32(conn->fdev->conn_res.mkey);
 	data->addr = cpu_to_be64(buf->sg[0].dma_addr);
 
 	conn->qp.rq.pc++;
@@ -155,7 +155,7 @@ static void mlx5_fpga_conn_post_send(struct mlx5_fpga_conn *conn,
 		if (!buf->sg[sgi].data)
 			break;
 		data->byte_count = cpu_to_be32(buf->sg[sgi].size);
-		data->lkey = cpu_to_be32(conn->fdev->conn_res.mkey.key);
+		data->lkey = cpu_to_be32(conn->fdev->conn_res.mkey);
 		data->addr = cpu_to_be64(buf->sg[sgi].dma_addr);
 		data++;
 		size++;
@@ -221,7 +221,7 @@ static int mlx5_fpga_conn_post_recv_buf(struct mlx5_fpga_conn *conn)
 }
 
 static int mlx5_fpga_conn_create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
-				      struct mlx5_core_mkey *mkey)
+				      u32 *mkey)
 {
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	void *mkc;
@@ -980,7 +980,7 @@ int mlx5_fpga_conn_device_init(struct mlx5_fpga_device *fdev)
 		mlx5_fpga_err(fdev, "create mkey failed, %d\n", err);
 		goto err_dealloc_pd;
 	}
-	mlx5_fpga_dbg(fdev, "Created mkey 0x%x\n", fdev->conn_res.mkey.key);
+	mlx5_fpga_dbg(fdev, "Created mkey 0x%x\n", fdev->conn_res.mkey);
 
 	return 0;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h
index 52c9dee91ea4..2a984e82ae16 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h
@@ -54,7 +54,7 @@ struct mlx5_fpga_device {
 	/* QP Connection resources */
 	struct {
 		u32 pdn;
-		struct mlx5_core_mkey mkey;
+		u32 mkey;
 		struct mlx5_uars_page *uar;
 	} conn_res;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
index 50af84e76fb6..7a76b5eb1c1a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
@@ -35,13 +35,11 @@
 #include <linux/mlx5/driver.h>
 #include "mlx5_core.h"
 
-int mlx5_core_create_mkey(struct mlx5_core_dev *dev,
-			  struct mlx5_core_mkey *mkey,
-			  u32 *in, int inlen)
+int mlx5_core_create_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *in,
+			  int inlen)
 {
 	u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {};
 	u32 mkey_index;
-	void *mkc;
 	int err;
 
 	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
@@ -50,38 +48,32 @@ int mlx5_core_create_mkey(struct mlx5_core_dev *dev,
 	if (err)
 		return err;
 
-	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 	mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index);
-	mkey->iova = MLX5_GET64(mkc, mkc, start_addr);
-	mkey->size = MLX5_GET64(mkc, mkc, len);
-	mkey->key |= mlx5_idx_to_mkey(mkey_index);
-	mkey->pd = MLX5_GET(mkc, mkc, pd);
-	init_waitqueue_head(&mkey->wait);
+	*mkey |= mlx5_idx_to_mkey(mkey_index);
 
-	mlx5_core_dbg(dev, "out 0x%x, mkey 0x%x\n", mkey_index, mkey->key);
+	mlx5_core_dbg(dev, "out 0x%x, mkey 0x%x\n", mkey_index, *mkey);
 	return 0;
 }
 EXPORT_SYMBOL(mlx5_core_create_mkey);
 
-int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev,
-			   struct mlx5_core_mkey *mkey)
+int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, u32 *mkey)
 {
 	u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {};
 
 	MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY);
-	MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key));
+	MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(*mkey));
 	return mlx5_cmd_exec_in(dev, destroy_mkey, in);
 }
 EXPORT_SYMBOL(mlx5_core_destroy_mkey);
 
-int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey,
-			 u32 *out, int outlen)
+int mlx5_core_query_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *out,
+			 int outlen)
 {
 	u32 in[MLX5_ST_SZ_DW(query_mkey_in)] = {};
 
 	memset(out, 0, outlen);
 	MLX5_SET(query_mkey_in, in, opcode, MLX5_CMD_OP_QUERY_MKEY);
-	MLX5_SET(query_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key));
+	MLX5_SET(query_mkey_in, in, mkey_index, mlx5_mkey_to_idx(*mkey));
 	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
 }
 EXPORT_SYMBOL(mlx5_core_query_mkey);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c
index 66c24767e3b0..942ff571dfb1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c
@@ -24,16 +24,15 @@ struct mlx5dr_icm_dm {
 };
 
 struct mlx5dr_icm_mr {
-	struct mlx5_core_mkey mkey;
+	u32 mkey;
 	struct mlx5dr_icm_dm dm;
 	struct mlx5dr_domain *dmn;
 	size_t length;
 	u64 icm_start_addr;
 };
 
-static int dr_icm_create_dm_mkey(struct mlx5_core_dev *mdev,
-				 u32 pd, u64 length, u64 start_addr, int mode,
-				 struct mlx5_core_mkey *mkey)
+static int dr_icm_create_dm_mkey(struct mlx5_core_dev *mdev, u32 pd, u64 length,
+				 u64 start_addr, int mode, u32 *mkey)
 {
 	u32 inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	u32 in[MLX5_ST_SZ_DW(create_mkey_in)] = {};
@@ -252,7 +251,7 @@ dr_icm_chunk_create(struct mlx5dr_icm_pool *pool,
 
 	offset = mlx5dr_icm_pool_dm_type_to_entry_size(pool->icm_type) * seg;
 
-	chunk->rkey = buddy_mem_pool->icm_mr->mkey.key;
+	chunk->rkey = buddy_mem_pool->icm_mr->mkey;
 	chunk->mr_addr = offset;
 	chunk->icm_addr =
 		(uintptr_t)buddy_mem_pool->icm_mr->icm_start_addr + offset;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
index 12cf323a5943..d1300b16d054 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
@@ -346,7 +346,7 @@ static void dr_fill_data_segs(struct mlx5dr_send_ring *send_ring,
 	send_info->read.length = send_info->write.length;
 	/* Read into the same write area */
 	send_info->read.addr = (uintptr_t)send_info->write.addr;
-	send_info->read.lkey = send_ring->mr->mkey.key;
+	send_info->read.lkey = send_ring->mr->mkey;
 
 	if (send_ring->pending_wqe % send_ring->signal_th == 0)
 		send_info->read.send_flags = IB_SEND_SIGNALED;
@@ -376,7 +376,7 @@ static int dr_postsend_icm_data(struct mlx5dr_domain *dmn,
 		       (void *)(uintptr_t)send_info->write.addr,
 		       send_info->write.length);
 		send_info->write.addr = (uintptr_t)send_ring->mr->dma_addr + buff_offset;
-		send_info->write.lkey = send_ring->mr->mkey.key;
+		send_info->write.lkey = send_ring->mr->mkey;
 	}
 
 	send_ring->tx_head++;
@@ -837,8 +837,7 @@ static void dr_destroy_cq(struct mlx5_core_dev *mdev, struct mlx5dr_cq *cq)
 	kfree(cq);
 }
 
-static int
-dr_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, struct mlx5_core_mkey *mkey)
+static int dr_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey)
 {
 	u32 in[MLX5_ST_SZ_DW(create_mkey_in)] = {};
 	void *mkc;
@@ -1028,7 +1027,7 @@ int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn)
 	send_info.write.lkey = 0;
 	/* Using the sync_mr in order to write/read */
 	send_info.remote_addr = (uintptr_t)send_ring->sync_mr->addr;
-	send_info.rkey = send_ring->sync_mr->mkey.key;
+	send_info.rkey = send_ring->sync_mr->mkey;
 
 	for (i = 0; i < num_of_sends_req; i++) {
 		ret = dr_postsend_icm_data(dmn, &send_info);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
index 67460c42a99b..8eeac0c3fde0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
@@ -1229,7 +1229,7 @@ struct mlx5dr_cq {
 
 struct mlx5dr_mr {
 	struct mlx5_core_dev *mdev;
-	struct mlx5_core_mkey mkey;
+	u32 mkey;
 	dma_addr_t dma_addr;
 	void *addr;
 	size_t size;
diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index b6cc53ba980c..d45967c980b9 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -15,7 +15,7 @@ struct mlx5_vdpa_direct_mr {
 	u64 start;
 	u64 end;
 	u32 perm;
-	struct mlx5_core_mkey mr;
+	u32 mkey;
 	struct sg_table sg_head;
 	int log_size;
 	int nsg;
@@ -25,7 +25,7 @@ struct mlx5_vdpa_direct_mr {
 };
 
 struct mlx5_vdpa_mr {
-	struct mlx5_core_mkey mkey;
+	u32 mkey;
 
 	/* list of direct MRs descendants of this indirect mr */
 	struct list_head head;
@@ -73,9 +73,9 @@ int mlx5_vdpa_alloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 *tdn);
 void mlx5_vdpa_dealloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 tdn);
 int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev);
 void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev);
-int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, struct mlx5_core_mkey *mkey, u32 *in,
+int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 *mkey, u32 *in,
 			  int inlen);
-int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, struct mlx5_core_mkey *mkey);
+int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 *mkey);
 int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
 			     bool *change_map);
 int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb);
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 800cfd1967ad..0354c03961af 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -75,7 +75,7 @@ static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct
 	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
 		 get_octo_len(mr->end - mr->start, mr->log_size));
 	populate_mtts(mr, MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt));
-	err = mlx5_vdpa_create_mkey(mvdev, &mr->mr, in, inlen);
+	err = mlx5_vdpa_create_mkey(mvdev, &mr->mkey, in, inlen);
 	kvfree(in);
 	if (err) {
 		mlx5_vdpa_warn(mvdev, "Failed to create direct MR\n");
@@ -87,7 +87,7 @@ static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct
 
 static void destroy_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr)
 {
-	mlx5_vdpa_destroy_mkey(mvdev, &mr->mr);
+	mlx5_vdpa_destroy_mkey(mvdev, &mr->mkey);
 }
 
 static u64 map_start(struct vhost_iotlb_map *map, struct mlx5_vdpa_direct_mr *mr)
@@ -161,7 +161,7 @@ static void fill_indir(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mkey, v
 		}
 
 		if (preve == dmr->start) {
-			klm->key = cpu_to_be32(dmr->mr.key);
+			klm->key = cpu_to_be32(dmr->mkey);
 			klm->bcount = cpu_to_be32(klm_bcount(dmr->end - dmr->start));
 			preve = dmr->end;
 		} else {
diff --git a/drivers/vdpa/mlx5/core/resources.c b/drivers/vdpa/mlx5/core/resources.c
index 6521cbd0f5c2..8d22b2c0e90c 100644
--- a/drivers/vdpa/mlx5/core/resources.c
+++ b/drivers/vdpa/mlx5/core/resources.c
@@ -181,12 +181,11 @@ void mlx5_vdpa_dealloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 tdn)
 	mlx5_cmd_exec_in(mvdev->mdev, dealloc_transport_domain, in);
 }
 
-int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, struct mlx5_core_mkey *mkey, u32 *in,
+int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 *mkey, u32 *in,
 			  int inlen)
 {
 	u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {};
 	u32 mkey_index;
-	void *mkc;
 	int err;
 
 	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
@@ -196,22 +195,18 @@ int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, struct mlx5_core_mkey *mk
 	if (err)
 		return err;
 
-	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 	mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index);
-	mkey->iova = MLX5_GET64(mkc, mkc, start_addr);
-	mkey->size = MLX5_GET64(mkc, mkc, len);
-	mkey->key |= mlx5_idx_to_mkey(mkey_index);
-	mkey->pd = MLX5_GET(mkc, mkc, pd);
+	*mkey |= mlx5_idx_to_mkey(mkey_index);
 	return 0;
 }
 
-int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, struct mlx5_core_mkey *mkey)
+int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 *mkey)
 {
 	u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {};
 
 	MLX5_SET(destroy_mkey_in, in, uid, mvdev->res.uid);
 	MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY);
-	MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key));
+	MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(*mkey));
 	return mlx5_cmd_exec_in(mvdev->mdev, destroy_mkey, in);
 }
 
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 189e4385df40..a0352511837f 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -824,7 +824,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque
 	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
 	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
 	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
-	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey.key);
+	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey);
 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f8e8d7e90616..cc60605c5531 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -648,7 +648,7 @@ struct mlx5e_resources {
 	struct mlx5e_hw_objs {
 		u32                        pdn;
 		struct mlx5_td             td;
-		struct mlx5_core_mkey      mkey;
+		u32                        mkey;
 		struct mlx5_sq_bfreg       bfreg;
 	} hw_objs;
 	struct devlink_port dl_port;
@@ -994,13 +994,11 @@ struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
 						      gfp_t flags, int npages);
 void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev,
 				 struct mlx5_cmd_mailbox *head);
-int mlx5_core_create_mkey(struct mlx5_core_dev *dev,
-			  struct mlx5_core_mkey *mkey,
-			  u32 *in, int inlen);
-int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev,
-			   struct mlx5_core_mkey *mkey);
-int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey,
-			 u32 *out, int outlen);
+int mlx5_core_create_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *in,
+			  int inlen);
+int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, u32 *mkey);
+int mlx5_core_query_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *out,
+			 int outlen);
 int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn);
 int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn);
 int mlx5_pagealloc_init(struct mlx5_core_dev *dev);
-- 
2.31.1


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH mlx5-next 2/5] RDMA/mlx5: Move struct mlx5_core_mkey to mlx5_ib
  2021-06-22 12:08 [PATCH rdma-next 0/5] mlx5 MR cache enhancements Leon Romanovsky
  2021-06-22 12:08 ` [PATCH mlx5-next 1/5] RDMA/mlx5: Replace struct mlx5_core_mkey by u32 key Leon Romanovsky
@ 2021-06-22 12:08 ` Leon Romanovsky
  2021-07-29 18:39   ` Jason Gunthorpe
  2021-06-22 12:08 ` [PATCH mlx5-next 3/5] RDMA/mlx5: Change the cache to hold mkeys instead of MRs Leon Romanovsky
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Leon Romanovsky @ 2021-06-22 12:08 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Aharon Landau, Jakub Kicinski, Jason Wang, linux-kernel,
	linux-rdma, Michael S. Tsirkin, netdev, Saeed Mahameed,
	Shay Drory, virtualization

From: Aharon Landau <aharonl@nvidia.com>

Move the mlx5_core_mkey struct to mlx5_ib, since mlx5_core no longer uses it
at this point.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/devx.c    |  2 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 22 ++++++++++++++++------
 drivers/infiniband/hw/mlx5/mr.c      | 18 ++++++++----------
 drivers/infiniband/hw/mlx5/odp.c     |  8 ++++----
 include/linux/mlx5/driver.h          | 10 ----------
 5 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 3678b0a8710b..2562462a1df1 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1296,7 +1296,7 @@ static int devx_handle_mkey_indirect(struct devx_obj *obj,
 				     void *in, void *out)
 {
 	struct mlx5_ib_devx_mr *devx_mr = &obj->devx_mr;
-	struct mlx5_core_mkey *mkey;
+	struct mlx5r_mkey *mkey;
 	void *mkc;
 	u8 key;
 
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 7bb35a3d8004..af11a0d8ebc0 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -634,9 +634,19 @@ struct mlx5_user_mmap_entry {
 #define mlx5_update_odp_stats(mr, counter_name, value)		\
 	atomic64_add(value, &((mr)->odp_stats.counter_name))
 
+struct mlx5r_mkey {
+	u64			iova;
+	u64			size;
+	u32			key;
+	u32			pd;
+	u32			type;
+	struct wait_queue_head wait;
+	refcount_t usecount;
+};
+
 struct mlx5_ib_mr {
 	struct ib_mr ibmr;
-	struct mlx5_core_mkey mmkey;
+	struct mlx5r_mkey mmkey;
 
 	/* User MR data */
 	struct mlx5_cache_ent *cache_ent;
@@ -712,12 +722,12 @@ static inline bool is_dmabuf_mr(struct mlx5_ib_mr *mr)
 
 struct mlx5_ib_mw {
 	struct ib_mw		ibmw;
-	struct mlx5_core_mkey	mmkey;
+	struct mlx5r_mkey	mmkey;
 	int			ndescs;
 };
 
 struct mlx5_ib_devx_mr {
-	struct mlx5_core_mkey	mmkey;
+	struct mlx5r_mkey	mmkey;
 	int			ndescs;
 };
 
@@ -1581,7 +1591,7 @@ static inline bool mlx5_ib_can_reconfig_with_umr(struct mlx5_ib_dev *dev,
 }
 
 static inline int mlx5r_store_odp_mkey(struct mlx5_ib_dev *dev,
-				       struct mlx5_core_mkey *mmkey)
+				       struct mlx5r_mkey *mmkey)
 {
 	refcount_set(&mmkey->usecount, 1);
 
@@ -1590,14 +1600,14 @@ static inline int mlx5r_store_odp_mkey(struct mlx5_ib_dev *dev,
 }
 
 /* deref an mkey that can participate in ODP flow */
-static inline void mlx5r_deref_odp_mkey(struct mlx5_core_mkey *mmkey)
+static inline void mlx5r_deref_odp_mkey(struct mlx5r_mkey *mmkey)
 {
 	if (refcount_dec_and_test(&mmkey->usecount))
 		wake_up(&mmkey->wait);
 }
 
 /* deref an mkey that can participate in ODP flow and wait for relese */
-static inline void mlx5r_deref_wait_odp_mkey(struct mlx5_core_mkey *mmkey)
+static inline void mlx5r_deref_wait_odp_mkey(struct mlx5r_mkey *mmkey)
 {
 	mlx5r_deref_odp_mkey(mmkey);
 	wait_event(mmkey->wait, refcount_read(&mmkey->usecount) == 0);
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index ae0472d92801..bb59ea9b0498 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -99,7 +99,7 @@ static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
 	*mkey = key;
 }
 
-static void set_mkey_fields(void *mkc, struct mlx5_core_mkey *mkey)
+static void set_mkey_fields(void *mkc, struct mlx5r_mkey *mkey)
 {
 	mkey->iova = MLX5_GET64(mkc, mkc, start_addr);
 	mkey->size = MLX5_GET64(mkc, mkc, len);
@@ -107,9 +107,8 @@ static void set_mkey_fields(void *mkc, struct mlx5_core_mkey *mkey)
 	init_waitqueue_head(&mkey->wait);
 }
 
-static int
-mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
-		    u32 *in, int inlen)
+static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5r_mkey *mkey,
+			       u32 *in, int inlen)
 {
 	int err;
 	void *mkc;
@@ -124,12 +123,11 @@ mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
 	return 0;
 }
 
-static int
-mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
-		       struct mlx5_core_mkey *mkey,
-		       struct mlx5_async_ctx *async_ctx,
-		       u32 *in, int inlen, u32 *out, int outlen,
-		       struct mlx5_async_work *context)
+static int mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
+				  struct mlx5r_mkey *mkey,
+				  struct mlx5_async_ctx *async_ctx, u32 *in,
+				  int inlen, u32 *out, int outlen,
+				  struct mlx5_async_work *context)
 {
 	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
 	assign_mkey_variant(dev, &mkey->key, in);
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index b9f06c4d40ca..bc35900c6955 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -788,7 +788,7 @@ struct pf_frame {
 	int depth;
 };
 
-static bool mkey_is_eq(struct mlx5_core_mkey *mmkey, u32 key)
+static bool mkey_is_eq(struct mlx5r_mkey *mmkey, u32 key)
 {
 	if (!mmkey)
 		return false;
@@ -797,7 +797,7 @@ static bool mkey_is_eq(struct mlx5_core_mkey *mmkey, u32 key)
 	return mmkey->key == key;
 }
 
-static int get_indirect_num_descs(struct mlx5_core_mkey *mmkey)
+static int get_indirect_num_descs(struct mlx5r_mkey *mmkey)
 {
 	struct mlx5_ib_mw *mw;
 	struct mlx5_ib_devx_mr *devx_mr;
@@ -831,7 +831,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 {
 	int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0;
 	struct pf_frame *head = NULL, *frame;
-	struct mlx5_core_mkey *mmkey;
+	struct mlx5r_mkey *mmkey;
 	struct mlx5_ib_mr *mr;
 	struct mlx5_klm *pklm;
 	u32 *out = NULL;
@@ -1699,7 +1699,7 @@ get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
 		    u32 lkey)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	struct mlx5_core_mkey *mmkey;
+	struct mlx5r_mkey *mmkey;
 	struct mlx5_ib_mr *mr = NULL;
 
 	xa_lock(&dev->odp_mkeys);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index cc60605c5531..5832d6614606 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -363,16 +363,6 @@ enum {
 	MLX5_MKEY_INDIRECT_DEVX,
 };
 
-struct mlx5_core_mkey {
-	u64			iova;
-	u64			size;
-	u32			key;
-	u32			pd;
-	u32			type;
-	struct wait_queue_head wait;
-	refcount_t usecount;
-};
-
 #define MLX5_24BIT_MASK		((1 << 24) - 1)
 
 enum mlx5_res_type {
-- 
2.31.1


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH mlx5-next 3/5] RDMA/mlx5: Change the cache to hold mkeys instead of MRs
  2021-06-22 12:08 [PATCH rdma-next 0/5] mlx5 MR cache enhancements Leon Romanovsky
  2021-06-22 12:08 ` [PATCH mlx5-next 1/5] RDMA/mlx5: Replace struct mlx5_core_mkey by u32 key Leon Romanovsky
  2021-06-22 12:08 ` [PATCH mlx5-next 2/5] RDMA/mlx5: Move struct mlx5_core_mkey to mlx5_ib Leon Romanovsky
@ 2021-06-22 12:08 ` Leon Romanovsky
  2021-07-29 19:08   ` Jason Gunthorpe
  2021-06-22 12:08 ` [PATCH mlx5-next 4/5] RDMA/mlx5: Change the cache structure to an rbtree Leon Romanovsky
  2021-06-22 12:08 ` [PATCH rdma-next 5/5] RDMA/mlx5: Delay the deregistration of a non-cache mkey Leon Romanovsky
  4 siblings, 1 reply; 12+ messages in thread
From: Leon Romanovsky @ 2021-06-22 12:08 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Aharon Landau, Jakub Kicinski, Jason Wang, linux-kernel,
	linux-rdma, Michael S. Tsirkin, netdev, Saeed Mahameed,
	Shay Drory, virtualization

From: Aharon Landau <aharonl@nvidia.com>

Today the cache is an MR-cache; however, no members of the MR other than
the mkey are actually used by the cache.
Therefore, change it to an mkey-cache, so that the cache has its own
memory and holds only the values it needs.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/main.c    |   4 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  56 ++--
 drivers/infiniband/hw/mlx5/mr.c      | 378 ++++++++++++++-------------
 drivers/infiniband/hw/mlx5/odp.c     |   9 +-
 include/linux/mlx5/driver.h          |   6 +-
 5 files changed, 235 insertions(+), 218 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index c12517b63a8d..849bf016d8ae 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4051,7 +4051,7 @@ static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
 {
 	int err;
 
-	err = mlx5_mr_cache_cleanup(dev);
+	err = mlx5_mkey_cache_cleanup(dev);
 	if (err)
 		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
 
@@ -4154,7 +4154,7 @@ static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
 	dev->umrc.pd = pd;
 
 	sema_init(&dev->umrc.sem, MAX_UMR_WR);
-	ret = mlx5_mr_cache_init(dev);
+	ret = mlx5_mkey_cache_init(dev);
 	if (ret) {
 		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
 		goto error_4;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index af11a0d8ebc0..ffb6f1d41f3d 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -634,6 +634,15 @@ struct mlx5_user_mmap_entry {
 #define mlx5_update_odp_stats(mr, counter_name, value)		\
 	atomic64_add(value, &((mr)->odp_stats.counter_name))
 
+struct mlx5r_cache_mkey {
+	u32 key;
+	struct mlx5_cache_ent *cache_ent;
+	u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
+	struct mlx5_async_work cb_work;
+	/* Cache list element */
+	struct list_head list;
+};
+
 struct mlx5r_mkey {
 	u64			iova;
 	u64			size;
@@ -642,6 +651,7 @@ struct mlx5r_mkey {
 	u32			type;
 	struct wait_queue_head wait;
 	refcount_t usecount;
+	struct mlx5_cache_ent *cache_ent;
 };
 
 struct mlx5_ib_mr {
@@ -649,19 +659,10 @@ struct mlx5_ib_mr {
 	struct mlx5r_mkey mmkey;
 
 	/* User MR data */
-	struct mlx5_cache_ent *cache_ent;
 	struct ib_umem *umem;
 
 	/* This is zero'd when the MR is allocated */
 	union {
-		/* Used only while the MR is in the cache */
-		struct {
-			u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
-			struct mlx5_async_work cb_work;
-			/* Cache list element */
-			struct list_head list;
-		};
-
 		/* Used only by kernel MRs (umem == NULL) */
 		struct {
 			void *descs;
@@ -702,12 +703,6 @@ struct mlx5_ib_mr {
 	};
 };
 
-/* Zero the fields in the mr that are variant depending on usage */
-static inline void mlx5_clear_mr(struct mlx5_ib_mr *mr)
-{
-	memset(mr->out, 0, sizeof(*mr) - offsetof(struct mlx5_ib_mr, out));
-}
-
 static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
 {
 	return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem &&
@@ -763,16 +758,16 @@ struct mlx5_cache_ent {
 	u8 fill_to_high_water:1;
 
 	/*
-	 * - available_mrs is the length of list head, ie the number of MRs
+	 * - available_mkeys is the length of list head, ie the number of Mkeys
 	 *   available for immediate allocation.
-	 * - total_mrs is available_mrs plus all in use MRs that could be
+	 * - total_mkeys is available_mkeys plus all in use Mkeys that could be
 	 *   returned to the cache.
-	 * - limit is the low water mark for available_mrs, 2* limit is the
+	 * - limit is the low water mark for available_mkeys, 2* limit is the
 	 *   upper water mark.
-	 * - pending is the number of MRs currently being created
+	 * - pending is the number of Mkeys currently being created
 	 */
-	u32 total_mrs;
-	u32 available_mrs;
+	u32 total_mkeys;
+	u32 available_mkeys;
 	u32 limit;
 	u32 pending;
 
@@ -784,9 +779,9 @@ struct mlx5_cache_ent {
 	struct delayed_work	dwork;
 };
 
-struct mlx5_mr_cache {
+struct mlx5_mkey_cache {
 	struct workqueue_struct *wq;
-	struct mlx5_cache_ent	ent[MAX_MR_CACHE_ENTRIES];
+	struct mlx5_cache_ent	ent[MAX_MKEY_CACHE_ENTRIES];
 	struct dentry		*root;
 	unsigned long		last_add;
 };
@@ -1070,7 +1065,7 @@ struct mlx5_ib_dev {
 	struct mlx5_ib_resources	devr;
 
 	atomic_t			mkey_var;
-	struct mlx5_mr_cache		cache;
+	struct mlx5_mkey_cache		cache;
 	struct timer_list		delay_timer;
 	/* Prevents soft lock on massive reg MRs */
 	struct mutex			slow_path_mutex;
@@ -1318,11 +1313,12 @@ void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
 			  u64 access_flags);
 void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
 int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
-int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
-int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
+int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
+int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
 
-struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
-				       unsigned int entry, int access_flags);
+struct mlx5_ib_mr *mlx5_alloc_special_mkey(struct mlx5_ib_dev *dev,
+					   unsigned int entry,
+					   int access_flags);
 
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 			    struct ib_mr_status *mr_status);
@@ -1346,7 +1342,7 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq);
 void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
-void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
+void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent);
 void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 			   struct mlx5_ib_mr *mr, int flags);
 
@@ -1365,7 +1361,7 @@ static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev,
 static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void)				    {}
-static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
+static inline void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) {}
 static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 					 struct mlx5_ib_mr *mr, int flags) {}
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index bb59ea9b0498..8d7de4eddc11 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -54,13 +54,13 @@ static DEFINE_MUTEX(xlt_emergency_page_mutex);
 static void mlx5_invalidate_umem(struct ib_umem *umem, void *priv);
 
 enum {
-	MAX_PENDING_REG_MR = 8,
+	MAX_PENDING_CREATE_MKEY = 8,
 };
 
 #define MLX5_UMR_ALIGN 2048
 
-static void
-create_mkey_callback(int status, struct mlx5_async_work *context);
+static void create_cache_mkey_callback(int status,
+				       struct mlx5_async_work *context);
 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
 				     u64 iova, int access_flags,
 				     unsigned int page_size, bool populate);
@@ -104,7 +104,6 @@ static void set_mkey_fields(void *mkc, struct mlx5r_mkey *mkey)
 	mkey->iova = MLX5_GET64(mkc, mkc, start_addr);
 	mkey->size = MLX5_GET64(mkc, mkc, len);
 	mkey->pd = MLX5_GET(mkc, mkc, pd);
-	init_waitqueue_head(&mkey->wait);
 }
 
 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5r_mkey *mkey,
@@ -120,22 +119,24 @@ static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5r_mkey *mkey,
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 	set_mkey_fields(mkc, mkey);
+	init_waitqueue_head(&mkey->wait);
 	return 0;
 }
 
-static int mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
-				  struct mlx5r_mkey *mkey,
-				  struct mlx5_async_ctx *async_ctx, u32 *in,
-				  int inlen, u32 *out, int outlen,
-				  struct mlx5_async_work *context)
+static int mlx5_ib_create_cache_mkey_cb(struct mlx5_ib_dev *dev,
+					struct mlx5r_cache_mkey *cmkey,
+					struct mlx5_async_ctx *async_ctx,
+					u32 *in, int inlen, u32 *out,
+					int outlen,
+					struct mlx5_async_work *context)
 {
 	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
-	assign_mkey_variant(dev, &mkey->key, in);
+	assign_mkey_variant(dev, &cmkey->key, in);
 	return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
-				create_mkey_callback, context);
+				create_cache_mkey_callback, context);
 }
 
-static int mr_cache_max_order(struct mlx5_ib_dev *dev);
+static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
 
 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
@@ -150,17 +151,19 @@ static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey.key);
 }
 
-static void create_mkey_callback(int status, struct mlx5_async_work *context)
+static void create_cache_mkey_callback(int status,
+				       struct mlx5_async_work *context)
 {
-	struct mlx5_ib_mr *mr =
-		container_of(context, struct mlx5_ib_mr, cb_work);
-	struct mlx5_cache_ent *ent = mr->cache_ent;
+	struct mlx5r_cache_mkey *cmkey =
+		container_of(context, struct mlx5r_cache_mkey, cb_work);
+	struct mlx5_cache_ent *ent = cmkey->cache_ent;
 	struct mlx5_ib_dev *dev = ent->dev;
 	unsigned long flags;
 
 	if (status) {
-		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
-		kfree(mr);
+		mlx5_ib_warn(dev, "async create mkey failed. status %d\n",
+			     status);
+		kfree(cmkey);
 		spin_lock_irqsave(&ent->lock, flags);
 		ent->pending--;
 		WRITE_ONCE(dev->fill_delay, 1);
@@ -169,32 +172,23 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 		return;
 	}
 
-	mr->mmkey.type = MLX5_MKEY_MR;
-	mr->mmkey.key |= mlx5_idx_to_mkey(
-		MLX5_GET(create_mkey_out, mr->out, mkey_index));
-	init_waitqueue_head(&mr->mmkey.wait);
+	cmkey->key |= mlx5_idx_to_mkey(
+		MLX5_GET(create_mkey_out, cmkey->out, mkey_index));
 
 	WRITE_ONCE(dev->cache.last_add, jiffies);
 
 	spin_lock_irqsave(&ent->lock, flags);
-	list_add_tail(&mr->list, &ent->head);
-	ent->available_mrs++;
-	ent->total_mrs++;
+	list_add_tail(&cmkey->list, &ent->head);
+	ent->available_mkeys++;
+	ent->total_mkeys++;
 	/* If we are doing fill_to_high_water then keep going. */
 	queue_adjust_cache_locked(ent);
 	ent->pending--;
 	spin_unlock_irqrestore(&ent->lock, flags);
 }
 
-static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
+static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
 {
-	struct mlx5_ib_mr *mr;
-
-	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
-	if (!mr)
-		return NULL;
-	mr->cache_ent = ent;
-
 	set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
 	MLX5_SET(mkc, mkc, free, 1);
 	MLX5_SET(mkc, mkc, umr_en, 1);
@@ -203,14 +197,13 @@ static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
 
 	MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
 	MLX5_SET(mkc, mkc, log_page_size, ent->page);
-	return mr;
 }
 
 /* Asynchronously schedule new MRs to be populated in the cache. */
 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 {
 	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
-	struct mlx5_ib_mr *mr;
+	struct mlx5r_cache_mkey *cmkey;
 	void *mkc;
 	u32 *in;
 	int err = 0;
@@ -221,31 +214,33 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 		return -ENOMEM;
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	set_cache_mkc(ent, mkc);
 	for (i = 0; i < num; i++) {
-		mr = alloc_cache_mr(ent, mkc);
-		if (!mr) {
+		cmkey = kzalloc(sizeof(*cmkey), GFP_KERNEL);
+		if (!cmkey) {
 			err = -ENOMEM;
 			break;
 		}
+		cmkey->cache_ent = ent;
+
 		spin_lock_irq(&ent->lock);
-		if (ent->pending >= MAX_PENDING_REG_MR) {
+		if (ent->pending >= MAX_PENDING_CREATE_MKEY) {
 			err = -EAGAIN;
 			spin_unlock_irq(&ent->lock);
-			kfree(mr);
+			kfree(cmkey);
 			break;
 		}
 		ent->pending++;
 		spin_unlock_irq(&ent->lock);
-		err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
-					     &ent->dev->async_ctx, in, inlen,
-					     mr->out, sizeof(mr->out),
-					     &mr->cb_work);
+		err = mlx5_ib_create_cache_mkey_cb(
+			ent->dev, cmkey, &ent->dev->async_ctx, in, inlen,
+			cmkey->out, sizeof(cmkey->out), &cmkey->cb_work);
 		if (err) {
 			spin_lock_irq(&ent->lock);
 			ent->pending--;
 			spin_unlock_irq(&ent->lock);
 			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
-			kfree(mr);
+			kfree(cmkey);
 			break;
 		}
 	}
@@ -255,63 +250,54 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 }
 
 /* Synchronously create a MR in the cache */
-static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
+static int create_cacheable_mkey(struct mlx5_cache_ent *ent,
+				 struct mlx5r_mkey *mkey)
 {
 	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
-	struct mlx5_ib_mr *mr;
 	void *mkc;
 	u32 *in;
 	int err;
 
 	in = kzalloc(inlen, GFP_KERNEL);
 	if (!in)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
-
-	mr = alloc_cache_mr(ent, mkc);
-	if (!mr) {
-		err = -ENOMEM;
-		goto free_in;
+	set_cache_mkc(ent, mkc);
+	err = mlx5_core_create_mkey(ent->dev->mdev, &mkey->key, in, inlen);
+	if (err) {
+		kfree(in);
+		return err;
 	}
+	set_mkey_fields(mkc, mkey);
+	mkey->cache_ent = ent;
 
-	err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen);
-	if (err)
-		goto free_mr;
-	set_mkey_fields(mkc, &mr->mmkey);
-
-	mr->mmkey.type = MLX5_MKEY_MR;
 	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
 	spin_lock_irq(&ent->lock);
-	ent->total_mrs++;
+	ent->total_mkeys++;
 	spin_unlock_irq(&ent->lock);
 	kfree(in);
-	return mr;
-free_mr:
-	kfree(mr);
-free_in:
-	kfree(in);
-	return ERR_PTR(err);
+	return 0;
 }
 
-static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
+static void remove_cache_mkey_locked(struct mlx5_cache_ent *ent)
 {
-	struct mlx5_ib_mr *mr;
+	struct mlx5r_cache_mkey *cmkey;
 
 	lockdep_assert_held(&ent->lock);
 	if (list_empty(&ent->head))
 		return;
-	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
-	list_del(&mr->list);
-	ent->available_mrs--;
-	ent->total_mrs--;
+	cmkey = list_first_entry(&ent->head, struct mlx5r_cache_mkey, list);
+	list_del(&cmkey->list);
+	ent->available_mkeys--;
+	ent->total_mkeys--;
 	spin_unlock_irq(&ent->lock);
-	mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey.key);
-	kfree(mr);
+	mlx5_core_destroy_mkey(ent->dev->mdev, &cmkey->key);
+	kfree(cmkey);
 	spin_lock_irq(&ent->lock);
 }
 
-static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
-				bool limit_fill)
+static int resize_available_mkeys(struct mlx5_cache_ent *ent,
+				  unsigned int target, bool limit_fill)
 {
 	int err;
 
@@ -320,10 +306,11 @@ static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
 	while (true) {
 		if (limit_fill)
 			target = ent->limit * 2;
-		if (target == ent->available_mrs + ent->pending)
+		if (target == ent->available_mkeys + ent->pending)
 			return 0;
-		if (target > ent->available_mrs + ent->pending) {
-			u32 todo = target - (ent->available_mrs + ent->pending);
+		if (target > ent->available_mkeys + ent->pending) {
+			u32 todo =
+				target - (ent->available_mkeys + ent->pending);
 
 			spin_unlock_irq(&ent->lock);
 			err = add_keys(ent, todo);
@@ -336,7 +323,7 @@ static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
 			} else
 				return 0;
 		} else {
-			remove_cache_mr_locked(ent);
+			remove_cache_mkey_locked(ent);
 		}
 	}
 }
@@ -353,21 +340,21 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
 		return err;
 
 	/*
-	 * Target is the new value of total_mrs the user requests, however we
-	 * cannot free MRs that are in use. Compute the target value for
-	 * available_mrs.
+	 * Target is the new value of total_mkeys the user requests, however we
+	 * cannot free Mkeys that are in use. Compute the target value for
+	 * available_mkeys.
 	 */
 	spin_lock_irq(&ent->lock);
-	if (target < ent->total_mrs - ent->available_mrs) {
+	if (target < ent->total_mkeys - ent->available_mkeys) {
 		err = -EINVAL;
 		goto err_unlock;
 	}
-	target = target - (ent->total_mrs - ent->available_mrs);
+	target = target - (ent->total_mkeys - ent->available_mkeys);
 	if (target < ent->limit || target > ent->limit*2) {
 		err = -EINVAL;
 		goto err_unlock;
 	}
-	err = resize_available_mrs(ent, target, false);
+	err = resize_available_mkeys(ent, target, false);
 	if (err)
 		goto err_unlock;
 	spin_unlock_irq(&ent->lock);
@@ -386,7 +373,7 @@ static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
 	char lbuf[20];
 	int err;
 
-	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
+	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mkeys);
 	if (err < 0)
 		return err;
 
@@ -417,7 +404,7 @@ static ssize_t limit_write(struct file *filp, const char __user *buf,
 	 */
 	spin_lock_irq(&ent->lock);
 	ent->limit = var;
-	err = resize_available_mrs(ent, 0, true);
+	err = resize_available_mkeys(ent, 0, true);
 	spin_unlock_irq(&ent->lock);
 	if (err)
 		return err;
@@ -445,16 +432,16 @@ static const struct file_operations limit_fops = {
 	.read	= limit_read,
 };
 
-static bool someone_adding(struct mlx5_mr_cache *cache)
+static bool someone_adding(struct mlx5_mkey_cache *cache)
 {
 	unsigned int i;
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		struct mlx5_cache_ent *ent = &cache->ent[i];
 		bool ret;
 
 		spin_lock_irq(&ent->lock);
-		ret = ent->available_mrs < ent->limit;
+		ret = ent->available_mkeys < ent->limit;
 		spin_unlock_irq(&ent->lock);
 		if (ret)
 			return true;
@@ -473,19 +460,19 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 
 	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
 		return;
-	if (ent->available_mrs < ent->limit) {
+	if (ent->available_mkeys < ent->limit) {
 		ent->fill_to_high_water = true;
 		queue_work(ent->dev->cache.wq, &ent->work);
 	} else if (ent->fill_to_high_water &&
-		   ent->available_mrs + ent->pending < 2 * ent->limit) {
+		   ent->available_mkeys + ent->pending < 2 * ent->limit) {
 		/*
 		 * Once we start populating due to hitting a low water mark
 		 * continue until we pass the high water mark.
 		 */
 		queue_work(ent->dev->cache.wq, &ent->work);
-	} else if (ent->available_mrs == 2 * ent->limit) {
+	} else if (ent->available_mkeys == 2 * ent->limit) {
 		ent->fill_to_high_water = false;
-	} else if (ent->available_mrs > 2 * ent->limit) {
+	} else if (ent->available_mkeys > 2 * ent->limit) {
 		/* Queue deletion of excess entries */
 		ent->fill_to_high_water = false;
 		if (ent->pending)
@@ -499,7 +486,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 static void __cache_work_func(struct mlx5_cache_ent *ent)
 {
 	struct mlx5_ib_dev *dev = ent->dev;
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	int err;
 
 	spin_lock_irq(&ent->lock);
@@ -507,7 +494,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 		goto out;
 
 	if (ent->fill_to_high_water &&
-	    ent->available_mrs + ent->pending < 2 * ent->limit &&
+	    ent->available_mkeys + ent->pending < 2 * ent->limit &&
 	    !READ_ONCE(dev->fill_delay)) {
 		spin_unlock_irq(&ent->lock);
 		err = add_keys(ent, 1);
@@ -529,7 +516,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 						   msecs_to_jiffies(1000));
 			}
 		}
-	} else if (ent->available_mrs > 2 * ent->limit) {
+	} else if (ent->available_mkeys > 2 * ent->limit) {
 		bool need_delay;
 
 		/*
@@ -553,7 +540,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 			goto out;
 		if (need_delay)
 			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
-		remove_cache_mr_locked(ent);
+		remove_cache_mkey_locked(ent);
 		queue_adjust_cache_locked(ent);
 	}
 out:
@@ -576,15 +563,17 @@ static void cache_work_func(struct work_struct *work)
 	__cache_work_func(ent);
 }
 
-/* Allocate a special entry from the cache */
-struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
-				       unsigned int entry, int access_flags)
+/* Get an Mkey from a special cache entry */
+struct mlx5_ib_mr *mlx5_alloc_special_mkey(struct mlx5_ib_dev *dev,
+					   unsigned int entry, int access_flags)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
+	struct mlx5r_cache_mkey *cmkey;
 	struct mlx5_cache_ent *ent;
 	struct mlx5_ib_mr *mr;
+	int err;
 
-	if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY ||
+	if (WARN_ON(entry <= MKEY_CACHE_LAST_STD_ENTRY ||
 		    entry >= ARRAY_SIZE(cache->ent)))
 		return ERR_PTR(-EINVAL);
 
@@ -592,48 +581,58 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
 		return ERR_PTR(-EOPNOTSUPP);
 
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
 	ent = &cache->ent[entry];
 	spin_lock_irq(&ent->lock);
 	if (list_empty(&ent->head)) {
 		spin_unlock_irq(&ent->lock);
-		mr = create_cache_mr(ent);
-		if (IS_ERR(mr))
-			return mr;
+		err = create_cacheable_mkey(ent, &mr->mmkey);
+		if (err) {
+			kfree(mr);
+			return ERR_PTR(err);
+		}
 	} else {
-		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
-		list_del(&mr->list);
-		ent->available_mrs--;
+		cmkey = list_first_entry(&ent->head, struct mlx5r_cache_mkey,
+					 list);
+		list_del(&cmkey->list);
+		ent->available_mkeys--;
 		queue_adjust_cache_locked(ent);
 		spin_unlock_irq(&ent->lock);
 
-		mlx5_clear_mr(mr);
+		mr->mmkey.key = cmkey->key;
+		mr->mmkey.cache_ent = ent;
+		kfree(cmkey);
 	}
+	init_waitqueue_head(&mr->mmkey.wait);
+	mr->mmkey.type = MLX5_MKEY_MR;
 	mr->access_flags = access_flags;
 	return mr;
 }
 
-/* Return a MR already available in the cache */
-static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
+/* Return a Mkey already available in the cache */
+static struct mlx5r_cache_mkey *get_cache_mkey(struct mlx5_cache_ent *req_ent)
 {
 	struct mlx5_ib_dev *dev = req_ent->dev;
-	struct mlx5_ib_mr *mr = NULL;
 	struct mlx5_cache_ent *ent = req_ent;
+	struct mlx5r_cache_mkey *cmkey;
 
-	/* Try larger MR pools from the cache to satisfy the allocation */
-	for (; ent != &dev->cache.ent[MR_CACHE_LAST_STD_ENTRY + 1]; ent++) {
+	/* Try larger Mkey pools from the cache to satisfy the allocation */
+	for (; ent != &dev->cache.ent[MKEY_CACHE_LAST_STD_ENTRY + 1]; ent++) {
 		mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order,
 			    ent - dev->cache.ent);
 
 		spin_lock_irq(&ent->lock);
 		if (!list_empty(&ent->head)) {
-			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
-					      list);
-			list_del(&mr->list);
-			ent->available_mrs--;
+			cmkey = list_first_entry(&ent->head,
+						 struct mlx5r_cache_mkey, list);
+			list_del(&cmkey->list);
+			ent->available_mkeys--;
 			queue_adjust_cache_locked(ent);
 			spin_unlock_irq(&ent->lock);
-			mlx5_clear_mr(mr);
-			return mr;
+			return cmkey;
 		}
 		queue_adjust_cache_locked(ent);
 		spin_unlock_irq(&ent->lock);
@@ -642,23 +641,32 @@ static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
 	return NULL;
 }
 
-static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+static int mlx5_free_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
-	struct mlx5_cache_ent *ent = mr->cache_ent;
+	struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
+	struct mlx5r_mkey *mkey = &mr->mmkey;
+	struct mlx5r_cache_mkey *cmkey;
+
+	cmkey = kzalloc(sizeof(*cmkey), GFP_KERNEL);
+	if (!cmkey)
+		return -ENOMEM;
+
+	cmkey->key = mkey->key;
+	cmkey->cache_ent = ent;
 
 	spin_lock_irq(&ent->lock);
-	list_add_tail(&mr->list, &ent->head);
-	ent->available_mrs++;
+	list_add_tail(&cmkey->list, &ent->head);
+	ent->available_mkeys++;
 	queue_adjust_cache_locked(ent);
 	spin_unlock_irq(&ent->lock);
+	return 0;
 }
 
 static void clean_keys(struct mlx5_ib_dev *dev, int c)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent = &cache->ent[c];
-	struct mlx5_ib_mr *tmp_mr;
-	struct mlx5_ib_mr *mr;
+	struct mlx5r_cache_mkey *tmp_mkey, *mkey;
 	LIST_HEAD(del_list);
 
 	cancel_delayed_work(&ent->dwork);
@@ -668,21 +676,22 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
 			spin_unlock_irq(&ent->lock);
 			break;
 		}
-		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
-		list_move(&mr->list, &del_list);
-		ent->available_mrs--;
-		ent->total_mrs--;
+		mkey = list_first_entry(&ent->head, struct mlx5r_cache_mkey,
+					list);
+		list_move(&mkey->list, &del_list);
+		ent->available_mkeys--;
+		ent->total_mkeys--;
 		spin_unlock_irq(&ent->lock);
-		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey.key);
+		mlx5_core_destroy_mkey(dev->mdev, &mkey->key);
 	}
 
-	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
-		list_del(&mr->list);
-		kfree(mr);
+	list_for_each_entry_safe(mkey, tmp_mkey, &del_list, list) {
+		list_del(&mkey->list);
+		kfree(mkey);
 	}
 }
 
-static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 {
 	if (!mlx5_debugfs_root || dev->is_rep)
 		return;
@@ -691,9 +700,9 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 	dev->cache.root = NULL;
 }
 
-static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
+static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
 	struct dentry *dir;
 	int i;
@@ -703,13 +712,13 @@ static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
 
 	cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		ent = &cache->ent[i];
 		sprintf(ent->name, "%d", ent->order);
 		dir = debugfs_create_dir(ent->name, cache->root);
 		debugfs_create_file("size", 0600, dir, ent, &size_fops);
 		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
-		debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
+		debugfs_create_u32("cur", 0400, dir, &ent->available_mkeys);
 		debugfs_create_u32("miss", 0600, dir, &ent->miss);
 	}
 }
@@ -721,9 +730,9 @@ static void delay_time_func(struct timer_list *t)
 	WRITE_ONCE(dev->fill_delay, 0);
 }
 
-int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
+int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
 	int i;
 
@@ -736,7 +745,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 
 	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 	timer_setup(&dev->delay_timer, delay_time_func, 0);
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		ent = &cache->ent[i];
 		INIT_LIST_HEAD(&ent->head);
 		spin_lock_init(&ent->lock);
@@ -747,12 +756,12 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 		INIT_WORK(&ent->work, cache_work_func);
 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 
-		if (i > MR_CACHE_LAST_STD_ENTRY) {
-			mlx5_odp_init_mr_cache_entry(ent);
+		if (i > MKEY_CACHE_LAST_STD_ENTRY) {
+			mlx5_odp_init_mkey_cache_entry(ent);
 			continue;
 		}
 
-		if (ent->order > mr_cache_max_order(dev))
+		if (ent->order > mkey_cache_max_order(dev))
 			continue;
 
 		ent->page = PAGE_SHIFT;
@@ -770,19 +779,19 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 		spin_unlock_irq(&ent->lock);
 	}
 
-	mlx5_mr_cache_debugfs_init(dev);
+	mlx5_mkey_cache_debugfs_init(dev);
 
 	return 0;
 }
 
-int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
+int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
 {
 	unsigned int i;
 
 	if (!dev->cache.wq)
 		return 0;
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		struct mlx5_cache_ent *ent = &dev->cache.ent[i];
 
 		spin_lock_irq(&ent->lock);
@@ -792,10 +801,10 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 		cancel_delayed_work_sync(&ent->dwork);
 	}
 
-	mlx5_mr_cache_debugfs_cleanup(dev);
+	mlx5_mkey_cache_debugfs_cleanup(dev);
 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++)
 		clean_keys(dev, i);
 
 	destroy_workqueue(dev->cache.wq);
@@ -862,10 +871,10 @@ static int get_octo_len(u64 addr, u64 len, int page_shift)
 	return (npages + 1) / 2;
 }
 
-static int mr_cache_max_order(struct mlx5_ib_dev *dev)
+static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
 {
 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
-		return MR_CACHE_LAST_STD_ENTRY + 2;
+		return MKEY_CACHE_LAST_STD_ENTRY + 2;
 	return MLX5_MAX_UMR_SHIFT;
 }
 
@@ -912,15 +921,15 @@ static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
 	return err;
 }
 
-static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
-						      unsigned int order)
+static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
+							unsigned int order)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 
 	if (order < cache->ent[0].order)
 		return &cache->ent[0];
 	order = order - cache->ent[0].order;
-	if (order > MR_CACHE_LAST_STD_ENTRY)
+	if (order > MKEY_CACHE_LAST_STD_ENTRY)
 		return NULL;
 	return &cache->ent[order];
 }
@@ -951,9 +960,11 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 					     int access_flags)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5r_cache_mkey *cmkey;
 	struct mlx5_cache_ent *ent;
 	struct mlx5_ib_mr *mr;
 	unsigned int page_size;
+	int ret;
 
 	if (umem->is_dmabuf)
 		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
@@ -962,7 +973,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 						     0, iova);
 	if (WARN_ON(!page_size))
 		return ERR_PTR(-EINVAL);
-	ent = mr_cache_ent_from_order(
+	ent = mkey_cache_ent_from_order(
 		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
 	/*
 	 * Matches access in alloc_cache_mr(). If the MR can't come from the
@@ -976,22 +987,33 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 		return mr;
 	}
 
-	mr = get_cache_mr(ent);
-	if (!mr) {
-		mr = create_cache_mr(ent);
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	cmkey = get_cache_mkey(ent);
+	if (cmkey) {
+		mr->mmkey.key = cmkey->key;
+		mr->mmkey.cache_ent = cmkey->cache_ent;
+		kfree(cmkey);
+	} else {
+		ret = create_cacheable_mkey(ent, &mr->mmkey);
 		/*
 		 * The above already tried to do the same stuff as reg_create(),
 		 * no reason to try it again.
 		 */
-		if (IS_ERR(mr))
-			return mr;
+		if (ret) {
+			kfree(mr);
+			return ERR_PTR(ret);
+		}
 	}
-
 	mr->ibmr.pd = pd;
 	mr->umem = umem;
 	mr->mmkey.iova = iova;
+	mr->mmkey.type = MLX5_MKEY_MR;
 	mr->mmkey.size = umem->length;
 	mr->mmkey.pd = to_mpd(pd)->pdn;
+	init_waitqueue_head(&mr->mmkey.wait);
 	mr->page_shift = order_base_2(page_size);
 	set_mr_fields(dev, mr, umem->length, access_flags);
 
@@ -1742,7 +1764,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
 
 	/* We only track the allocated sizes of MRs from the cache */
-	if (!mr->cache_ent)
+	if (!mr->mmkey.cache_ent)
 		return false;
 	if (!mlx5_ib_can_load_pas_with_umr(dev, new_umem->length))
 		return false;
@@ -1751,7 +1773,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
 		mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
 	if (WARN_ON(!*page_size))
 		return false;
-	return (1ULL << mr->cache_ent->order) >=
+	return (1ULL << mr->mmkey.cache_ent->order) >=
 	       ib_umem_num_dma_blocks(new_umem, *page_size);
 }
 
@@ -1997,15 +2019,15 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 	}
 
 	/* Stop DMA */
-	if (mr->cache_ent) {
-		if (revoke_mr(mr)) {
-			spin_lock_irq(&mr->cache_ent->lock);
-			mr->cache_ent->total_mrs--;
-			spin_unlock_irq(&mr->cache_ent->lock);
-			mr->cache_ent = NULL;
+	if (mr->mmkey.cache_ent) {
+		if (revoke_mr(mr) || mlx5_free_mkey(dev, mr)) {
+			spin_lock_irq(&mr->mmkey.cache_ent->lock);
+			mr->mmkey.cache_ent->total_mkeys--;
+			spin_unlock_irq(&mr->mmkey.cache_ent->lock);
+			mr->mmkey.cache_ent = NULL;
 		}
 	}
-	if (!mr->cache_ent) {
+	if (!mr->mmkey.cache_ent) {
 		rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
 		if (rc)
 			return rc;
@@ -2022,12 +2044,10 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 			mlx5_ib_free_odp_mr(mr);
 	}
 
-	if (mr->cache_ent) {
-		mlx5_mr_cache_free(dev, mr);
-	} else {
+	if (!mr->mmkey.cache_ent)
 		mlx5_free_priv_descs(mr);
-		kfree(mr);
-	}
+
+	kfree(mr);
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index bc35900c6955..9c7942118d2c 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -418,8 +418,8 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	if (IS_ERR(odp))
 		return ERR_CAST(odp);
 
-	mr = mlx5_mr_cache_alloc(
-		mr_to_mdev(imr), MLX5_IMR_MTT_CACHE_ENTRY, imr->access_flags);
+	mr = mlx5_alloc_special_mkey(mr_to_mdev(imr), MLX5_IMR_MTT_CACHE_ENTRY,
+				     imr->access_flags);
 	if (IS_ERR(mr)) {
 		ib_umem_odp_release(odp);
 		return mr;
@@ -493,7 +493,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	if (IS_ERR(umem_odp))
 		return ERR_CAST(umem_odp);
 
-	imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags);
+	imr = mlx5_alloc_special_mkey(dev, MLX5_IMR_KSM_CACHE_ENTRY,
+				      access_flags);
 	if (IS_ERR(imr)) {
 		ib_umem_odp_release(umem_odp);
 		return imr;
@@ -1604,7 +1605,7 @@ mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 	return err;
 }
 
-void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
+void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent)
 {
 	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
 		return;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5832d6614606..8191140454e1 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1077,10 +1077,10 @@ enum {
 };
 
 enum {
-	MR_CACHE_LAST_STD_ENTRY = 20,
+	MKEY_CACHE_LAST_STD_ENTRY = 20,
 	MLX5_IMR_MTT_CACHE_ENTRY,
 	MLX5_IMR_KSM_CACHE_ENTRY,
-	MAX_MR_CACHE_ENTRIES
+	MAX_MKEY_CACHE_ENTRIES
 };
 
 /* Async-atomic event notifier used by mlx5 core to forward FW
@@ -1142,7 +1142,7 @@ struct mlx5_profile {
 	struct {
 		int	size;
 		int	limit;
-	} mr_cache[MAX_MR_CACHE_ENTRIES];
+	} mr_cache[MAX_MKEY_CACHE_ENTRIES];
 };
 
 enum {
-- 
2.31.1


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH mlx5-next 4/5] RDMA/mlx5: Change the cache structure to an rbtree
  2021-06-22 12:08 [PATCH rdma-next 0/5] mlx5 MR cache enhancements Leon Romanovsky
                   ` (2 preceding siblings ...)
  2021-06-22 12:08 ` [PATCH mlx5-next 3/5] RDMA/mlx5: Change the cache to hold mkeys instead of MRs Leon Romanovsky
@ 2021-06-22 12:08 ` Leon Romanovsky
  2021-07-29 19:45   ` Jason Gunthorpe
  2021-06-22 12:08 ` [PATCH rdma-next 5/5] RDMA/mlx5: Delay the deregistration of a non-cache mkey Leon Romanovsky
  4 siblings, 1 reply; 12+ messages in thread
From: Leon Romanovsky @ 2021-06-22 12:08 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Aharon Landau, Jakub Kicinski, Jason Wang, linux-kernel,
	linux-rdma, Michael S. Tsirkin, netdev, Saeed Mahameed,
	Shay Drory, virtualization

From: Aharon Landau <aharonl@nvidia.com>

Currently, the cache structure is a linear array held within
mlx5_ib_dev. Each entry in the array holds a list_head of mkeys whose
sizes are the order of the entry they are in (i.e. in entry number 2
there will be mkeys of size 4, at entry 3 mkeys of size 8, and so on).
The access flags of all cached mkeys are
IB_ACCESS_DISABLE_RELAXED_ORDERING.

This structure does not allow adding new entries to the cache.
Therefore, the cache can only hold mkeys that meet the above conditions
(a size that is a power of 2, and access_flags =
IB_ACCESS_DISABLE_RELAXED_ORDERING).
Later in the series, we would like to allow caching mkeys with different
sizes and different access_flags. Adapt the cache structure for this
purpose.

Change the cache structure to an RB-tree, where every node is an entry
that holds a list of mkeys. The tree key is the access_flags as the MSB
and the mkey size as the LSB. mlx5_ib_dev will hold the root of the tree.
When initializing a device, the default entries will be generated, that
is, entries for mkeys' size = 2^x and access_flag =
IB_ACCESS_DISABLE_RELAXED_ORDERING.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/main.c    |   4 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  34 +++-
 drivers/infiniband/hw/mlx5/mr.c      | 271 ++++++++++++++++++++-------
 drivers/infiniband/hw/mlx5/odp.c     |  43 +++--
 include/linux/mlx5/driver.h          |   4 +-
 5 files changed, 261 insertions(+), 95 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 849bf016d8ae..c46581686258 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4051,7 +4051,7 @@ static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
 {
 	int err;
 
-	err = mlx5_mkey_cache_cleanup(dev);
+	err = mlx5_mkey_cache_tree_cleanup(dev);
 	if (err)
 		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
 
@@ -4154,7 +4154,7 @@ static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
 	dev->umrc.pd = pd;
 
 	sema_init(&dev->umrc.sem, MAX_UMR_WR);
-	ret = mlx5_mkey_cache_init(dev);
+	ret = mlx5_mkey_cache_tree_init(dev);
 	if (ret) {
 		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
 		goto error_4;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index ffb6f1d41f3d..e22eeceae9eb 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -749,7 +749,7 @@ struct mlx5_cache_ent {
 
 
 	char                    name[4];
-	u32                     order;
+	u32			order;
 	u32			xlt;
 	u32			access_mode;
 	u32			page;
@@ -777,11 +777,22 @@ struct mlx5_cache_ent {
 	struct mlx5_ib_dev     *dev;
 	struct work_struct	work;
 	struct delayed_work	dwork;
+
+	struct rb_node		node;
+	unsigned int		entry_flags;
+};
+
+enum {
+	MLX5_CACHE_ENTRY_FLAG_IMR_MTT = (1 << 0),
+	MLX5_CACHE_ENTRY_FLAG_IMR_KSM = (1 << 1),
+	MLX5_CACHE_ENTRY_FLAG_REMOTE_ATOMIC = (1 << 2),
+	MLX5_CACHE_ENTRY_FLAG_RELAXED_ORDERING = (1 << 3),
 };
 
-struct mlx5_mkey_cache {
+struct mlx5_mkey_cache_tree {
+	struct rb_root		cache_root;
+	struct mutex		cache_lock;
 	struct workqueue_struct *wq;
-	struct mlx5_cache_ent	ent[MAX_MKEY_CACHE_ENTRIES];
 	struct dentry		*root;
 	unsigned long		last_add;
 };
@@ -1065,7 +1076,7 @@ struct mlx5_ib_dev {
 	struct mlx5_ib_resources	devr;
 
 	atomic_t			mkey_var;
-	struct mlx5_mkey_cache		cache;
+	struct mlx5_mkey_cache_tree	cache;
 	struct timer_list		delay_timer;
 	/* Prevents soft lock on massive reg MRs */
 	struct mutex			slow_path_mutex;
@@ -1313,8 +1324,8 @@ void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
 			  u64 access_flags);
 void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
 int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
-int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
-int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
+int mlx5_mkey_cache_tree_init(struct mlx5_ib_dev *dev);
+int mlx5_mkey_cache_tree_cleanup(struct mlx5_ib_dev *dev);
 
 struct mlx5_ib_mr *mlx5_alloc_special_mkey(struct mlx5_ib_dev *dev,
 					   unsigned int entry,
@@ -1335,6 +1346,9 @@ int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
 				struct ib_dm_mr_attr *attr,
 				struct uverbs_attr_bundle *attrs);
+struct mlx5_cache_ent *mlx5_ib_create_cache_ent(struct mlx5_ib_dev *dev,
+						int entry_flags, int size,
+						int order);
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
@@ -1342,7 +1356,7 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq);
 void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
-void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent);
+int mlx5_odp_init_mkey_cache_entry(struct mlx5_ib_dev *dev, int ent_num);
 void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 			   struct mlx5_ib_mr *mr, int flags);
 
@@ -1361,7 +1375,11 @@ static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev,
 static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void)				    {}
-static inline void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) {}
+static inline int mlx5_odp_init_mkey_cache_entry(struct mlx5_ib_dev *dev,
+						 int ent_num)
+{
+	return 0;
+}
 static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 					 struct mlx5_ib_mr *mr, int flags) {}
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 8d7de4eddc11..7c67aa4f1f1e 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -432,20 +432,30 @@ static const struct file_operations limit_fops = {
 	.read	= limit_read,
 };
 
-static bool someone_adding(struct mlx5_mkey_cache *cache)
+static bool is_special_ent(int ent_flags)
 {
-	unsigned int i;
+	return ent_flags &
+	       (MLX5_CACHE_ENTRY_FLAG_IMR_MTT | MLX5_CACHE_ENTRY_FLAG_IMR_KSM);
+}
 
-	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
-		struct mlx5_cache_ent *ent = &cache->ent[i];
-		bool ret;
+static bool someone_adding(struct mlx5_mkey_cache_tree *cache)
+{
+	struct mlx5_cache_ent *ent;
+	struct rb_node *node;
+	bool ret;
 
+	mutex_lock(&cache->cache_lock);
+	for (node = rb_first(&cache->cache_root); node; node = rb_next(node)) {
+		ent = container_of(node, struct mlx5_cache_ent, node);
 		spin_lock_irq(&ent->lock);
 		ret = ent->available_mkeys < ent->limit;
 		spin_unlock_irq(&ent->lock);
-		if (ret)
+		if (ret) {
+			mutex_unlock(&cache->cache_lock);
 			return true;
+		}
 	}
+	mutex_unlock(&cache->cache_lock);
 	return false;
 }
 
@@ -486,7 +496,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 static void __cache_work_func(struct mlx5_cache_ent *ent)
 {
 	struct mlx5_ib_dev *dev = ent->dev;
-	struct mlx5_mkey_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache_tree *cache = &dev->cache;
 	int err;
 
 	spin_lock_irq(&ent->lock);
@@ -563,29 +573,142 @@ static void cache_work_func(struct work_struct *work)
 	__cache_work_func(ent);
 }
 
+static int mlx5_ent_access_flags(struct mlx5_ib_dev *dev, int access_flags)
+{
+	int ret = 0;
+
+	if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+	    MLX5_CAP_GEN(dev->mdev, atomic) &&
+	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
+		ret |= MLX5_CACHE_ENTRY_FLAG_REMOTE_ATOMIC;
+
+	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
+	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
+	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
+		ret |= MLX5_CACHE_ENTRY_FLAG_RELAXED_ORDERING;
+
+	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
+	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) &&
+	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
+		ret |= MLX5_CACHE_ENTRY_FLAG_RELAXED_ORDERING;
+
+	return ret;
+
+}
+
+static int ent_insert(struct mlx5_mkey_cache_tree *cache,
+		      struct mlx5_cache_ent *ent)
+{
+	struct rb_node **new = &cache->cache_root.rb_node, *parent = NULL;
+	struct mlx5_cache_ent *this;
+
+	/* Figure out where to put new node */
+	while (*new) {
+		this = container_of(*new, struct mlx5_cache_ent, node);
+		parent = *new;
+		if (ent->entry_flags < this->entry_flags)
+			new = &((*new)->rb_left);
+		else if (ent->entry_flags > this->entry_flags)
+			new = &((*new)->rb_right);
+		else {
+			if (ent->xlt < this->xlt)
+				new = &((*new)->rb_left);
+			else if (ent->xlt > this->xlt)
+				new = &((*new)->rb_right);
+			else
+				return -EEXIST;
+		}
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&ent->node, parent, new);
+	rb_insert_color(&ent->node, &cache->cache_root);
+
+	return 0;
+}
+
+struct mlx5_cache_ent *mlx5_ib_create_cache_ent(struct mlx5_ib_dev *dev,
+						int entry_flags, int xlt_size,
+						int order)
+{
+	struct mlx5_cache_ent *ent;
+	int ret;
+
+	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+	if (!ent)
+		return ERR_PTR(-ENOMEM);
+	INIT_LIST_HEAD(&ent->head);
+	spin_lock_init(&ent->lock);
+	ent->entry_flags = entry_flags;
+	ent->xlt = xlt_size;
+	ent->order = order;
+	ent->dev = dev;
+
+	INIT_WORK(&ent->work, cache_work_func);
+	INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
+
+	mutex_lock(&dev->cache.cache_lock);
+	ret = ent_insert(&dev->cache, ent);
+	mutex_unlock(&dev->cache.cache_lock);
+	if (ret) {
+		kfree(ent);
+		return ERR_PTR(ret);
+	}
+	return ent;
+}
+
+static struct mlx5_cache_ent *mkey_cache_ent_from_size(struct mlx5_ib_dev *dev,
+						       int ent_flags, int size)
+{
+	struct rb_node *node = dev->cache.cache_root.rb_node;
+	struct mlx5_cache_ent *cur, *prev = NULL;
+
+	WARN_ON(!mutex_is_locked(&dev->cache.cache_lock));
+	while (node) {
+		cur = container_of(node, struct mlx5_cache_ent, node);
+
+		if (cur->entry_flags > ent_flags)
+			node = node->rb_left;
+		else if (cur->entry_flags < ent_flags)
+			node = node->rb_right;
+		else {
+			if (cur->xlt > size) {
+				prev = cur;
+				node = node->rb_left;
+			} else if (cur->xlt < size)
+				node = node->rb_right;
+			else
+				return cur;
+		}
+	}
+	return prev;
+}
+
 /* Get an Mkey from a special cache entry */
 struct mlx5_ib_mr *mlx5_alloc_special_mkey(struct mlx5_ib_dev *dev,
 					   unsigned int entry, int access_flags)
 {
-	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5r_cache_mkey *cmkey;
 	struct mlx5_cache_ent *ent;
 	struct mlx5_ib_mr *mr;
+	int ent_flags;
 	int err;
 
-	if (WARN_ON(entry <= MKEY_CACHE_LAST_STD_ENTRY ||
-		    entry >= ARRAY_SIZE(cache->ent)))
+	if (WARN_ON(!is_special_ent(entry)))
 		return ERR_PTR(-EINVAL);
 
-	/* Matches access in alloc_cache_mr() */
-	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
+	ent_flags = entry | mlx5_ent_access_flags(dev, access_flags);
+
+	mutex_lock(&dev->cache.cache_lock);
+	ent = mkey_cache_ent_from_size(dev, ent_flags, 0);
+	mutex_unlock(&dev->cache.cache_lock);
+	if (!ent)
 		return ERR_PTR(-EOPNOTSUPP);
 
 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	ent = &cache->ent[entry];
 	spin_lock_irq(&ent->lock);
 	if (list_empty(&ent->head)) {
 		spin_unlock_irq(&ent->lock);
@@ -616,13 +739,18 @@ struct mlx5_ib_mr *mlx5_alloc_special_mkey(struct mlx5_ib_dev *dev,
 static struct mlx5r_cache_mkey *get_cache_mkey(struct mlx5_cache_ent *req_ent)
 {
 	struct mlx5_ib_dev *dev = req_ent->dev;
-	struct mlx5_cache_ent *ent = req_ent;
 	struct mlx5r_cache_mkey *cmkey;
+	struct mlx5_cache_ent *ent;
+	struct rb_node *node;
 
 	/* Try larger Mkey pools from the cache to satisfy the allocation */
-	for (; ent != &dev->cache.ent[MKEY_CACHE_LAST_STD_ENTRY + 1]; ent++) {
-		mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order,
-			    ent - dev->cache.ent);
+	mutex_lock(&dev->cache.cache_lock);
+	for (node = &req_ent->node; node; node = rb_next(node)) {
+		ent = container_of(node, struct mlx5_cache_ent, node);
+
+		if (ent->entry_flags != req_ent->entry_flags)
+			break;
+		mlx5_ib_dbg(dev, "size %d\n", ent->xlt);
 
 		spin_lock_irq(&ent->lock);
 		if (!list_empty(&ent->head)) {
@@ -632,11 +760,13 @@ static struct mlx5r_cache_mkey *get_cache_mkey(struct mlx5_cache_ent *req_ent)
 			ent->available_mkeys--;
 			queue_adjust_cache_locked(ent);
 			spin_unlock_irq(&ent->lock);
+			mutex_unlock(&dev->cache.cache_lock);
 			return cmkey;
 		}
 		queue_adjust_cache_locked(ent);
 		spin_unlock_irq(&ent->lock);
 	}
+	mutex_unlock(&dev->cache.cache_lock);
 	req_ent->miss++;
 	return NULL;
 }
@@ -662,10 +792,8 @@ static int mlx5_free_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	return 0;
 }
 
-static void clean_keys(struct mlx5_ib_dev *dev, int c)
+static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
 {
-	struct mlx5_mkey_cache *cache = &dev->cache;
-	struct mlx5_cache_ent *ent = &cache->ent[c];
 	struct mlx5r_cache_mkey *tmp_mkey, *mkey;
 	LIST_HEAD(del_list);
 
@@ -691,7 +819,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
 	}
 }
 
-static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_mkey_cache_tree_debugfs_cleanup(struct mlx5_ib_dev *dev)
 {
 	if (!mlx5_debugfs_root || dev->is_rep)
 		return;
@@ -700,20 +828,25 @@ static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 	dev->cache.root = NULL;
 }
 
-static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
+static void mlx5_mkey_cache_tree_debugfs_init(struct mlx5_ib_dev *dev)
 {
-	struct mlx5_mkey_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache_tree *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
+	struct rb_node *node;
 	struct dentry *dir;
-	int i;
 
 	if (!mlx5_debugfs_root || dev->is_rep)
 		return;
 
 	cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
 
-	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
-		ent = &cache->ent[i];
+	mutex_lock(&dev->cache.cache_lock);
+	for (node = rb_first(&cache->cache_root); node; node = rb_next(node)) {
+		ent = container_of(node, struct mlx5_cache_ent, node);
+
+		if (!ent->order)
+			continue;
+
 		sprintf(ent->name, "%d", ent->order);
 		dir = debugfs_create_dir(ent->name, cache->root);
 		debugfs_create_file("size", 0600, dir, ent, &size_fops);
@@ -721,6 +854,7 @@ static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
 		debugfs_create_u32("cur", 0400, dir, &ent->available_mkeys);
 		debugfs_create_u32("miss", 0600, dir, &ent->miss);
 	}
+	mutex_unlock(&dev->cache.cache_lock);
 }
 
 static void delay_time_func(struct timer_list *t)
@@ -730,13 +864,16 @@ static void delay_time_func(struct timer_list *t)
 	WRITE_ONCE(dev->fill_delay, 0);
 }
 
-int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
+int mlx5_mkey_cache_tree_init(struct mlx5_ib_dev *dev)
 {
-	struct mlx5_mkey_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache_tree *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
+	int err;
 	int i;
 
 	mutex_init(&dev->slow_path_mutex);
+	mutex_init(&cache->cache_lock);
+	cache->cache_root = RB_ROOT;
 	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
 	if (!cache->wq) {
 		mlx5_ib_warn(dev, "failed to create work queue\n");
@@ -745,28 +882,25 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
 
 	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 	timer_setup(&dev->delay_timer, delay_time_func, 0);
-	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
-		ent = &cache->ent[i];
-		INIT_LIST_HEAD(&ent->head);
-		spin_lock_init(&ent->lock);
-		ent->order = i + 2;
-		ent->dev = dev;
-		ent->limit = 0;
-
-		INIT_WORK(&ent->work, cache_work_func);
-		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
+	for (i = 0; i < MAX_MKEY_CACHE_DEFAULT_ENTRIES; i++) {
+		u8 order = i + 2;
+		u32 xlt_size = (1 << order) * sizeof(struct mlx5_mtt) /
+			       MLX5_IB_UMR_OCTOWORD;
 
 		if (i > MKEY_CACHE_LAST_STD_ENTRY) {
-			mlx5_odp_init_mkey_cache_entry(ent);
+			err = mlx5_odp_init_mkey_cache_entry(dev, i);
+			if (err)
+				return err;
 			continue;
 		}
 
-		if (ent->order > mkey_cache_max_order(dev))
+		ent = mlx5_ib_create_cache_ent(dev, 0, xlt_size, order);
+		if (IS_ERR(ent))
+			return PTR_ERR(ent);
+		if (order > mkey_cache_max_order(dev))
 			continue;
 
 		ent->page = PAGE_SHIFT;
-		ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
-			   MLX5_IB_UMR_OCTOWORD;
 		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
 		if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
 		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
@@ -778,22 +912,22 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
 		queue_adjust_cache_locked(ent);
 		spin_unlock_irq(&ent->lock);
 	}
-
-	mlx5_mkey_cache_debugfs_init(dev);
-
+	mlx5_mkey_cache_tree_debugfs_init(dev);
 	return 0;
 }
 
-int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
+int mlx5_mkey_cache_tree_cleanup(struct mlx5_ib_dev *dev)
 {
-	unsigned int i;
+	struct rb_root *root = &dev->cache.cache_root;
+	struct mlx5_cache_ent *ent, *tmp;
+	struct rb_node *node;
 
 	if (!dev->cache.wq)
 		return 0;
 
-	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
-		struct mlx5_cache_ent *ent = &dev->cache.ent[i];
-
+	mutex_lock(&dev->cache.cache_lock);
+	for (node = rb_first(root); node; node = rb_next(node)) {
+		ent = container_of(node, struct mlx5_cache_ent, node);
 		spin_lock_irq(&ent->lock);
 		ent->disabled = true;
 		spin_unlock_irq(&ent->lock);
@@ -801,11 +935,15 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
 		cancel_delayed_work_sync(&ent->dwork);
 	}
 
-	mlx5_mkey_cache_debugfs_cleanup(dev);
+	mlx5_mkey_cache_tree_debugfs_cleanup(dev);
 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
 
-	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++)
-		clean_keys(dev, i);
+	rbtree_postorder_for_each_entry_safe(ent, tmp, root, node) {
+		clean_keys(dev, ent);
+		rb_erase(&ent->node, root);
+		kfree(ent);
+	}
+	mutex_unlock(&dev->cache.cache_lock);
 
 	destroy_workqueue(dev->cache.wq);
 	del_timer_sync(&dev->delay_timer);
@@ -921,19 +1059,6 @@ static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
 	return err;
 }
 
-static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
-							unsigned int order)
-{
-	struct mlx5_mkey_cache *cache = &dev->cache;
-
-	if (order < cache->ent[0].order)
-		return &cache->ent[0];
-	order = order - cache->ent[0].order;
-	if (order > MKEY_CACHE_LAST_STD_ENTRY)
-		return NULL;
-	return &cache->ent[order];
-}
-
 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 			  u64 length, int access_flags)
 {
@@ -964,6 +1089,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 	struct mlx5_cache_ent *ent;
 	struct mlx5_ib_mr *mr;
 	unsigned int page_size;
+	int ent_flags;
+	int xlt_size;
 	int ret;
 
 	if (umem->is_dmabuf)
@@ -973,14 +1100,16 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 						     0, iova);
 	if (WARN_ON(!page_size))
 		return ERR_PTR(-EINVAL);
-	ent = mkey_cache_ent_from_order(
-		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
+	ent_flags = mlx5_ent_access_flags(dev, access_flags);
+	xlt_size = get_octo_len(iova, umem->length, order_base_2(page_size));
+	mutex_lock(&dev->cache.cache_lock);
+	ent = mkey_cache_ent_from_size(dev, ent_flags, xlt_size);
+	mutex_unlock(&dev->cache.cache_lock);
 	/*
 	 * Matches access in alloc_cache_mr(). If the MR can't come from the
 	 * cache then synchronously create an uncached one.
 	 */
-	if (!ent || ent->limit == 0 ||
-	    !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) {
+	if (!ent || ent->limit == 0) {
 		mutex_lock(&dev->slow_path_mutex);
 		mr = reg_create(pd, umem, iova, access_flags, page_size, false);
 		mutex_unlock(&dev->slow_path_mutex);
@@ -1774,7 +1903,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
 	if (WARN_ON(!*page_size))
 		return false;
 	return (1ULL << mr->mmkey.cache_ent->order) >=
-	       ib_umem_num_dma_blocks(new_umem, *page_size);
+			       ib_umem_num_dma_blocks(new_umem, *page_size);
 }
 
 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 9c7942118d2c..e4a78b4c6034 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -418,7 +418,8 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	if (IS_ERR(odp))
 		return ERR_CAST(odp);
 
-	mr = mlx5_alloc_special_mkey(mr_to_mdev(imr), MLX5_IMR_MTT_CACHE_ENTRY,
+	mr = mlx5_alloc_special_mkey(mr_to_mdev(imr),
+				     MLX5_CACHE_ENTRY_FLAG_IMR_MTT,
 				     imr->access_flags);
 	if (IS_ERR(mr)) {
 		ib_umem_odp_release(odp);
@@ -493,7 +494,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	if (IS_ERR(umem_odp))
 		return ERR_CAST(umem_odp);
 
-	imr = mlx5_alloc_special_mkey(dev, MLX5_IMR_KSM_CACHE_ENTRY,
+	imr = mlx5_alloc_special_mkey(dev, MLX5_CACHE_ENTRY_FLAG_IMR_KSM,
 				      access_flags);
 	if (IS_ERR(imr)) {
 		ib_umem_odp_release(umem_odp);
@@ -1605,30 +1606,48 @@ mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 	return err;
 }
 
-void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent)
+int mlx5_odp_init_mkey_cache_entry(struct mlx5_ib_dev *dev, int ent_num)
 {
-	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
-		return;
+	struct mlx5_cache_ent *ent;
+	int ent_flags;
+	u32 xlt_size;
+
+	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+		return 0;
 
-	switch (ent->order - 2) {
+	switch (ent_num) {
 	case MLX5_IMR_MTT_CACHE_ENTRY:
-		ent->page = PAGE_SHIFT;
-		ent->xlt = MLX5_IMR_MTT_ENTRIES *
-			   sizeof(struct mlx5_mtt) /
+		xlt_size = MLX5_IMR_MTT_ENTRIES * sizeof(struct mlx5_mtt) /
 			   MLX5_IB_UMR_OCTOWORD;
+		ent_flags = MLX5_CACHE_ENTRY_FLAG_IMR_MTT;
+
+		ent = mlx5_ib_create_cache_ent(dev, ent_flags, xlt_size,
+					       ent_num + 2);
+		if (IS_ERR(ent))
+			return PTR_ERR(ent);
+
+		ent->page = PAGE_SHIFT;
 		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
 		ent->limit = 0;
 		break;
 
 	case MLX5_IMR_KSM_CACHE_ENTRY:
-		ent->page = MLX5_KSM_PAGE_SHIFT;
-		ent->xlt = mlx5_imr_ksm_entries *
-			   sizeof(struct mlx5_klm) /
+		xlt_size = mlx5_imr_ksm_entries * sizeof(struct mlx5_klm) /
 			   MLX5_IB_UMR_OCTOWORD;
+		ent_flags = MLX5_CACHE_ENTRY_FLAG_IMR_KSM;
+
+		ent = mlx5_ib_create_cache_ent(dev, ent_flags, xlt_size,
+					       ent_num + 2);
+		if (IS_ERR(ent))
+			return PTR_ERR(ent);
+
+		ent->page = MLX5_KSM_PAGE_SHIFT;
 		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
 		ent->limit = 0;
 		break;
 	}
+
+	return 0;
 }
 
 static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 8191140454e1..bb459a2ca18c 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1080,7 +1080,7 @@ enum {
 	MKEY_CACHE_LAST_STD_ENTRY = 20,
 	MLX5_IMR_MTT_CACHE_ENTRY,
 	MLX5_IMR_KSM_CACHE_ENTRY,
-	MAX_MKEY_CACHE_ENTRIES
+	MAX_MKEY_CACHE_DEFAULT_ENTRIES
 };
 
 /* Async-atomic event notifier used by mlx5 core to forward FW
@@ -1142,7 +1142,7 @@ struct mlx5_profile {
 	struct {
 		int	size;
 		int	limit;
-	} mr_cache[MAX_MKEY_CACHE_ENTRIES];
+	} mr_cache[MAX_MKEY_CACHE_DEFAULT_ENTRIES];
 };
 
 enum {
-- 
2.31.1


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH rdma-next 5/5] RDMA/mlx5: Delay the deregistration of a non-cache mkey
  2021-06-22 12:08 [PATCH rdma-next 0/5] mlx5 MR cache enhancements Leon Romanovsky
                   ` (3 preceding siblings ...)
  2021-06-22 12:08 ` [PATCH mlx5-next 4/5] RDMA/mlx5: Change the cache structure to an rbtree Leon Romanovsky
@ 2021-06-22 12:08 ` Leon Romanovsky
  4 siblings, 0 replies; 12+ messages in thread
From: Leon Romanovsky @ 2021-06-22 12:08 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Aharon Landau, David S. Miller, Jakub Kicinski, Jason Wang,
	linux-kernel, linux-rdma, Michael S. Tsirkin, netdev,
	Saeed Mahameed, Shay Drory, virtualization

From: Aharon Landau <aharonl@nvidia.com>

When restarting an application with a large number of MRs with
non-cached mkeys, all the mkeys will be destroyed and then recreated.
This process takes a long time (about 1.4 seconds for deregistration and
2.3 seconds for registration of 10,000 MRs).

To shorten the restart runtime, insert the mkeys temporarily into the
cache and schedule a delayed work to destroy them later.
If an application is restarted, the mkeys will still be in the cache
when trying to register them again; therefore, the registration will be
faster (about 0.7 seconds for deregistration and 0.9 seconds for
registration of 10,000 MRs).

If 30 seconds have passed and no user reclaimed the temporarily cached
mkeys, the scheduled work will destroy them.

The above results are from a machine with:
Intel(R) Xeon(R) CPU E5-2620 v3 @ 2.40GHz 8 cores.
ConnectX-5 Ex VPI adapter card; EDR IB (100Gb/s).

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |   1 +
 drivers/infiniband/hw/mlx5/mr.c      | 210 +++++++++++++++++++++------
 2 files changed, 163 insertions(+), 48 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index e22eeceae9eb..6043a42e8dda 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -795,6 +795,7 @@ struct mlx5_mkey_cache_tree {
 	struct workqueue_struct *wq;
 	struct dentry		*root;
 	unsigned long		last_add;
+	struct delayed_work	remove_ent_dwork;
 };
 
 struct mlx5_ib_port_resources {
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 7c67aa4f1f1e..916e80a276fb 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -438,6 +438,35 @@ static bool is_special_ent(int ent_flags)
 	       (MLX5_CACHE_ENTRY_FLAG_IMR_MTT | MLX5_CACHE_ENTRY_FLAG_IMR_KSM);
 }
 
+#define ent_is_tmp(ent) (ent->limit == 0 && !is_special_ent(ent->entry_flags))
+
+static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
+{
+	struct mlx5r_cache_mkey *tmp_mkey, *mkey;
+	LIST_HEAD(del_list);
+
+	cancel_delayed_work(&ent->dwork);
+	while (1) {
+		spin_lock_irq(&ent->lock);
+		if (list_empty(&ent->head)) {
+			spin_unlock_irq(&ent->lock);
+			break;
+		}
+		mkey = list_first_entry(&ent->head, struct mlx5r_cache_mkey,
+					list);
+		list_move(&mkey->list, &del_list);
+		ent->available_mkeys--;
+		ent->total_mkeys--;
+		spin_unlock_irq(&ent->lock);
+		mlx5_core_destroy_mkey(dev->mdev, &mkey->key);
+	}
+
+	list_for_each_entry_safe(mkey, tmp_mkey, &del_list, list) {
+		list_del(&mkey->list);
+		kfree(mkey);
+	}
+}
+
 static bool someone_adding(struct mlx5_mkey_cache_tree *cache)
 {
 	struct mlx5_cache_ent *ent;
@@ -468,7 +497,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 {
 	lockdep_assert_held(&ent->lock);
 
-	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
+	if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent_is_tmp(ent))
 		return;
 	if (ent->available_mkeys < ent->limit) {
 		ent->fill_to_high_water = true;
@@ -573,6 +602,39 @@ static void cache_work_func(struct work_struct *work)
 	__cache_work_func(ent);
 }
 
+#define ent_is_default(ent) (ent->order != 0)
+
+static void remove_ent_work_func(struct work_struct *work)
+{
+	struct mlx5_mkey_cache_tree *cache;
+	struct mlx5_cache_ent *ent;
+	struct rb_node *cur, *next;
+
+	cache = container_of(work, struct mlx5_mkey_cache_tree,
+			     remove_ent_dwork.work);
+	mutex_lock(&cache->cache_lock);
+	cur = rb_last(&cache->cache_root);
+	while (cur) {
+		ent = container_of(cur, struct mlx5_cache_ent, node);
+
+		if (is_special_ent(ent->entry_flags) || ent->limit != 0) {
+			cur = rb_prev(cur);
+			continue;
+		}
+
+		cancel_work_sync(&ent->work);
+		cancel_delayed_work_sync(&ent->dwork);
+		next = rb_prev(cur);
+		clean_keys(ent->dev, ent);
+		if (!ent_is_default(ent)) {
+			rb_erase(&ent->node, &cache->cache_root);
+			kfree(ent);
+		}
+		cur = next;
+	}
+	mutex_unlock(&cache->cache_lock);
+}
+
 static int mlx5_ent_access_flags(struct mlx5_ib_dev *dev, int access_flags)
 {
 	int ret = 0;
@@ -792,33 +854,6 @@ static int mlx5_free_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	return 0;
 }
 
-static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
-{
-	struct mlx5r_cache_mkey *tmp_mkey, *mkey;
-	LIST_HEAD(del_list);
-
-	cancel_delayed_work(&ent->dwork);
-	while (1) {
-		spin_lock_irq(&ent->lock);
-		if (list_empty(&ent->head)) {
-			spin_unlock_irq(&ent->lock);
-			break;
-		}
-		mkey = list_first_entry(&ent->head, struct mlx5r_cache_mkey,
-					list);
-		list_move(&mkey->list, &del_list);
-		ent->available_mkeys--;
-		ent->total_mkeys--;
-		spin_unlock_irq(&ent->lock);
-		mlx5_core_destroy_mkey(dev->mdev, &mkey->key);
-	}
-
-	list_for_each_entry_safe(mkey, tmp_mkey, &del_list, list) {
-		list_del(&mkey->list);
-		kfree(mkey);
-	}
-}
-
 static void mlx5_mkey_cache_tree_debugfs_cleanup(struct mlx5_ib_dev *dev)
 {
 	if (!mlx5_debugfs_root || dev->is_rep)
@@ -874,6 +909,7 @@ int mlx5_mkey_cache_tree_init(struct mlx5_ib_dev *dev)
 	mutex_init(&dev->slow_path_mutex);
 	mutex_init(&cache->cache_lock);
 	cache->cache_root = RB_ROOT;
+	INIT_DELAYED_WORK(&cache->remove_ent_dwork, remove_ent_work_func);
 	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
 	if (!cache->wq) {
 		mlx5_ib_warn(dev, "failed to create work queue\n");
@@ -925,6 +961,7 @@ int mlx5_mkey_cache_tree_cleanup(struct mlx5_ib_dev *dev)
 	if (!dev->cache.wq)
 		return 0;
 
+	cancel_delayed_work_sync(&dev->cache.remove_ent_dwork);
 	mutex_lock(&dev->cache.cache_lock);
 	for (node = rb_first(root); node; node = rb_next(node)) {
 		ent = container_of(node, struct mlx5_cache_ent, node);
@@ -1080,6 +1117,31 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
 	return PAGE_SIZE;
 }
 
+static bool get_temporary_cache_mkey(struct mlx5_cache_ent *ent,
+				     struct mlx5r_mkey *mkey)
+{
+	struct mlx5_ib_dev *dev = ent->dev;
+	struct mlx5r_cache_mkey *cmkey;
+
+	WARN_ON(!mutex_is_locked(&dev->cache.cache_lock));
+	spin_lock_irq(&ent->lock);
+	if (list_empty(&ent->head)) {
+		spin_unlock_irq(&ent->lock);
+		return false;
+	}
+
+	cmkey = list_first_entry(&ent->head, struct mlx5r_cache_mkey, list);
+	list_del(&cmkey->list);
+	ent->available_mkeys--;
+	ent->total_mkeys--;
+	spin_unlock_irq(&ent->lock);
+	queue_delayed_work(dev->cache.wq, &dev->cache.remove_ent_dwork,
+			   msecs_to_jiffies(30 * 1000));
+	mkey->key = cmkey->key;
+	kfree(cmkey);
+	return true;
+}
+
 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 					     struct ib_umem *umem, u64 iova,
 					     int access_flags)
@@ -1104,36 +1166,45 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 	xlt_size = get_octo_len(iova, umem->length, order_base_2(page_size));
 	mutex_lock(&dev->cache.cache_lock);
 	ent = mkey_cache_ent_from_size(dev, ent_flags, xlt_size);
-	mutex_unlock(&dev->cache.cache_lock);
 	/*
 	 * Matches access in alloc_cache_mr(). If the MR can't come from the
 	 * cache then synchronously create an uncached one.
 	 */
-	if (!ent || ent->limit == 0) {
-		mutex_lock(&dev->slow_path_mutex);
-		mr = reg_create(pd, umem, iova, access_flags, page_size, false);
-		mutex_unlock(&dev->slow_path_mutex);
-		return mr;
+	if (!ent || (ent_is_tmp(ent) && ent->xlt != xlt_size)) {
+		mutex_unlock(&dev->cache.cache_lock);
+		goto slow_path;
 	}
 
 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
-	if (!mr)
+	if (!mr) {
+		mutex_unlock(&dev->cache.cache_lock);
 		return ERR_PTR(-ENOMEM);
+	}
 
-	cmkey = get_cache_mkey(ent);
-	if (cmkey) {
-		mr->mmkey.key = cmkey->key;
-		mr->mmkey.cache_ent = cmkey->cache_ent;
-		kfree(cmkey);
-	} else {
-		ret = create_cacheable_mkey(ent, &mr->mmkey);
-		/*
-		 * The above already tried to do the same stuff as reg_create(),
-		 * no reason to try it again.
-		 */
-		if (ret) {
+	if (ent_is_tmp(ent)) {
+		ret = get_temporary_cache_mkey(ent, &mr->mmkey);
+		mutex_unlock(&dev->cache.cache_lock);
+		if (!ret) {
 			kfree(mr);
-			return ERR_PTR(ret);
+			goto slow_path;
+		}
+	} else {
+		mutex_unlock(&dev->cache.cache_lock);
+		cmkey = get_cache_mkey(ent);
+		if (cmkey) {
+			mr->mmkey.key = cmkey->key;
+			mr->mmkey.cache_ent = cmkey->cache_ent;
+			kfree(cmkey);
+		} else {
+			ret = create_cacheable_mkey(ent, &mr->mmkey);
+			/*
+			 * The above already tried to do the same stuff as
+			 * reg_create(), no reason to try it again.
+			 */
+			if (ret) {
+				kfree(mr);
+				return ERR_PTR(ret);
+			}
 		}
 	}
 	mr->ibmr.pd = pd;
@@ -1147,6 +1218,12 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 	set_mr_fields(dev, mr, umem->length, access_flags);
 
 	return mr;
+
+slow_path:
+	mutex_lock(&dev->slow_path_mutex);
+	mr = reg_create(pd, umem, iova, access_flags, page_size, false);
+	mutex_unlock(&dev->slow_path_mutex);
+	return mr;
 }
 
 #define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
@@ -2055,6 +2132,41 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 				   new_access_flags, udata);
 }
 
+static void insert_mkey_tmp_to_cache(struct mlx5_ib_dev *dev,
+				     struct mlx5_ib_mr *mr)
+{
+	struct mlx5_mkey_cache_tree *cache = &dev->cache;
+	struct ib_umem *umem = mr->umem;
+	struct mlx5_cache_ent *ent;
+	int ent_flags;
+	int xlt_size;
+
+	if (mr->mmkey.cache_ent)
+		return;
+	if (!umem || !mlx5_ib_can_load_pas_with_umr(dev, umem->length))
+		return;
+
+	ent_flags = mlx5_ent_access_flags(dev, mr->access_flags);
+	xlt_size = get_octo_len(umem->iova, umem->length, mr->page_shift);
+	mutex_lock(&cache->cache_lock);
+	queue_delayed_work(cache->wq, &cache->remove_ent_dwork,
+			   msecs_to_jiffies(30 * 1000));
+	ent = mkey_cache_ent_from_size(dev, ent_flags, xlt_size);
+	if (!ent || ent->xlt != xlt_size) {
+		mutex_unlock(&cache->cache_lock);
+		ent = mlx5_ib_create_cache_ent(dev, ent_flags, xlt_size, 0);
+		if (IS_ERR(ent))
+			return;
+		mutex_lock(&cache->cache_lock);
+	}
+
+	spin_lock_irq(&ent->lock);
+	ent->total_mkeys++;
+	spin_unlock_irq(&ent->lock);
+	mutex_unlock(&cache->cache_lock);
+	mr->mmkey.cache_ent = ent;
+}
+
 static int
 mlx5_alloc_priv_descs(struct ib_device *device,
 		      struct mlx5_ib_mr *mr,
@@ -2147,6 +2259,8 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 		mr->sig = NULL;
 	}
 
+	insert_mkey_tmp_to_cache(dev, mr);
+
 	/* Stop DMA */
 	if (mr->mmkey.cache_ent) {
 		if (revoke_mr(mr) || mlx5_free_mkey(dev, mr)) {
-- 
2.31.1


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH mlx5-next 1/5] RDMA/mlx5: Replace struct mlx5_core_mkey by u32 key
  2021-06-22 12:08 ` [PATCH mlx5-next 1/5] RDMA/mlx5: Replace struct mlx5_core_mkey by u32 key Leon Romanovsky
@ 2021-07-29 15:28   ` Jason Gunthorpe
  2021-07-29 17:27     ` Leon Romanovsky
  2021-07-29 18:08   ` Jason Gunthorpe
  1 sibling, 1 reply; 12+ messages in thread
From: Jason Gunthorpe @ 2021-07-29 15:28 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Doug Ledford, Aharon Landau, Jakub Kicinski, Jason Wang,
	linux-kernel, linux-rdma, Michael S. Tsirkin, netdev,
	Saeed Mahameed, Shay Drory, virtualization

On Tue, Jun 22, 2021 at 03:08:19PM +0300, Leon Romanovsky wrote:

> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
> index 50af84e76fb6..7a76b5eb1c1a 100644
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
> @@ -35,13 +35,11 @@
>  #include <linux/mlx5/driver.h>
>  #include "mlx5_core.h"
>  
> -int mlx5_core_create_mkey(struct mlx5_core_dev *dev,
> -			  struct mlx5_core_mkey *mkey,
> -			  u32 *in, int inlen)
> +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *in,
> +			  int inlen)
>  {
>  	u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {};
>  	u32 mkey_index;
> -	void *mkc;
>  	int err;
>  
>  	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
> @@ -50,38 +48,32 @@ int mlx5_core_create_mkey(struct mlx5_core_dev *dev,
>  	if (err)
>  		return err;
>  
> -	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
>  	mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index);
> -	mkey->iova = MLX5_GET64(mkc, mkc, start_addr);
> -	mkey->size = MLX5_GET64(mkc, mkc, len);
> -	mkey->key |= mlx5_idx_to_mkey(mkey_index);
> -	mkey->pd = MLX5_GET(mkc, mkc, pd);
> -	init_waitqueue_head(&mkey->wait);
> +	*mkey |= mlx5_idx_to_mkey(mkey_index);


This conflicts with 0232fc2ddcf4 ("net/mlx5: Reset mkey index on creation")

Please resend/rebase. I think it should be fixed like

	mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index);
	*mkey = (u32)mlx5_mkey_variant(mkey->key) | mlx5_idx_to_mkey(mkey_index);

	mlx5_core_dbg(dev, "out 0x%x, mkey 0x%x\n", mkey_index,	*mkey);
?

(though I will look at the rest of the series today, so don't rush on
this)

Jason

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH mlx5-next 1/5] RDMA/mlx5: Replace struct mlx5_core_mkey by u32 key
  2021-07-29 15:28   ` Jason Gunthorpe
@ 2021-07-29 17:27     ` Leon Romanovsky
  0 siblings, 0 replies; 12+ messages in thread
From: Leon Romanovsky @ 2021-07-29 17:27 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Doug Ledford, Aharon Landau, Jakub Kicinski, Jason Wang,
	linux-kernel, linux-rdma, Michael S. Tsirkin, netdev,
	Saeed Mahameed, Shay Drory, virtualization

On Thu, Jul 29, 2021 at 12:28:03PM -0300, Jason Gunthorpe wrote:
> On Tue, Jun 22, 2021 at 03:08:19PM +0300, Leon Romanovsky wrote:
> 
> > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
> > index 50af84e76fb6..7a76b5eb1c1a 100644
> > +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
> > @@ -35,13 +35,11 @@
> >  #include <linux/mlx5/driver.h>
> >  #include "mlx5_core.h"
> >  
> > -int mlx5_core_create_mkey(struct mlx5_core_dev *dev,
> > -			  struct mlx5_core_mkey *mkey,
> > -			  u32 *in, int inlen)
> > +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *in,
> > +			  int inlen)
> >  {
> >  	u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {};
> >  	u32 mkey_index;
> > -	void *mkc;
> >  	int err;
> >  
> >  	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
> > @@ -50,38 +48,32 @@ int mlx5_core_create_mkey(struct mlx5_core_dev *dev,
> >  	if (err)
> >  		return err;
> >  
> > -	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
> >  	mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index);
> > -	mkey->iova = MLX5_GET64(mkc, mkc, start_addr);
> > -	mkey->size = MLX5_GET64(mkc, mkc, len);
> > -	mkey->key |= mlx5_idx_to_mkey(mkey_index);
> > -	mkey->pd = MLX5_GET(mkc, mkc, pd);
> > -	init_waitqueue_head(&mkey->wait);
> > +	*mkey |= mlx5_idx_to_mkey(mkey_index);
> 
> 
> This conflicts with 0232fc2ddcf4 ("net/mlx5: Reset mkey index on creation")
> 
> Please resend/rebase. I think it should be fixed like
> 
> 	mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index);
> 	*mkey = (u32)mlx5_mkey_variant(mkey->key) | mlx5_idx_to_mkey(mkey_index);
> 
> 	mlx5_core_dbg(dev, "out 0x%x, mkey 0x%x\n", mkey_index,	*mkey);
> ?

Yes, this is how it is fixed in my tree. I just waited till you finish the review.

> 
> (though I will look at the rest of the series today, so don't rush on
> this)
> 
> Jason

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH mlx5-next 1/5] RDMA/mlx5: Replace struct mlx5_core_mkey by u32 key
  2021-06-22 12:08 ` [PATCH mlx5-next 1/5] RDMA/mlx5: Replace struct mlx5_core_mkey by u32 key Leon Romanovsky
  2021-07-29 15:28   ` Jason Gunthorpe
@ 2021-07-29 18:08   ` Jason Gunthorpe
  1 sibling, 0 replies; 12+ messages in thread
From: Jason Gunthorpe @ 2021-07-29 18:08 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Doug Ledford, Aharon Landau, Jakub Kicinski, Jason Wang,
	linux-kernel, linux-rdma, Michael S. Tsirkin, netdev,
	Saeed Mahameed, Shay Drory, virtualization

On Tue, Jun 22, 2021 at 03:08:19PM +0300, Leon Romanovsky wrote:

> diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
> index 03dc6c22843f..ae0472d92801 100644
> +++ b/drivers/infiniband/hw/mlx5/mr.c
> @@ -89,24 +89,39 @@ static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
>  	MLX5_SET64(mkc, mkc, start_addr, start_addr);
>  }
>  
> -static void
> -assign_mkey_variant(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
> -		    u32 *in)
> +static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
>  {
>  	u8 key = atomic_inc_return(&dev->mkey_var);
>  	void *mkc;
>  
>  	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
>  	MLX5_SET(mkc, mkc, mkey_7_0, key);
> -	mkey->key = key;
> +	*mkey = key;
> +}

Can this be tidied please? We set both mkey_7_0 and mkey then pass
them into mlx5_core_create_mkey which then does

	*mkey = (u32)mlx5_mkey_variant(*mkey) | mlx5_idx_to_mkey(mkey_index);

But isn't mlx5_mkey_variant(*mkey) just MLX5_GET(mkc, in, mkey_7_0)
and we can get rid of this confusing sequence?

> +
> +static void set_mkey_fields(void *mkc, struct mlx5_core_mkey *mkey)
> +{
> +	mkey->iova = MLX5_GET64(mkc, mkc, start_addr);
> +	mkey->size = MLX5_GET64(mkc, mkc, len);
> +	mkey->pd = MLX5_GET(mkc, mkc, pd);
> +	init_waitqueue_head(&mkey->wait);
>  }

Why isn't this called through the create_mkey_callback() path? I think
evey mkey should always have a valid waitqueue

Jason

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH mlx5-next 2/5] RDMA/mlx5: Move struct mlx5_core_mkey to mlx5_ib
  2021-06-22 12:08 ` [PATCH mlx5-next 2/5] RDMA/mlx5: Move struct mlx5_core_mkey to mlx5_ib Leon Romanovsky
@ 2021-07-29 18:39   ` Jason Gunthorpe
  0 siblings, 0 replies; 12+ messages in thread
From: Jason Gunthorpe @ 2021-07-29 18:39 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Doug Ledford, Aharon Landau, Jakub Kicinski, Jason Wang,
	linux-kernel, linux-rdma, Michael S. Tsirkin, netdev,
	Saeed Mahameed, Shay Drory, virtualization

On Tue, Jun 22, 2021 at 03:08:20PM +0300, Leon Romanovsky wrote:

> diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> index 7bb35a3d8004..af11a0d8ebc0 100644
> +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> @@ -634,9 +634,19 @@ struct mlx5_user_mmap_entry {
>  #define mlx5_update_odp_stats(mr, counter_name, value)		\
>  	atomic64_add(value, &((mr)->odp_stats.counter_name))
>  
> +struct mlx5r_mkey {

Not mlx5_ib_mkey? mlx5_ib_odp_mkey might capture the intention of
what this is actually for.

> +	u64			iova;

IOVA is already stored in ib_mr->iova, no need to duplicate it here.

> +	u64			size;

Only one place reads size in mlx5_ib_create_xlt_wr(), and it can be
mr->ibmr.length, so delete size

> +	u32			key;
> +	u32			pd;

Lots of places write to this but nothing reads it, delete it.

> +	u32			type;

Please drop the horizontal spacing

type should be a proper enum not u32 and the values should be moved
out of driver.h as well.

ndescs should probably be added here instead of in the containing
structs since ODP needs it generically.

This patch and the one before are a good cleanup on their own so they
can get applied when they are fixed up enough. Each of the above
changes to remove fields in the mlx5r_mkey struct should be single
patches so this will little series will grow two more patches.

Jason

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH mlx5-next 3/5] RDMA/mlx5: Change the cache to hold mkeys instead of MRs
  2021-06-22 12:08 ` [PATCH mlx5-next 3/5] RDMA/mlx5: Change the cache to hold mkeys instead of MRs Leon Romanovsky
@ 2021-07-29 19:08   ` Jason Gunthorpe
  0 siblings, 0 replies; 12+ messages in thread
From: Jason Gunthorpe @ 2021-07-29 19:08 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Doug Ledford, Aharon Landau, Jakub Kicinski, Jason Wang,
	linux-kernel, linux-rdma, Michael S. Tsirkin, netdev,
	Saeed Mahameed, Shay Drory, virtualization

On Tue, Jun 22, 2021 at 03:08:21PM +0300, Leon Romanovsky wrote:
> From: Aharon Landau <aharonl@nvidia.com>
> 
> Today the cache is an MR-cache, however, all members of MR, except for
> mkey, are not being used in the cache.
> Therefore, changing it to an mkey-cache so that the cache has its own
> memory and holds only the values needed for the cache.

This patch is quite big and seems to be doing a lot more than just
this

Frankly, I'm not sure what it is trying to do

> diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> index af11a0d8ebc0..ffb6f1d41f3d 100644
> +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> @@ -634,6 +634,15 @@ struct mlx5_user_mmap_entry {
>  #define mlx5_update_odp_stats(mr, counter_name, value)		\
>  	atomic64_add(value, &((mr)->odp_stats.counter_name))
>  
> +struct mlx5r_cache_mkey {
> +	u32 key;
> +	struct mlx5_cache_ent *cache_ent;
> +	u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
> +	struct mlx5_async_work cb_work;
> +	/* Cache list element */
> +	struct list_head list;
> +};

This is the point, right? Lift these members out of the mlx5_ib_mr?

But out abd cb_work shouldn't be stored in perpetuity in the cache, it
is only needed short-term as part of the callback for async mkey
creation.

This should also be organized to not have so many alignment holes

Actually the only thing it does is store a u32 attached to each rbtree
so this looks like rather a lot of memory overhead, plus the
kfree/allocs.

I'd probably do this with an xarray on the mlx5_cache_ent
instead. Store the 'tail index' and adding is
'xa_insert(tail_index++)' and removing is 'xa_erase(tail_index--)'

Use xa_mk_value() and I think we have less than 31 bits of mkey,
right?

>  static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
>  {
>  	return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem &&
> @@ -763,16 +758,16 @@ struct mlx5_cache_ent {
>  	u8 fill_to_high_water:1;
>  
>  	/*
> -	 * - available_mrs is the length of list head, ie the number of MRs
> +	 * - available_mkeys is the length of list head, ie the number of Mkeys
>  	 *   available for immediate allocation.
> -	 * - total_mrs is available_mrs plus all in use MRs that could be
> +	 * - total_mkeys is available_mkeys plus all in use Mkeys that could be
>  	 *   returned to the cache.
> -	 * - limit is the low water mark for available_mrs, 2* limit is the
> +	 * - limit is the low water mark for available_mkeys, 2* limit is the
>  	 *   upper water mark.
> -	 * - pending is the number of MRs currently being created
> +	 * - pending is the number of Mkeys currently being created
>  	 */
> -	u32 total_mrs;
> -	u32 available_mrs;
> +	u32 total_mkeys;
> +	u32 available_mkeys;
>  	u32 limit;
>  	u32 pending;

Put all the renaming in another patch, maybe as the last patch in the
series and do everything. Much too hard to read when renaming is
muddled with logic changes

Jason

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH mlx5-next 4/5] RDMA/mlx5: Change the cache structure to an rbtree
  2021-06-22 12:08 ` [PATCH mlx5-next 4/5] RDMA/mlx5: Change the cache structure to an rbtree Leon Romanovsky
@ 2021-07-29 19:45   ` Jason Gunthorpe
  0 siblings, 0 replies; 12+ messages in thread
From: Jason Gunthorpe @ 2021-07-29 19:45 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Doug Ledford, Aharon Landau, Jakub Kicinski, Jason Wang,
	linux-kernel, linux-rdma, Michael S. Tsirkin, netdev,
	Saeed Mahameed, Shay Drory, virtualization


> diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> index ffb6f1d41f3d..e22eeceae9eb 100644
> +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> @@ -749,7 +749,7 @@ struct mlx5_cache_ent {
>  
>  
>  	char                    name[4];
> -	u32                     order;
> +	u32			order;
>  	u32			xlt;
>  	u32			access_mode;
>  	u32			page;

Looking at this, it looks like it will be a lot simpler to just store
the reference mkc here and use that whole blob as the rb key and write
a slightly special compare function.. Maybe only the ndescs needs to
be stored loose.

Then all the weirdness about special ents disappears, they are
naturally handled by having the required bits in their mkc.

And all the random encode/decode/recode scattered all over the place
goes away. Anyone working with mkeys needs to build a mkc on their
stack, then check if allocation of that mkc can be satisfied with the
cache, otherwise pass that same mkc to the alloc cmd. The one case
that uses the PAS will have to alloc a new mkc and memcpy, but that is
OK.

> +static struct mlx5_cache_ent *mkey_cache_ent_from_size(struct mlx5_ib_dev *dev,
> +						       int ent_flags, int size)
> +{
> +	struct rb_node *node = dev->cache.cache_root.rb_node;
> +	struct mlx5_cache_ent *cur, *prev = NULL;
> +
> +	WARN_ON(!mutex_is_locked(&dev->cache.cache_lock));

Yikes, no, use lockdep.

> @@ -616,13 +739,18 @@ struct mlx5_ib_mr *mlx5_alloc_special_mkey(struct mlx5_ib_dev *dev,
>  static struct mlx5r_cache_mkey *get_cache_mkey(struct mlx5_cache_ent *req_ent)
>  {
>  	struct mlx5_ib_dev *dev = req_ent->dev;
> -	struct mlx5_cache_ent *ent = req_ent;
>  	struct mlx5r_cache_mkey *cmkey;
> +	struct mlx5_cache_ent *ent;
> +	struct rb_node *node;
>  
>  	/* Try larger Mkey pools from the cache to satisfy the allocation */
> -	for (; ent != &dev->cache.ent[MKEY_CACHE_LAST_STD_ENTRY + 1]; ent++) {
> -		mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order,
> -			    ent - dev->cache.ent);
> +	mutex_lock(&dev->cache.cache_lock);
> +	for (node = &req_ent->node; node; node = rb_next(node)) {
> +		ent = container_of(node, struct mlx5_cache_ent, node);

See, this should be 'search for the mkc I have for the lowest entry with size+1'

> -int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
> +int mlx5_mkey_cache_tree_init(struct mlx5_ib_dev *dev)
>  {
> -	struct mlx5_mkey_cache *cache = &dev->cache;
> +	struct mlx5_mkey_cache_tree *cache = &dev->cache;
>  	struct mlx5_cache_ent *ent;
> +	int err;
>  	int i;
>  
>  	mutex_init(&dev->slow_path_mutex);
> +	mutex_init(&cache->cache_lock);
> +	cache->cache_root = RB_ROOT;
>  	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
>  	if (!cache->wq) {
>  		mlx5_ib_warn(dev, "failed to create work queue\n");
> @@ -745,28 +882,25 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
>  
>  	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
>  	timer_setup(&dev->delay_timer, delay_time_func, 0);
> -	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
> -		ent = &cache->ent[i];
> -		INIT_LIST_HEAD(&ent->head);
> -		spin_lock_init(&ent->lock);
> -		ent->order = i + 2;
> -		ent->dev = dev;
> -		ent->limit = 0;
> -
> -		INIT_WORK(&ent->work, cache_work_func);
> -		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
> +	for (i = 0; i < MAX_MKEY_CACHE_DEFAULT_ENTRIES; i++) {
> +		u8 order = i + 2;
> +		u32 xlt_size = (1 << order) * sizeof(struct mlx5_mtt) /
> +			       MLX5_IB_UMR_OCTOWORD;

This should be written saner

for (xlt_size = MKEY_CACHE_DEFAULT_MIN_DESCS * sizeof(struct mlx5_mtt) / MLX5_IB_UMR_OCTOWORD; 
     xlt_size <= MKEY_CACHE_DEFAULT_MAX_DESCS *  sizeof(struct mlx5_mtt) / MLX5_IB_UMR_OCTOWORD; 
     xlt_size *= 2)

>  
>  		if (i > MKEY_CACHE_LAST_STD_ENTRY) {

The index in the cache should be meaningless now, so don't put this
code here.

> -			mlx5_odp_init_mkey_cache_entry(ent);
> +			err = mlx5_odp_init_mkey_cache_entry(dev, i);
> +			if (err)
> +				return err;
>  			continue;
>  		}

> -		if (ent->order > mkey_cache_max_order(dev))
> +		ent = mlx5_ib_create_cache_ent(dev, 0, xlt_size, order);

And why do we need to pass in order, why is it stored in the
cache_ent? Looks like it should be removed

The debugfs looks like it might need some rethink as is it can only
control the original buckets, the new buckets don't get exposed. Seems
like trouble.

If just exposing the legacy things is the idea then it should have the
same sweep over the parameter space as above, not just assume that the
rb tree is in order and only contains debugfs entries.

Probably change it to create the debugfs nodes at the same time the
cache entry itself is created.

> @@ -973,14 +1100,16 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
>  						     0, iova);
>  	if (WARN_ON(!page_size))
>  		return ERR_PTR(-EINVAL);
> -	ent = mkey_cache_ent_from_order(
> -		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
> +	ent_flags = mlx5_ent_access_flags(dev, access_flags);
> +	xlt_size = get_octo_len(iova, umem->length, order_base_2(page_size));
> +	mutex_lock(&dev->cache.cache_lock);
> +	ent = mkey_cache_ent_from_size(dev, ent_flags, xlt_size);

See here is where I wonder if it is just better to build the mkc on
the stack in one place instead of having all this stuff open coded all
over..

> -void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent)
> +int mlx5_odp_init_mkey_cache_entry(struct mlx5_ib_dev *dev, int ent_num)
>  {
> -	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
> -		return;
> +	struct mlx5_cache_ent *ent;
> +	int ent_flags;
> +	u32 xlt_size;
> +
> +	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
> +		return 0;
>  
> -	switch (ent->order - 2) {
> +	switch (ent_num) {
>  	case MLX5_IMR_MTT_CACHE_ENTRY:

Don't do stuff like this either. The mkc scheme will fix this too as
this will just create two mkcs for these unique usages and store them
normally.

Jason

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2021-07-29 19:45 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-22 12:08 [PATCH rdma-next 0/5] mlx5 MR cache enhancements Leon Romanovsky
2021-06-22 12:08 ` [PATCH mlx5-next 1/5] RDMA/mlx5: Replace struct mlx5_core_mkey by u32 key Leon Romanovsky
2021-07-29 15:28   ` Jason Gunthorpe
2021-07-29 17:27     ` Leon Romanovsky
2021-07-29 18:08   ` Jason Gunthorpe
2021-06-22 12:08 ` [PATCH mlx5-next 2/5] RDMA/mlx5: Move struct mlx5_core_mkey to mlx5_ib Leon Romanovsky
2021-07-29 18:39   ` Jason Gunthorpe
2021-06-22 12:08 ` [PATCH mlx5-next 3/5] RDMA/mlx5: Change the cache to hold mkeys instead of MRs Leon Romanovsky
2021-07-29 19:08   ` Jason Gunthorpe
2021-06-22 12:08 ` [PATCH mlx5-next 4/5] RDMA/mlx5: Change the cache structure to an rbtree Leon Romanovsky
2021-07-29 19:45   ` Jason Gunthorpe
2021-06-22 12:08 ` [PATCH rdma-next 5/5] RDMA/mlx5: Delay the deregistration of a non-cache mkey Leon Romanovsky

This is a public inbox, see mirroring instructions
on how to clone and mirror all data and code used for this inbox