Netdev Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Alexander Duyck <alexander.duyck@gmail.com>
To: Yunsheng Lin <linyunsheng@huawei.com>
Cc: David Miller <davem@davemloft.net>,
	Jakub Kicinski <kuba@kernel.org>,
	Russell King - ARM Linux <linux@armlinux.org.uk>,
	Marcin Wojtas <mw@semihalf.com>,
	linuxarm@openeuler.org, yisen.zhuang@huawei.com,
	Salil Mehta <salil.mehta@huawei.com>,
	thomas.petazzoni@bootlin.com, hawk@kernel.org,
	Ilias Apalodimas <ilias.apalodimas@linaro.org>,
	Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	John Fastabend <john.fastabend@gmail.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Will Deacon <will@kernel.org>,
	Matthew Wilcox <willy@infradead.org>,
	Vlastimil Babka <vbabka@suse.cz>,
	fenghua.yu@intel.com, guro@fb.com, Peter Xu <peterx@redhat.com>,
	Feng Tang <feng.tang@intel.com>, Jason Gunthorpe <jgg@ziepe.ca>,
	Matteo Croce <mcroce@microsoft.com>,
	Hugh Dickins <hughd@google.com>,
	Jonathan Lemon <jonathan.lemon@gmail.com>,
	Alexander Lobakin <alobakin@pm.me>,
	Willem de Bruijn <willemb@google.com>,
	wenxu@ucloud.cn, Cong Wang <cong.wang@bytedance.com>,
	Kevin Hao <haokexin@gmail.com>,
	nogikh@google.com, Marco Elver <elver@google.com>,
	Netdev <netdev@vger.kernel.org>,
	LKML <linux-kernel@vger.kernel.org>, bpf <bpf@vger.kernel.org>
Subject: Re: [PATCH rfc v2 4/5] page_pool: support page frag API for page pool
Date: Sat, 10 Jul 2021 10:43:12 -0700	[thread overview]
Message-ID: <CAKgT0UeDf1+-CjzsCo0Chtd2kn_mFV7=my1ygeKMBHBSdjrAHQ@mail.gmail.com> (raw)
In-Reply-To: <1625903002-31619-5-git-send-email-linyunsheng@huawei.com>

On Sat, Jul 10, 2021 at 12:44 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> Currently each desc use a whole page to do ping-pong page
> reusing in most driver. As the page pool has support page
> recycling based on elevated refcnt, it makes sense to add
> a page frag API in page pool to split a page to different
> frag to serve multi descriptions.
>
> This means a huge memory saving for kernel with page size of
> 64K, as a page can be used by 32 descriptions with 2k buffer
> size, comparing to each desc using one page currently.
>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/net/page_pool.h | 14 ++++++++++++++
>  net/core/page_pool.c    | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 63 insertions(+)
>
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index f0e708d..06a5e43 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -80,6 +80,7 @@ struct page_pool_params {
>         enum dma_data_direction dma_dir; /* DMA mapping direction */
>         unsigned int    max_len; /* max DMA sync memory size */
>         unsigned int    offset;  /* DMA addr offset */
> +       unsigned int    frag_size;
>  };
>
>  struct page_pool {
> @@ -91,6 +92,8 @@ struct page_pool {
>         unsigned long defer_warn;
>
>         u32 pages_state_hold_cnt;
> +       unsigned int frag_offset;
> +       struct page *frag_page;
>
>         /*
>          * Data structure for allocation side
> @@ -140,6 +143,17 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
>         return page_pool_alloc_pages(pool, gfp);
>  }
>
> +struct page *page_pool_alloc_frag(struct page_pool *pool,
> +                                 unsigned int *offset, gfp_t gfp);
> +
> +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
> +                                                   unsigned int *offset)
> +{
> +       gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
> +
> +       return page_pool_alloc_frag(pool, offset, gfp);
> +}
> +
>  /* get the stored dma direction. A driver might decide to treat this locally and
>   * avoid the extra cache line from page_pool to determine the direction
>   */
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index a87cbe1..b787033 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -350,6 +350,53 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
>  }
>  EXPORT_SYMBOL(page_pool_alloc_pages);
>
> +struct page *page_pool_alloc_frag(struct page_pool *pool,
> +                                 unsigned int *offset, gfp_t gfp)
> +{
> +       unsigned int frag_offset = pool->frag_offset;
> +       unsigned int frag_size = pool->p.frag_size;
> +       struct page *frag_page = pool->frag_page;
> +       unsigned int max_len = pool->p.max_len;
> +
> +       if (!frag_page || frag_offset + frag_size > max_len) {
> +               frag_page = page_pool_alloc_pages(pool, gfp);
> +               if (unlikely(!frag_page)) {
> +                       pool->frag_page = NULL;
> +                       return NULL;
> +               }
> +
> +               pool->frag_page = frag_page;
> +               frag_offset = 0;
> +
> +               page_pool_sub_bias(pool, frag_page,
> +                                  max_len / frag_size - 1);
> +       }
> +
> +       *offset = frag_offset;
> +       pool->frag_offset = frag_offset + frag_size;
> +
> +       return frag_page;
> +}
> +EXPORT_SYMBOL(page_pool_alloc_frag);

I'm still not a fan of the fixed implementation. For the cost of the
division as I said before you could make this flexible like
page_frag_alloc_align and just decrement the bias by one per
allocation instead of trying to batch it.

I'm sure there would likely be implementations that might need to
operate at two different sizes, for example a header and payload size.

> +static void page_pool_empty_frag(struct page_pool *pool)
> +{
> +       unsigned int frag_offset = pool->frag_offset;
> +       unsigned int frag_size = pool->p.frag_size;
> +       struct page *frag_page = pool->frag_page;
> +       unsigned int max_len = pool->p.max_len;
> +
> +       if (!frag_page)
> +               return;
> +
> +       while (frag_offset + frag_size <= max_len) {
> +               page_pool_put_full_page(pool, frag_page, false);
> +               frag_offset += frag_size;
> +       }
> +

Having to call this to free the page seems confusing. Rather than
reserving multiple and having to free the page multiple times I really
think you would be better off just holding one bias reservation on the
page at a time.

> +       pool->frag_page = NULL;
> +}
> +
>  /* Calculate distance between two u32 values, valid if distance is below 2^(31)
>   *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
>   */
> @@ -670,6 +717,8 @@ void page_pool_destroy(struct page_pool *pool)
>         if (!page_pool_put(pool))
>                 return;
>
> +       page_pool_empty_frag(pool);
> +
>         if (!page_pool_release(pool))
>                 return;
>
> --
> 2.7.4
>

  reply	other threads:[~2021-07-10 17:43 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-07-10  7:43 [PATCH rfc v2 0/5] add elevated refcnt support " Yunsheng Lin
2021-07-10  7:43 ` [PATCH rfc v2 1/5] page_pool: keep pp info as long as page pool owns the page Yunsheng Lin
2021-07-10  7:43 ` [PATCH rfc v2 2/5] page_pool: add interface for getting and setting pagecnt_bias Yunsheng Lin
2021-07-10 16:55   ` Alexander Duyck
2021-07-12  7:44     ` Yunsheng Lin
2021-07-10  7:43 ` [PATCH rfc v2 3/5] page_pool: add page recycling support based on elevated refcnt Yunsheng Lin
2021-07-10 17:24   ` Alexander Duyck
2021-07-12  7:54     ` Yunsheng Lin
2021-07-10 17:31   ` Alexander Duyck
2021-07-12  2:06     ` Yunsheng Lin
2021-07-12  3:30       ` Alexander Duyck
2021-07-10  7:43 ` [PATCH rfc v2 4/5] page_pool: support page frag API for page pool Yunsheng Lin
2021-07-10 17:43   ` Alexander Duyck [this message]
2021-07-12  7:57     ` Yunsheng Lin
2021-07-10  7:43 ` [PATCH rfc v2 5/5] net: hns3: support skb's frag page recycling based on " Yunsheng Lin
2021-07-10 15:40 ` [PATCH rfc v2 0/5] add elevated refcnt support for " Matteo Croce

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAKgT0UeDf1+-CjzsCo0Chtd2kn_mFV7=my1ygeKMBHBSdjrAHQ@mail.gmail.com' \
    --to=alexander.duyck@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=alobakin@pm.me \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=cong.wang@bytedance.com \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=elver@google.com \
    --cc=feng.tang@intel.com \
    --cc=fenghua.yu@intel.com \
    --cc=guro@fb.com \
    --cc=haokexin@gmail.com \
    --cc=hawk@kernel.org \
    --cc=hughd@google.com \
    --cc=ilias.apalodimas@linaro.org \
    --cc=jgg@ziepe.ca \
    --cc=john.fastabend@gmail.com \
    --cc=jonathan.lemon@gmail.com \
    --cc=kuba@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux@armlinux.org.uk \
    --cc=linuxarm@openeuler.org \
    --cc=linyunsheng@huawei.com \
    --cc=mcroce@microsoft.com \
    --cc=mw@semihalf.com \
    --cc=netdev@vger.kernel.org \
    --cc=nogikh@google.com \
    --cc=peterx@redhat.com \
    --cc=peterz@infradead.org \
    --cc=salil.mehta@huawei.com \
    --cc=thomas.petazzoni@bootlin.com \
    --cc=vbabka@suse.cz \
    --cc=wenxu@ucloud.cn \
    --cc=will@kernel.org \
    --cc=willemb@google.com \
    --cc=willy@infradead.org \
    --cc=yisen.zhuang@huawei.com \
    --subject='Re: [PATCH rfc v2 4/5] page_pool: support page frag API for page pool' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).