Netdev Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH net-next] tcp: tweak len/truesize ratio for coalesce candidates
@ 2021-07-21 10:15 Eric Dumazet
  2021-07-21 13:26 ` Soheil Hassas Yeganeh
  0 siblings, 1 reply; 3+ messages in thread
From: Eric Dumazet @ 2021-07-21 10:15 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski
  Cc: netdev, Eric Dumazet, Eric Dumazet, Soheil Hassas Yeganeh,
	Neal Cardwell, Yuchung Cheng

From: Eric Dumazet <edumazet@google.com>

tcp_grow_window() is using skb->len/skb->truesize to increase tp->rcv_ssthresh
which has a direct impact on advertized window sizes.

We added TCP coalescing in linux-3.4 & linux-3.5:

Instead of storing skbs with one or two MSS in receive queue (or OFO queue),
we try to append segments together to reduce memory overhead.

High performance network drivers tend to cook skb with 3 parts :

1) sk_buff structure (256 bytes)
2) skb->head contains room to copy headers as needed, and skb_shared_info
3) page fragment(s) containing the ~1514 bytes frame (or more depending on MTU)

Once coalesced into a previous skb, 1) and 2) are freed.

We can therefore tweak the way we compute len/truesize ratio knowing
that skb->truesize is inflated by 1) and 2) soon to be freed.

This is done only for in-order skb, or skb coalesced into OFO queue.

The result is that low rate flows no longer pay the memory price of having
low GRO aggregation factor. Same result for drivers not using GRO.

This is critical to allow a big enough receiver window,
typically tcp_rmem[2] / 2.

We have been using this at Google for about 5 years, it is due time
to make it upstream.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
---
 net/ipv4/tcp_input.c | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bef2c8b64d83a0f3d4cca90f9b12912bf3d00807..501d8d4d4ba46f9a5de322ab690c320757e0990c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -454,11 +454,12 @@ static void tcp_sndbuf_expand(struct sock *sk)
  */
 
 /* Slow part of check#2. */
-static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
+static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
+			     unsigned int skbtruesize)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	/* Optimize this! */
-	int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
+	int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
 	int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
 
 	while (tp->rcv_ssthresh <= window) {
@@ -471,7 +472,27 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
 	return 0;
 }
 
-static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
+/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
+ * can play nice with us, as sk_buff and skb->head might be either
+ * freed or shared with up to MAX_SKB_FRAGS segments.
+ * Only give a boost to drivers using page frag(s) to hold the frame(s),
+ * and if no payload was pulled in skb->head before reaching us.
+ */
+static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
+{
+	u32 truesize = skb->truesize;
+
+	if (adjust && !skb_headlen(skb)) {
+		truesize -= SKB_TRUESIZE(skb_end_offset(skb));
+		/* paranoid check, some drivers might be buggy */
+		if (unlikely((int)truesize < (int)skb->len))
+			truesize = skb->truesize;
+	}
+	return truesize;
+}
+
+static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
+			    bool adjust)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int room;
@@ -480,15 +501,16 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
 
 	/* Check #1 */
 	if (room > 0 && !tcp_under_memory_pressure(sk)) {
+		unsigned int truesize = truesize_adjust(adjust, skb);
 		int incr;
 
 		/* Check #2. Increase window, if skb with such overhead
 		 * will fit to rcvbuf in future.
 		 */
-		if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
+		if (tcp_win_from_space(sk, truesize) <= skb->len)
 			incr = 2 * tp->advmss;
 		else
-			incr = __tcp_grow_window(sk, skb);
+			incr = __tcp_grow_window(sk, skb, truesize);
 
 		if (incr) {
 			incr = max_t(int, incr, 2 * skb->len);
@@ -782,7 +804,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 	tcp_ecn_check_ce(sk, skb);
 
 	if (skb->len >= 128)
-		tcp_grow_window(sk, skb);
+		tcp_grow_window(sk, skb, true);
 }
 
 /* Called to compute a smoothed rtt estimate. The data fed to this
@@ -4769,7 +4791,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		 * and trigger fast retransmit.
 		 */
 		if (tcp_is_sack(tp))
-			tcp_grow_window(sk, skb);
+			tcp_grow_window(sk, skb, true);
 		kfree_skb_partial(skb, fragstolen);
 		skb = NULL;
 		goto add_sack;
@@ -4857,7 +4879,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		 * and trigger fast retransmit.
 		 */
 		if (tcp_is_sack(tp))
-			tcp_grow_window(sk, skb);
+			tcp_grow_window(sk, skb, false);
 		skb_condense(skb);
 		skb_set_owner_r(skb, sk);
 	}
-- 
2.32.0.402.g57bb445576-goog


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH net-next] tcp: tweak len/truesize ratio for coalesce candidates
  2021-07-21 10:15 [PATCH net-next] tcp: tweak len/truesize ratio for coalesce candidates Eric Dumazet
@ 2021-07-21 13:26 ` Soheil Hassas Yeganeh
  2021-07-21 16:24   ` Yuchung Cheng
  0 siblings, 1 reply; 3+ messages in thread
From: Soheil Hassas Yeganeh @ 2021-07-21 13:26 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, netdev, Eric Dumazet,
	Neal Cardwell, Yuchung Cheng

On Wed, Jul 21, 2021 at 6:15 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> From: Eric Dumazet <edumazet@google.com>
>
> tcp_grow_window() is using skb->len/skb->truesize to increase tp->rcv_ssthresh
> which has a direct impact on advertized window sizes.
>
> We added TCP coalescing in linux-3.4 & linux-3.5:
>
> Instead of storing skbs with one or two MSS in receive queue (or OFO queue),
> we try to append segments together to reduce memory overhead.
>
> High performance network drivers tend to cook skb with 3 parts :
>
> 1) sk_buff structure (256 bytes)
> 2) skb->head contains room to copy headers as needed, and skb_shared_info
> 3) page fragment(s) containing the ~1514 bytes frame (or more depending on MTU)
>
> Once coalesced into a previous skb, 1) and 2) are freed.
>
> We can therefore tweak the way we compute len/truesize ratio knowing
> that skb->truesize is inflated by 1) and 2) soon to be freed.
>
> This is done only for in-order skb, or skb coalesced into OFO queue.
>
> The result is that low rate flows no longer pay the memory price of having
> low GRO aggregation factor. Same result for drivers not using GRO.
>
> This is critical to allow a big enough receiver window,
> typically tcp_rmem[2] / 2.
>
> We have been using this at Google for about 5 years, it is due time
> to make it upstream.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Soheil Hassas Yeganeh <soheil@google.com>
> Cc: Neal Cardwell <ncardwell@google.com>
> Cc: Yuchung Cheng <ycheng@google.com>

Acked-by: Soheil Hassas Yeganeh <soheil@google.com>

Thank you, Eric!

> ---
>  net/ipv4/tcp_input.c | 38 ++++++++++++++++++++++++++++++--------
>  1 file changed, 30 insertions(+), 8 deletions(-)
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index bef2c8b64d83a0f3d4cca90f9b12912bf3d00807..501d8d4d4ba46f9a5de322ab690c320757e0990c 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -454,11 +454,12 @@ static void tcp_sndbuf_expand(struct sock *sk)
>   */
>
>  /* Slow part of check#2. */
> -static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
> +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
> +                            unsigned int skbtruesize)
>  {
>         struct tcp_sock *tp = tcp_sk(sk);
>         /* Optimize this! */
> -       int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
> +       int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
>         int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
>
>         while (tp->rcv_ssthresh <= window) {
> @@ -471,7 +472,27 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
>         return 0;
>  }
>
> -static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
> +/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
> + * can play nice with us, as sk_buff and skb->head might be either
> + * freed or shared with up to MAX_SKB_FRAGS segments.
> + * Only give a boost to drivers using page frag(s) to hold the frame(s),
> + * and if no payload was pulled in skb->head before reaching us.
> + */
> +static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
> +{
> +       u32 truesize = skb->truesize;
> +
> +       if (adjust && !skb_headlen(skb)) {
> +               truesize -= SKB_TRUESIZE(skb_end_offset(skb));
> +               /* paranoid check, some drivers might be buggy */
> +               if (unlikely((int)truesize < (int)skb->len))
> +                       truesize = skb->truesize;
> +       }
> +       return truesize;
> +}
> +
> +static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
> +                           bool adjust)
>  {
>         struct tcp_sock *tp = tcp_sk(sk);
>         int room;
> @@ -480,15 +501,16 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
>
>         /* Check #1 */
>         if (room > 0 && !tcp_under_memory_pressure(sk)) {
> +               unsigned int truesize = truesize_adjust(adjust, skb);
>                 int incr;
>
>                 /* Check #2. Increase window, if skb with such overhead
>                  * will fit to rcvbuf in future.
>                  */
> -               if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
> +               if (tcp_win_from_space(sk, truesize) <= skb->len)
>                         incr = 2 * tp->advmss;
>                 else
> -                       incr = __tcp_grow_window(sk, skb);
> +                       incr = __tcp_grow_window(sk, skb, truesize);
>
>                 if (incr) {
>                         incr = max_t(int, incr, 2 * skb->len);
> @@ -782,7 +804,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
>         tcp_ecn_check_ce(sk, skb);
>
>         if (skb->len >= 128)
> -               tcp_grow_window(sk, skb);
> +               tcp_grow_window(sk, skb, true);
>  }
>
>  /* Called to compute a smoothed rtt estimate. The data fed to this
> @@ -4769,7 +4791,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
>                  * and trigger fast retransmit.
>                  */
>                 if (tcp_is_sack(tp))
> -                       tcp_grow_window(sk, skb);
> +                       tcp_grow_window(sk, skb, true);
>                 kfree_skb_partial(skb, fragstolen);
>                 skb = NULL;
>                 goto add_sack;
> @@ -4857,7 +4879,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
>                  * and trigger fast retransmit.
>                  */
>                 if (tcp_is_sack(tp))
> -                       tcp_grow_window(sk, skb);
> +                       tcp_grow_window(sk, skb, false);
>                 skb_condense(skb);
>                 skb_set_owner_r(skb, sk);
>         }
> --
> 2.32.0.402.g57bb445576-goog
>

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH net-next] tcp: tweak len/truesize ratio for coalesce candidates
  2021-07-21 13:26 ` Soheil Hassas Yeganeh
@ 2021-07-21 16:24   ` Yuchung Cheng
  0 siblings, 0 replies; 3+ messages in thread
From: Yuchung Cheng @ 2021-07-21 16:24 UTC (permalink / raw)
  To: Soheil Hassas Yeganeh
  Cc: Eric Dumazet, David S . Miller, Jakub Kicinski, netdev,
	Eric Dumazet, Neal Cardwell

On Wed, Jul 21, 2021 at 9:27 AM Soheil Hassas Yeganeh <soheil@google.com> wrote:
>
> On Wed, Jul 21, 2021 at 6:15 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > From: Eric Dumazet <edumazet@google.com>
> >
> > tcp_grow_window() is using skb->len/skb->truesize to increase tp->rcv_ssthresh
> > which has a direct impact on advertized window sizes.
> >
> > We added TCP coalescing in linux-3.4 & linux-3.5:
> >
> > Instead of storing skbs with one or two MSS in receive queue (or OFO queue),
> > we try to append segments together to reduce memory overhead.
> >
> > High performance network drivers tend to cook skb with 3 parts :
> >
> > 1) sk_buff structure (256 bytes)
> > 2) skb->head contains room to copy headers as needed, and skb_shared_info
> > 3) page fragment(s) containing the ~1514 bytes frame (or more depending on MTU)
> >
> > Once coalesced into a previous skb, 1) and 2) are freed.
> >
> > We can therefore tweak the way we compute len/truesize ratio knowing
> > that skb->truesize is inflated by 1) and 2) soon to be freed.
> >
> > This is done only for in-order skb, or skb coalesced into OFO queue.
> >
> > The result is that low rate flows no longer pay the memory price of having
> > low GRO aggregation factor. Same result for drivers not using GRO.
> >
> > This is critical to allow a big enough receiver window,
> > typically tcp_rmem[2] / 2.
> >
> > We have been using this at Google for about 5 years, it is due time
> > to make it upstream.
> >
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > Cc: Soheil Hassas Yeganeh <soheil@google.com>
> > Cc: Neal Cardwell <ncardwell@google.com>
> > Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>



>
> Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
>
> Thank you, Eric!
>
> > ---
> >  net/ipv4/tcp_input.c | 38 ++++++++++++++++++++++++++++++--------
> >  1 file changed, 30 insertions(+), 8 deletions(-)
> >
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index bef2c8b64d83a0f3d4cca90f9b12912bf3d00807..501d8d4d4ba46f9a5de322ab690c320757e0990c 100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -454,11 +454,12 @@ static void tcp_sndbuf_expand(struct sock *sk)
> >   */
> >
> >  /* Slow part of check#2. */
> > -static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
> > +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
> > +                            unsigned int skbtruesize)
> >  {
> >         struct tcp_sock *tp = tcp_sk(sk);
> >         /* Optimize this! */
> > -       int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
> > +       int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
> >         int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
> >
> >         while (tp->rcv_ssthresh <= window) {
> > @@ -471,7 +472,27 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
> >         return 0;
> >  }
> >
> > -static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
> > +/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
> > + * can play nice with us, as sk_buff and skb->head might be either
> > + * freed or shared with up to MAX_SKB_FRAGS segments.
> > + * Only give a boost to drivers using page frag(s) to hold the frame(s),
> > + * and if no payload was pulled in skb->head before reaching us.
> > + */
> > +static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
> > +{
> > +       u32 truesize = skb->truesize;
> > +
> > +       if (adjust && !skb_headlen(skb)) {
> > +               truesize -= SKB_TRUESIZE(skb_end_offset(skb));
> > +               /* paranoid check, some drivers might be buggy */
> > +               if (unlikely((int)truesize < (int)skb->len))
> > +                       truesize = skb->truesize;
> > +       }
> > +       return truesize;
> > +}
> > +
> > +static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
> > +                           bool adjust)
> >  {
> >         struct tcp_sock *tp = tcp_sk(sk);
> >         int room;
> > @@ -480,15 +501,16 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
> >
> >         /* Check #1 */
> >         if (room > 0 && !tcp_under_memory_pressure(sk)) {
> > +               unsigned int truesize = truesize_adjust(adjust, skb);
> >                 int incr;
> >
> >                 /* Check #2. Increase window, if skb with such overhead
> >                  * will fit to rcvbuf in future.
> >                  */
> > -               if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
> > +               if (tcp_win_from_space(sk, truesize) <= skb->len)
> >                         incr = 2 * tp->advmss;
> >                 else
> > -                       incr = __tcp_grow_window(sk, skb);
> > +                       incr = __tcp_grow_window(sk, skb, truesize);
> >
> >                 if (incr) {
> >                         incr = max_t(int, incr, 2 * skb->len);
> > @@ -782,7 +804,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
> >         tcp_ecn_check_ce(sk, skb);
> >
> >         if (skb->len >= 128)
> > -               tcp_grow_window(sk, skb);
> > +               tcp_grow_window(sk, skb, true);
> >  }
> >
> >  /* Called to compute a smoothed rtt estimate. The data fed to this
> > @@ -4769,7 +4791,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
> >                  * and trigger fast retransmit.
> >                  */
> >                 if (tcp_is_sack(tp))
> > -                       tcp_grow_window(sk, skb);
> > +                       tcp_grow_window(sk, skb, true);
> >                 kfree_skb_partial(skb, fragstolen);
> >                 skb = NULL;
> >                 goto add_sack;
> > @@ -4857,7 +4879,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
> >                  * and trigger fast retransmit.
> >                  */
> >                 if (tcp_is_sack(tp))
> > -                       tcp_grow_window(sk, skb);
> > +                       tcp_grow_window(sk, skb, false);
> >                 skb_condense(skb);
> >                 skb_set_owner_r(skb, sk);
> >         }
> > --
> > 2.32.0.402.g57bb445576-goog
> >

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-07-21 16:25 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-21 10:15 [PATCH net-next] tcp: tweak len/truesize ratio for coalesce candidates Eric Dumazet
2021-07-21 13:26 ` Soheil Hassas Yeganeh
2021-07-21 16:24   ` Yuchung Cheng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).