Netdev Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Martin KaFai Lau <kafai@fb.com>
To: <bpf@vger.kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>,
	Andrii Nakryiko <andrii@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>, <kernel-team@fb.com>,
	<netdev@vger.kernel.org>
Subject: [PATCH bpf-next 1/4] bpf: tcp: Allow bpf-tcp-cc to call bpf_(get|set)sockopt
Date: Wed, 4 Aug 2021 22:01:25 -0700	[thread overview]
Message-ID: <20210805050125.1349441-1-kafai@fb.com> (raw)
In-Reply-To: <20210805050119.1349009-1-kafai@fb.com>

This patch allows the bpf-tcp-cc to call bpf_setsockopt.  One use
case is to allow a bpf-tcp-cc switching to another cc during init().
For example, when the tcp flow is not ecn ready, the bpf_dctcp
can switch to another cc by calling setsockopt(TCP_CONGESTION).

During setsockopt(TCP_CONGESTION), the new tcp-cc's init() will be
called and this could cause a recursion but it is stopped by the
current trampoline's logic (in the prog->active counter).

While retiring a bpf-tcp-cc (e.g. in tcp_v[46]_destroy_sock()),
the tcp stack calls bpf-tcp-cc's release().  To avoid the retiring
bpf-tcp-cc making further changes to the sk, bpf_setsockopt is not
available to the bpf-tcp-cc's release().  This will avoid release()
making setsockopt() call that will potentially allocate new resources.

bpf_getsockopt() is also added to have a symmetrical API, so
less usage surprise.

When the old bpf-tcp-cc is calling setsockopt(TCP_CONGESTION)
to switch to a new cc, the old bpf-tcp-cc will be released by
bpf_struct_ops_put().  Thus, this patch also puts the bpf_struct_ops_map
after a rcu grace period because the trampoline's image cannot be freed
while the old bpf-tcp-cc is still running.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 kernel/bpf/bpf_struct_ops.c | 22 +++++++++++++++++++++-
 net/ipv4/bpf_tcp_ca.c       | 26 +++++++++++++++++++++++---
 2 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 70f6fd4fa305..d6731c32864e 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -28,6 +28,7 @@ struct bpf_struct_ops_value {
 
 struct bpf_struct_ops_map {
 	struct bpf_map map;
+	struct rcu_head rcu;
 	const struct bpf_struct_ops *st_ops;
 	/* protect map_update */
 	struct mutex lock;
@@ -622,6 +623,14 @@ bool bpf_struct_ops_get(const void *kdata)
 	return refcount_inc_not_zero(&kvalue->refcnt);
 }
 
+static void bpf_struct_ops_put_rcu(struct rcu_head *head)
+{
+	struct bpf_struct_ops_map *st_map;
+
+	st_map = container_of(head, struct bpf_struct_ops_map, rcu);
+	bpf_map_put(&st_map->map);
+}
+
 void bpf_struct_ops_put(const void *kdata)
 {
 	struct bpf_struct_ops_value *kvalue;
@@ -632,6 +641,17 @@ void bpf_struct_ops_put(const void *kdata)
 
 		st_map = container_of(kvalue, struct bpf_struct_ops_map,
 				      kvalue);
-		bpf_map_put(&st_map->map);
+		/* The struct_ops's function may switch to another struct_ops.
+		 *
+		 * For example, bpf_tcp_cc_x->init() may switch to
+		 * another tcp_cc_y by calling
+		 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+		 * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called
+		 * and its map->refcnt may reach 0 which then free its
+		 * trampoline image while tcp_cc_x is still running.
+		 *
+		 * Thus, a rcu grace period is needed here.
+		 */
+		call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
 	}
 }
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 9e41eff4a685..00db0ce3af43 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -10,6 +10,9 @@
 #include <net/tcp.h>
 #include <net/bpf_sk_storage.h>
 
+/* "extern" is to avoid sparse warning.  It is only used in bpf_struct_ops.c. */
+extern struct bpf_struct_ops bpf_tcp_congestion_ops;
+
 static u32 optional_ops[] = {
 	offsetof(struct tcp_congestion_ops, init),
 	offsetof(struct tcp_congestion_ops, release),
@@ -167,6 +170,10 @@ static const struct bpf_func_proto *
 bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
 			  const struct bpf_prog *prog)
 {
+	const struct btf_member *m;
+	const struct btf_type *t;
+	u32 moff, midx;
+
 	switch (func_id) {
 	case BPF_FUNC_tcp_send_ack:
 		return &bpf_tcp_send_ack_proto;
@@ -174,6 +181,22 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
 		return &bpf_sk_storage_get_proto;
 	case BPF_FUNC_sk_storage_delete:
 		return &bpf_sk_storage_delete_proto;
+	case BPF_FUNC_setsockopt:
+		/* Does not allow release() to call setsockopt.
+		 * release() is called when the current bpf-tcp-cc
+		 * is retiring.  It is not allowed to call
+		 * setsockopt() to make further changes which
+		 * may potentially allocate new resources.
+		 */
+		midx = prog->expected_attach_type;
+		t = bpf_tcp_congestion_ops.type;
+		m = &btf_type_member(t)[midx];
+		moff = btf_member_bit_offset(t, m) / 8;
+		if (moff == offsetof(struct tcp_congestion_ops, release))
+			return NULL;
+		return &bpf_sk_setsockopt_proto;
+	case BPF_FUNC_getsockopt:
+		return &bpf_sk_getsockopt_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -286,9 +309,6 @@ static void bpf_tcp_ca_unreg(void *kdata)
 	tcp_unregister_congestion_control(kdata);
 }
 
-/* Avoid sparse warning.  It is only used in bpf_struct_ops.c. */
-extern struct bpf_struct_ops bpf_tcp_congestion_ops;
-
 struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.verifier_ops = &bpf_tcp_ca_verifier_ops,
 	.reg = bpf_tcp_ca_reg,
-- 
2.30.2


  reply	other threads:[~2021-08-05  5:01 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-08-05  5:01 [PATCH bpf-next 0/4] " Martin KaFai Lau
2021-08-05  5:01 ` Martin KaFai Lau [this message]
2021-08-05  5:01 ` [PATCH bpf-next 2/4] bpf: selftests: Add sk_state to bpf_tcp_helpers.h Martin KaFai Lau
2021-08-05  5:01 ` [PATCH bpf-next 3/4] bpf: selftests: Add connect_to_fd_opts to network_helpers Martin KaFai Lau
2021-08-05  5:01 ` [PATCH bpf-next 4/4] bpf: selftests: Add dctcp fallback test Martin KaFai Lau
2021-08-06 16:07   ` Daniel Borkmann
2021-08-06 17:42     ` Martin KaFai Lau

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210805050125.1349441-1-kafai@fb.com \
    --to=kafai@fb.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=kernel-team@fb.com \
    --cc=netdev@vger.kernel.org \
    --subject='Re: [PATCH bpf-next 1/4] bpf: tcp: Allow bpf-tcp-cc to call bpf_(get|set)sockopt' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).