aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin KaFai Lau <martin.lau@kernel.org>2023-03-22 22:49:40 -0700
committerMartin KaFai Lau <martin.lau@kernel.org>2023-03-22 22:53:27 -0700
commit226bc6ae6405c46a6e9865835c36a1d45fc0b3bf (patch)
tree4122c213c03490408526d5efbce58bc853c742f4
parentb63cbc490e18d893632929b8faa55bb28da3fcd4 (diff)
parent06da9f3bd6418e06719f15340202996f7a4c258d (diff)
downloadbpf-226bc6ae6405c46a6e9865835c36a1d45fc0b3bf.tar.gz
Merge branch 'Transit between BPF TCP congestion controls.'
Kui-Feng Lee says: ==================== Major changes: - Create bpf_links in the kernel for BPF struct_ops to register and unregister it. - Enable switching between implementations of bpf-tcp-cc under a name instantly by replacing the backing struct_ops map of a bpf_link. Previously, a BPF struct_ops did not go away even after the user program that created it had terminated, although none of them were ever pinned. For instance, the TCP congestion control subsystem indirectly maintains a reference count on the struct_ops of any registered BPF implemented algorithm. Thus, the algorithm won't be deactivated until someone deliberately unregisters it. For compatibility with other BPF programs, bpf_links have been created to work in coordination with struct_ops maps. This ensures that the registration and unregistration of these respective maps is carried out at the start and end of the bpf_link. We also faced complications when attempting to replace an existing TCP congestion control algorithm with a new implementation on the fly. A struct_ops map was used to register a TCP congestion control algorithm with a unique name. We had to either register the alternative implementation with a new name and move over or unregister the current one before being able to re-register with the same name. To fix this problem, we add an option to migrate the registration of the algorithm from struct_ops maps to bpf_links. By modifying the backing map of a bpf_link, it suddenly becomes possible to replace an existing TCP congestion control algorithm with ease. --- The major differences from v11: - Fix incorrectly setting both old_prog_fd and old_map_fd. The major differences from v10: - Add old_map_fd as an additional field instead of a union in bpf_link_update_opts. The major differences from v9: - Add test case for BPF_F_LINK. Includes adding old_map_fd to struct bpf_link_update_opts in patch 6. - Return -EPERM instead of -EINVAL when the old map fd doesn't match with BPF_F_LINK. 
- Fix -EBUSY case in bpf_map__attach_struct_ops(). The major differences from v8: - Check bpf_struct_ops::{validate,update} in bpf_struct_ops_map_alloc(). The major differences from v7: - Use synchronize_rcu_mult(call_rcu, call_rcu_tasks) to replace synchronize_rcu() and synchronize_rcu_tasks(). - Call synchronize_rcu() in tcp_update_congestion_control(). - Handle -EBUSY in bpf_map__attach_struct_ops() to allow a struct_ops to be used to create links more than once. Include a test case. - Add old_map_fd to bpf_attr and handle BPF_F_REPLACE in bpf_struct_ops_map_link_update(). - Remove changes in bpf_dummy_struct_ops.c and add a check of .update function pointer of bpf_struct_ops. The major differences from v6: - Reword commit logs of patches 1, 2, and 8. - Call synchronize_rcu_tasks() as well in bpf_struct_ops_map_free(). - Refactor bpf_struct_ops_map_free() so that bpf_struct_ops_map_alloc() can free a struct_ops without waiting for an RCU grace period. The major differences from v5: - Add a new step to bpf_object__load() to prepare vdata. - Accept BPF_F_REPLACE. - Check section IDs in find_struct_ops_map_by_offset(). - Add a test case to check mixing w/ and w/o link struct_ops. - Add a test case of using struct_ops w/o link to update a link. - Improve bpf_link__detach_struct_ops() to handle the w/ link case. The major differences from v4: - Rebase. - Reorder patches and merge part 4 to part 2 of the v4. The major differences from v3: - Remove bpf_struct_ops_map_free_rcu(), and use synchronize_rcu(). - Improve the commit log of the part 1. - Before transitioning to the READY state, we conduct a value check to ensure that struct_ops can be successfully utilized and links created later. The major differences from v2: - Simplify states - Remove TOBEUNREG. - Rename UNREG to READY. - Stop using the refcnt of the kvalue of a struct_ops. Explicitly increase and decrease the refcount of struct_ops. - Prepare kernel vdata during the load phase of libbpf. 
The major differences from v1: - Added bpf_struct_ops_link to replace the previous union-based approach. - Added UNREG and TOBEUNREG to the state of bpf_struct_ops_map. - bpf_struct_ops_transit_state() maintains state transitions. - Fixed synchronization issue. - Prepare kernel vdata of struct_ops during the loading phase of bpf_object. - Merged previous patch 3 to patch 1. v11: https://lore.kernel.org/all/20230323010409.2265383-1-kuifeng@meta.com/ v10: https://lore.kernel.org/all/20230321232813.3376064-1-kuifeng@meta.com/ v9: https://lore.kernel.org/all/20230320195644.1953096-1-kuifeng@meta.com/ v8: https://lore.kernel.org/all/20230318053144.1180301-1-kuifeng@meta.com/ v7: https://lore.kernel.org/all/20230316023641.2092778-1-kuifeng@meta.com/ v6: https://lore.kernel.org/all/20230310043812.3087672-1-kuifeng@meta.com/ v5: https://lore.kernel.org/all/20230308005050.255859-1-kuifeng@meta.com/ v4: https://lore.kernel.org/all/20230307232913.576893-1-andrii@kernel.org/ v3: https://lore.kernel.org/all/20230303012122.852654-1-kuifeng@meta.com/ v2: https://lore.kernel.org/bpf/20230223011238.12313-1-kuifeng@meta.com/ v1: https://lore.kernel.org/bpf/20230214221718.503964-1-kuifeng@meta.com/ ==================== Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
-rw-r--r--include/linux/bpf.h11
-rw-r--r--include/net/tcp.h3
-rw-r--r--include/uapi/linux/bpf.h33
-rw-r--r--kernel/bpf/bpf_struct_ops.c254
-rw-r--r--kernel/bpf/syscall.c63
-rw-r--r--net/ipv4/bpf_tcp_ca.c14
-rw-r--r--net/ipv4/tcp_cong.c66
-rw-r--r--tools/include/uapi/linux/bpf.h33
-rw-r--r--tools/lib/bpf/bpf.c8
-rw-r--r--tools/lib/bpf/bpf.h3
-rw-r--r--tools/lib/bpf/libbpf.c190
-rw-r--r--tools/lib/bpf/libbpf.h1
-rw-r--r--tools/lib/bpf/libbpf.map1
-rw-r--r--tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c160
-rw-r--r--tools/testing/selftests/bpf/progs/tcp_ca_update.c80
15 files changed, 817 insertions, 103 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ec0df059f5620f..2d8f3f639e680c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1476,6 +1476,8 @@ struct bpf_link_ops {
void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq);
int (*fill_link_info)(const struct bpf_link *link,
struct bpf_link_info *info);
+ int (*update_map)(struct bpf_link *link, struct bpf_map *new_map,
+ struct bpf_map *old_map);
};
struct bpf_tramp_link {
@@ -1518,6 +1520,8 @@ struct bpf_struct_ops {
void *kdata, const void *udata);
int (*reg)(void *kdata);
void (*unreg)(void *kdata);
+ int (*update)(void *kdata, void *old_kdata);
+ int (*validate)(void *kdata);
const struct btf_type *type;
const struct btf_type *value_type;
const char *name;
@@ -1552,6 +1556,7 @@ static inline void bpf_module_put(const void *data, struct module *owner)
else
module_put(owner);
}
+int bpf_struct_ops_link_create(union bpf_attr *attr);
#ifdef CONFIG_NET
/* Define it here to avoid the use of forward declaration */
@@ -1592,6 +1597,11 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map,
{
return -EINVAL;
}
+static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
#endif
#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
@@ -1945,6 +1955,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd);
struct bpf_map *__bpf_map_get(struct fd f);
void bpf_map_inc(struct bpf_map *map);
void bpf_map_inc_with_uref(struct bpf_map *map);
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
void bpf_map_put_with_uref(struct bpf_map *map);
void bpf_map_put(struct bpf_map *map);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index db9f828e9d1ee4..2abb755e6a3a76 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1117,6 +1117,9 @@ struct tcp_congestion_ops {
int tcp_register_congestion_control(struct tcp_congestion_ops *type);
void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
+int tcp_update_congestion_control(struct tcp_congestion_ops *type,
+ struct tcp_congestion_ops *old_type);
+int tcp_validate_congestion_control(struct tcp_congestion_ops *ca);
void tcp_assign_congestion_control(struct sock *sk);
void tcp_init_congestion_control(struct sock *sk);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 13129df937cde5..e3d3b5160d26f9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@ enum bpf_attach_type {
BPF_PERF_EVENT,
BPF_TRACE_KPROBE_MULTI,
BPF_LSM_CGROUP,
+ BPF_STRUCT_OPS,
__MAX_BPF_ATTACH_TYPE
};
@@ -1266,6 +1267,9 @@ enum {
/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
+
+/* Create a map that will be registered/unregistered by the backed bpf_link */
+ BPF_F_LINK = (1U << 13),
};
/* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@ union bpf_attr {
} task_fd_query;
struct { /* struct used by BPF_LINK_CREATE command */
- __u32 prog_fd; /* eBPF program to attach */
+ union {
+ __u32 prog_fd; /* eBPF program to attach */
+ __u32 map_fd; /* struct_ops to attach */
+ };
union {
__u32 target_fd; /* object to attach to */
__u32 target_ifindex; /* target ifindex */
@@ -1548,12 +1555,23 @@ union bpf_attr {
struct { /* struct used by BPF_LINK_UPDATE command */
__u32 link_fd; /* link fd */
- /* new program fd to update link with */
- __u32 new_prog_fd;
+ union {
+ /* new program fd to update link with */
+ __u32 new_prog_fd;
+ /* new struct_ops map fd to update link with */
+ __u32 new_map_fd;
+ };
__u32 flags; /* extra flags */
- /* expected link's program fd; is specified only if
- * BPF_F_REPLACE flag is set in flags */
- __u32 old_prog_fd;
+ union {
+ /* expected link's program fd; is specified only if
+ * BPF_F_REPLACE flag is set in flags.
+ */
+ __u32 old_prog_fd;
+ /* expected link's map fd; is specified only
+ * if BPF_F_REPLACE flag is set.
+ */
+ __u32 old_map_fd;
+ };
} link_update;
struct {
@@ -6379,6 +6397,9 @@ struct bpf_link_info {
struct {
__u32 ifindex;
} xdp;
+ struct {
+ __u32 map_id;
+ } struct_ops;
};
} __attribute__((aligned(8)));
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index ba7a94276e3b87..6401deca3b5656 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -11,11 +11,13 @@
#include <linux/refcount.h>
#include <linux/mutex.h>
#include <linux/btf_ids.h>
+#include <linux/rcupdate_wait.h>
enum bpf_struct_ops_state {
BPF_STRUCT_OPS_STATE_INIT,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE,
+ BPF_STRUCT_OPS_STATE_READY,
};
#define BPF_STRUCT_OPS_COMMON_VALUE \
@@ -58,6 +60,13 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_value kvalue;
};
+struct bpf_struct_ops_link {
+ struct bpf_link link;
+ struct bpf_map __rcu *map;
+};
+
+static DEFINE_MUTEX(update_mutex);
+
#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
@@ -249,6 +258,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
struct bpf_struct_ops_value *uvalue, *kvalue;
enum bpf_struct_ops_state state;
+ s64 refcnt;
if (unlikely(*(u32 *)key != 0))
return -ENOENT;
@@ -267,7 +277,14 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
uvalue = value;
memcpy(uvalue, st_map->uvalue, map->value_size);
uvalue->state = state;
- refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+ /* This value offers the user space a general estimate of how
+ * many sockets are still utilizing this struct_ops for TCP
+ * congestion control. The number might not be exact, but it
+ * should sufficiently meet our present goals.
+ */
+ refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+ refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
return 0;
}
@@ -491,12 +508,29 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*(unsigned long *)(udata + moff) = prog->aux->id;
}
- refcount_set(&kvalue->refcnt, 1);
- bpf_map_inc(map);
+ if (st_map->map.map_flags & BPF_F_LINK) {
+ err = st_ops->validate(kdata);
+ if (err)
+ goto reset_unlock;
+ set_memory_rox((long)st_map->image, 1);
+ /* Let bpf_link handle registration & unregistration.
+ *
+ * Pair with smp_load_acquire() during lookup_elem().
+ */
+ smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+ goto unlock;
+ }
set_memory_rox((long)st_map->image, 1);
err = st_ops->reg(kdata);
if (likely(!err)) {
+ /* This refcnt increment on the map here after
+ * 'st_ops->reg()' is secure since the state of the
+ * map must be set to INIT at this moment, and thus
+ * bpf_struct_ops_map_delete_elem() can't unregister
+ * or transition it to TOBEFREE concurrently.
+ */
+ bpf_map_inc(map);
/* Pair with smp_load_acquire() during lookup_elem().
* It ensures the above udata updates (e.g. prog->aux->id)
* can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
@@ -512,7 +546,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*/
set_memory_nx((long)st_map->image, 1);
set_memory_rw((long)st_map->image, 1);
- bpf_map_put(map);
reset_unlock:
bpf_struct_ops_map_put_progs(st_map);
@@ -530,14 +563,16 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
struct bpf_struct_ops_map *st_map;
st_map = (struct bpf_struct_ops_map *)map;
+ if (st_map->map.map_flags & BPF_F_LINK)
+ return -EOPNOTSUPP;
+
prev_state = cmpxchg(&st_map->kvalue.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
st_map->st_ops->unreg(&st_map->kvalue.data);
- if (refcount_dec_and_test(&st_map->kvalue.refcnt))
- bpf_map_put(map);
+ bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
return -EINPROGRESS;
@@ -570,7 +605,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
kfree(value);
}
-static void bpf_struct_ops_map_free(struct bpf_map *map)
+static void __bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
@@ -582,10 +617,32 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
bpf_map_area_free(st_map);
}
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its refcount may reach 0, which then frees its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * A vanilla RCU grace period waits for all bpf-tcp-cc progs
+ * to finish; a bpf-tcp-cc prog is non-sleepable.
+ * An rcu_tasks grace period waits for the last few insns
+ * in the trampoline image to finish before releasing
+ * the trampoline image.
+ */
+ synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+
+ __bpf_struct_ops_map_free(map);
+}
+
static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
- attr->map_flags || !attr->btf_vmlinux_value_type_id)
+ (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
return -EINVAL;
return 0;
}
@@ -609,6 +666,9 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
if (attr->value_size != vt->size)
return ERR_PTR(-EINVAL);
+ if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update))
+ return ERR_PTR(-EOPNOTSUPP);
+
t = st_ops->type;
st_map_size = sizeof(*st_map) +
@@ -630,7 +690,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
NUMA_NO_NODE);
st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!st_map->uvalue || !st_map->links || !st_map->image) {
- bpf_struct_ops_map_free(map);
+ __bpf_struct_ops_map_free(map);
return ERR_PTR(-ENOMEM);
}
@@ -676,41 +736,175 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
bool bpf_struct_ops_get(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
- return refcount_inc_not_zero(&kvalue->refcnt);
+ map = __bpf_map_inc_not_zero(&st_map->map, false);
+ return !IS_ERR(map);
}
-static void bpf_struct_ops_put_rcu(struct rcu_head *head)
+void bpf_struct_ops_put(const void *kdata)
{
+ struct bpf_struct_ops_value *kvalue;
struct bpf_struct_ops_map *st_map;
- st_map = container_of(head, struct bpf_struct_ops_map, rcu);
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
bpf_map_put(&st_map->map);
}
-void bpf_struct_ops_put(const void *kdata)
+static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
- struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
- kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
- if (refcount_dec_and_test(&kvalue->refcnt)) {
- struct bpf_struct_ops_map *st_map;
+ return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
+ map->map_flags & BPF_F_LINK &&
+ /* Pair with smp_store_release() during map_update */
+ smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
+}
- st_map = container_of(kvalue, struct bpf_struct_ops_map,
- kvalue);
- /* The struct_ops's function may switch to another struct_ops.
- *
- * For example, bpf_tcp_cc_x->init() may switch to
- * another tcp_cc_y by calling
- * setsockopt(TCP_CONGESTION, "tcp_cc_y").
- * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
- * and its map->refcnt may reach 0 which then free its
- * trampoline image while tcp_cc_x is still running.
- *
- * Thus, a rcu grace period is needed here.
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_struct_ops_map *st_map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = (struct bpf_struct_ops_map *)
+ rcu_dereference_protected(st_link->map, true);
+ if (st_map) {
+ /* st_link->map can be NULL if
+ * bpf_struct_ops_link_create() fails to register.
*/
- call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
+ st_map->st_ops->unreg(&st_map->kvalue.data);
+ bpf_map_put(&st_map->map);
}
+ kfree(st_link);
}
+
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock(); /* st_link->map is an __rcu pointer; read it in an RCU read-side section */
+ map = rcu_dereference(st_link->map);
+ seq_printf(seq, "map_id:\t%d\n", map->id); /* print the id of the map currently attached to the link */
+ rcu_read_unlock();
+}
+
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock(); /* st_link->map is an __rcu pointer; read it in an RCU read-side section */
+ map = rcu_dereference(st_link->map);
+ info->struct_ops.map_id = map->id; /* report the id of the map currently attached to the link */
+ rcu_read_unlock();
+ return 0; /* always succeeds */
+}
+
+static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
+ struct bpf_map *expected_old_map)
+{
+ struct bpf_struct_ops_map *st_map, *old_st_map;
+ struct bpf_map *old_map;
+ struct bpf_struct_ops_link *st_link;
+ int err = 0;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+
+ if (!bpf_struct_ops_valid_to_reg(new_map)) /* new map must be a BPF_F_LINK struct_ops map in READY state */
+ return -EINVAL;
+
+ mutex_lock(&update_mutex); /* held across the whole check-and-swap of st_link->map */
+
+ old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (expected_old_map && old_map != expected_old_map) { /* caller named an old map that is not the attached one */
+ err = -EPERM;
+ goto err_out;
+ }
+
+ old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+ /* The new and old struct_ops must be the same type. */
+ if (st_map->st_ops != old_st_map->st_ops) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+ if (err) /* on failure the link keeps pointing at old_map */
+ goto err_out;
+
+ bpf_map_inc(new_map); /* take a reference on the new map for the link */
+ rcu_assign_pointer(st_link->map, new_map);
+ bpf_map_put(old_map); /* release the reference on the replaced map */
+
+err_out:
+ mutex_unlock(&update_mutex);
+
+ return err; /* 0 on success, -EINVAL/-EPERM or the ->update() error otherwise */
+}
+
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+ .dealloc = bpf_struct_ops_map_link_dealloc,
+ .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+ .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+ .update_map = bpf_struct_ops_map_link_update,
+};
+
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+ struct bpf_struct_ops_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+ int err;
+
+ map = bpf_map_get(attr->link_create.map_fd);
+ if (IS_ERR(map)) /* bpf_map_get() reports failure as an error pointer, never NULL */
+ return PTR_ERR(map);
+
+ st_map = (struct bpf_struct_ops_map *)map;
+
+ if (!bpf_struct_ops_valid_to_reg(map)) { /* only a READY BPF_F_LINK struct_ops map can back a link */
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto err_out;
+
+ err = st_map->st_ops->reg(st_map->kvalue.data);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ link = NULL; /* NOTE(review): bpf_link_cleanup() presumably owns the link now; makes kfree() below a no-op */
+ goto err_out;
+ }
+ RCU_INIT_POINTER(link->map, map); /* on success the link keeps the map reference taken above */
+
+ return bpf_link_settle(&link_primer);
+
+err_out:
+ bpf_map_put(map); /* failure paths drop the reference from bpf_map_get() */
+ kfree(link);
+ return err;
+}
+
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 099e9068bcdd84..b4d758fa5981db 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1303,8 +1303,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
return map;
}
-/* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+/* map_idr_lock should have been held or the map should have been
+ * protected by rcu read lock.
+ */
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
int refold;
@@ -2823,16 +2825,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
const struct bpf_prog *prog = link->prog;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"link_type:\t%s\n"
- "link_id:\t%u\n"
- "prog_tag:\t%s\n"
- "prog_id:\t%u\n",
+ "link_id:\t%u\n",
bpf_link_type_strs[link->type],
- link->id,
- prog_tag,
- prog->aux->id);
+ link->id);
+ if (prog) {
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+ seq_printf(m,
+ "prog_tag:\t%s\n"
+ "prog_id:\t%u\n",
+ prog_tag,
+ prog->aux->id);
+ }
if (link->ops->show_fdinfo)
link->ops->show_fdinfo(link, m);
}
@@ -4312,7 +4317,8 @@ static int bpf_link_get_info_by_fd(struct file *file,
info.type = link->type;
info.id = link->id;
- info.prog_id = link->prog->aux->id;
+ if (link->prog)
+ info.prog_id = link->prog->aux->id;
if (link->ops->fill_link_info) {
err = link->ops->fill_link_info(link, &info);
@@ -4575,6 +4581,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
if (CHECK_ATTR(BPF_LINK_CREATE))
return -EINVAL;
+ if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+ return bpf_struct_ops_link_create(attr);
+
prog = bpf_prog_get(attr->link_create.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -4673,6 +4682,35 @@ out:
return ret;
}
+static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
+{
+ struct bpf_map *new_map, *old_map = NULL;
+ int ret;
+
+ new_map = bpf_map_get(attr->link_update.new_map_fd);
+ if (IS_ERR(new_map)) /* any lookup failure is reported as -EINVAL */
+ return -EINVAL;
+
+ if (attr->link_update.flags & BPF_F_REPLACE) { /* caller names the map it expects to replace */
+ old_map = bpf_map_get(attr->link_update.old_map_fd);
+ if (IS_ERR(old_map)) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+ } else if (attr->link_update.old_map_fd) { /* old_map_fd is only accepted together with BPF_F_REPLACE */
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ ret = link->ops->update_map(link, new_map, old_map); /* old_map is NULL when BPF_F_REPLACE is not set */
+
+ if (old_map)
+ bpf_map_put(old_map);
+out_put:
+ bpf_map_put(new_map); /* balance the bpf_map_get() on new_map_fd above */
+ return ret;
+}
+
#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
static int link_update(union bpf_attr *attr)
@@ -4693,6 +4731,11 @@ static int link_update(union bpf_attr *attr)
if (IS_ERR(link))
return PTR_ERR(link);
+ if (link->ops->update_map) {
+ ret = link_update_map(link, attr);
+ goto out_put_link;
+ }
+
new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
if (IS_ERR(new_prog)) {
ret = PTR_ERR(new_prog);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 13fc0c185cd922..e8b27826283ead 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -239,8 +239,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,
sizeof(tcp_ca->name)) <= 0)
return -EINVAL;
- if (tcp_ca_find(utcp_ca->name))
- return -EEXIST;
return 1;
}
@@ -266,13 +264,25 @@ static void bpf_tcp_ca_unreg(void *kdata)
tcp_unregister_congestion_control(kdata);
}
+static int bpf_tcp_ca_update(void *kdata, void *old_kdata)
+{
+ return tcp_update_congestion_control(kdata, old_kdata);
+}
+
+static int bpf_tcp_ca_validate(void *kdata)
+{
+ return tcp_validate_congestion_control(kdata);
+}
+
struct bpf_struct_ops bpf_tcp_congestion_ops = {
.verifier_ops = &bpf_tcp_ca_verifier_ops,
.reg = bpf_tcp_ca_reg,
.unreg = bpf_tcp_ca_unreg,
+ .update = bpf_tcp_ca_update,
.check_member = bpf_tcp_ca_check_member,
.init_member = bpf_tcp_ca_init_member,
.init = bpf_tcp_ca_init,
+ .validate = bpf_tcp_ca_validate,
.name = "tcp_congestion_ops",
};
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index db8b4b488c314b..1b34050a7538be 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -75,14 +75,8 @@ struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
return NULL;
}
-/*
- * Attach new congestion control algorithm to the list
- * of available options.
- */
-int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
{
- int ret = 0;
-
/* all algorithms must implement these */
if (!ca->ssthresh || !ca->undo_cwnd ||
!(ca->cong_avoid || ca->cong_control)) {
@@ -90,6 +84,20 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
return -EINVAL;
}
+ return 0;
+}
+
+/* Attach new congestion control algorithm to the list
+ * of available options.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+ int ret;
+
+ ret = tcp_validate_congestion_control(ca);
+ if (ret)
+ return ret;
+
ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
spin_lock(&tcp_cong_list_lock);
@@ -130,6 +138,50 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
+/* Replace a registered old ca with a new one.
+ *
+ * The new ca must have the same name as the old one, which must already
+ * be registered. Returns 0 on success or a negative errno on failure.
+ */
+int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
+{
+ struct tcp_congestion_ops *existing;
+ int ret;
+
+ ret = tcp_validate_congestion_control(ca);
+ if (ret)
+ return ret;
+
+ ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
+ spin_lock(&tcp_cong_list_lock);
+ existing = tcp_ca_find_key(old_ca->key); /* look up the list entry registered under the old key */
+ if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) {
+ pr_notice("%s not registered or non-unique key\n",
+ ca->name);
+ ret = -EINVAL;
+ } else if (existing != old_ca) { /* same key and name, but a different object than the caller passed */
+ pr_notice("invalid old congestion control algorithm to replace\n");
+ ret = -EINVAL;
+ } else {
+ /* Add the new one before removing the old one to keep
+ * one implementation available all the time.
+ */
+ list_add_tail_rcu(&ca->list, &tcp_cong_list);
+ list_del_rcu(&existing->list);
+ pr_debug("%s updated\n", ca->name);
+ }
+ spin_unlock(&tcp_cong_list_lock);
+
+ /* Wait for outstanding readers to complete before the
+ * module or struct_ops gets removed entirely.
+ */
+ if (!ret) /* only a successful swap needs the grace period */
+ synchronize_rcu();
+
+ return ret;
+}
+
u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
{
const struct tcp_congestion_ops *ca;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 13129df937cde5..d6c5a022ae28dd 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@ enum bpf_attach_type {
BPF_PERF_EVENT,
BPF_TRACE_KPROBE_MULTI,
BPF_LSM_CGROUP,
+ BPF_STRUCT_OPS,
__MAX_BPF_ATTACH_TYPE
};
@@ -1266,6 +1267,9 @@ enum {
/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
+
+/* Create a map that will be registered/unregistered by the backed bpf_link */
+ BPF_F_LINK = (1U << 13),
};
/* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@ union bpf_attr {
} task_fd_query;
struct { /* struct used by BPF_LINK_CREATE command */
- __u32 prog_fd; /* eBPF program to attach */
+ union {
+ __u32 prog_fd; /* eBPF program to attach */
+ __u32 map_fd; /* eBPF struct_ops to attach */
+ };
union {
__u32 target_fd; /* object to attach to */
__u32 target_ifindex; /* target ifindex */
@@ -1548,12 +1555,23 @@ union bpf_attr {
struct { /* struct used by BPF_LINK_UPDATE command */
__u32 link_fd; /* link fd */
- /* new program fd to update link with */
- __u32 new_prog_fd;
+ union {
+ /* new program fd to update link with */
+ __u32 new_prog_fd;
+ /* new struct_ops map fd to update link with */
+ __u32 new_map_fd;
+ };
__u32 flags; /* extra flags */
- /* expected link's program fd; is specified only if
- * BPF_F_REPLACE flag is set in flags */
- __u32 old_prog_fd;
+ union {
+ /* expected link's program fd; is specified only if
+ * BPF_F_REPLACE flag is set in flags.
+ */
+ __u32 old_prog_fd;
+ /* expected link's map fd; is specified only
+ * if BPF_F_REPLACE flag is set.
+ */
+ __u32 old_map_fd;
+ };
} link_update;
struct {
@@ -6379,6 +6397,9 @@ struct bpf_link_info {
struct {
__u32 ifindex;
} xdp;
+ struct {
+ __u32 map_id;
+ } struct_ops;
};
} __attribute__((aligned(8)));
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index e750b6f5fcc368..767035900354da 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -794,11 +794,17 @@ int bpf_link_update(int link_fd, int new_prog_fd,
if (!OPTS_VALID(opts, bpf_link_update_opts))
return libbpf_err(-EINVAL);
+ if (OPTS_GET(opts, old_prog_fd, 0) && OPTS_GET(opts, old_map_fd, 0))
+ return libbpf_err(-EINVAL);
+
memset(&attr, 0, attr_sz);
attr.link_update.link_fd = link_fd;
attr.link_update.new_prog_fd = new_prog_fd;
attr.link_update.flags = OPTS_GET(opts, flags, 0);
- attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0);
+ if (OPTS_GET(opts, old_prog_fd, 0))
+ attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0);
+ else if (OPTS_GET(opts, old_map_fd, 0))
+ attr.link_update.old_map_fd = OPTS_GET(opts, old_map_fd, 0);
ret = sys_bpf(BPF_LINK_UPDATE, &attr, attr_sz);
return libbpf_err_errno(ret);
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index f0f78637323818..b073e73439efd4 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -336,8 +336,9 @@ struct bpf_link_update_opts {
size_t sz; /* size of this struct for forward/backward compatibility */
__u32 flags; /* extra flags */
__u32 old_prog_fd; /* expected old program FD */
+ __u32 old_map_fd; /* expected old map FD */
};
-#define bpf_link_update_opts__last_field old_prog_fd
+#define bpf_link_update_opts__last_field old_map_fd
LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd,
const struct bpf_link_update_opts *opts);
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 5d32aa8ea38ab1..f6a071db5c6e59 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -116,6 +116,7 @@ static const char * const attach_type_name[] = {
[BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_reuseport_select_or_migrate",
[BPF_PERF_EVENT] = "perf_event",
[BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi",
+ [BPF_STRUCT_OPS] = "struct_ops",
};
static const char * const link_type_name[] = {
@@ -467,6 +468,7 @@ struct bpf_struct_ops {
#define KCONFIG_SEC ".kconfig"
#define KSYMS_SEC ".ksyms"
#define STRUCT_OPS_SEC ".struct_ops"
+#define STRUCT_OPS_LINK_SEC ".struct_ops.link"
enum libbpf_map_type {
LIBBPF_MAP_UNSPEC,
@@ -596,6 +598,7 @@ struct elf_state {
Elf64_Ehdr *ehdr;
Elf_Data *symbols;
Elf_Data *st_ops_data;
+ Elf_Data *st_ops_link_data;
size_t shstrndx; /* section index for section name strings */
size_t strtabidx;
struct elf_sec_desc *secs;
@@ -605,6 +608,7 @@ struct elf_state {
int text_shndx;
int symbols_shndx;
int st_ops_shndx;
+ int st_ops_link_shndx;
};
struct usdt_manager;
@@ -1118,7 +1122,8 @@ static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj)
return 0;
}
-static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
+static int init_struct_ops_maps(struct bpf_object *obj, const char *sec_name,
+ int shndx, Elf_Data *data, __u32 map_flags)
{
const struct btf_type *type, *datasec;
const struct btf_var_secinfo *vsi;
@@ -1129,15 +1134,15 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
struct bpf_map *map;
__u32 i;
- if (obj->efile.st_ops_shndx == -1)
+ if (shndx == -1)
return 0;
btf = obj->btf;
- datasec_id = btf__find_by_name_kind(btf, STRUCT_OPS_SEC,
+ datasec_id = btf__find_by_name_kind(btf, sec_name,
BTF_KIND_DATASEC);
if (datasec_id < 0) {
pr_warn("struct_ops init: DATASEC %s not found\n",
- STRUCT_OPS_SEC);
+ sec_name);
return -EINVAL;
}
@@ -1150,7 +1155,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
type_id = btf__resolve_type(obj->btf, vsi->type);
if (type_id < 0) {
pr_warn("struct_ops init: Cannot resolve var type_id %u in DATASEC %s\n",
- vsi->type, STRUCT_OPS_SEC);
+ vsi->type, sec_name);
return -EINVAL;
}
@@ -1169,7 +1174,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
if (IS_ERR(map))
return PTR_ERR(map);
- map->sec_idx = obj->efile.st_ops_shndx;
+ map->sec_idx = shndx;
map->sec_offset = vsi->offset;
map->name = strdup(var_name);
if (!map->name)
@@ -1179,6 +1184,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
map->def.key_size = sizeof(int);
map->def.value_size = type->size;
map->def.max_entries = 1;
+ map->def.map_flags = map_flags;
map->st_ops = calloc(1, sizeof(*map->st_ops));
if (!map->st_ops)
@@ -1191,14 +1197,14 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
if (!st_ops->data || !st_ops->progs || !st_ops->kern_func_off)
return -ENOMEM;
- if (vsi->offset + type->size > obj->efile.st_ops_data->d_size) {
+ if (vsi->offset + type->size > data->d_size) {
pr_warn("struct_ops init: var %s is beyond the end of DATASEC %s\n",
- var_name, STRUCT_OPS_SEC);
+ var_name, sec_name);
return -EINVAL;
}
memcpy(st_ops->data,
- obj->efile.st_ops_data->d_buf + vsi->offset,
+ data->d_buf + vsi->offset,
type->size);
st_ops->tname = tname;
st_ops->type = type;
@@ -1211,6 +1217,19 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
return 0;
}
+static int bpf_object_init_struct_ops(struct bpf_object *obj)
+{
+ int err;
+
+ err = init_struct_ops_maps(obj, STRUCT_OPS_SEC, obj->efile.st_ops_shndx,
+ obj->efile.st_ops_data, 0);
+ err = err ?: init_struct_ops_maps(obj, STRUCT_OPS_LINK_SEC,
+ obj->efile.st_ops_link_shndx,
+ obj->efile.st_ops_link_data,
+ BPF_F_LINK);
+ return err;
+}
+
static struct bpf_object *bpf_object__new(const char *path,
const void *obj_buf,
size_t obj_buf_sz,
@@ -1247,6 +1266,7 @@ static struct bpf_object *bpf_object__new(const char *path,
obj->efile.obj_buf_sz = obj_buf_sz;
obj->efile.btf_maps_shndx = -1;
obj->efile.st_ops_shndx = -1;
+ obj->efile.st_ops_link_shndx = -1;
obj->kconfig_map_idx = -1;
obj->kern_version = get_kernel_version();
@@ -1264,6 +1284,7 @@ static void bpf_object__elf_finish(struct bpf_object *obj)
obj->efile.elf = NULL;
obj->efile.symbols = NULL;
obj->efile.st_ops_data = NULL;
+ obj->efile.st_ops_link_data = NULL;
zfree(&obj->efile.secs);
obj->efile.sec_cnt = 0;
@@ -2618,7 +2639,7 @@ static int bpf_object__init_maps(struct bpf_object *obj,
err = bpf_object__init_user_btf_maps(obj, strict, pin_root_path);
err = err ?: bpf_object__init_global_data_maps(obj);
err = err ?: bpf_object__init_kconfig_map(obj);
- err = err ?: bpf_object__init_struct_ops_maps(obj);
+ err = err ?: bpf_object_init_struct_ops(obj);
return err;
}
@@ -2752,12 +2773,13 @@ static bool libbpf_needs_btf(const struct bpf_object *obj)
{
return obj->efile.btf_maps_shndx >= 0 ||
obj->efile.st_ops_shndx >= 0 ||
+ obj->efile.st_ops_link_shndx >= 0 ||
obj->nr_extern > 0;
}
static bool kernel_needs_btf(const struct bpf_object *obj)
{
- return obj->efile.st_ops_shndx >= 0;
+ return obj->efile.st_ops_shndx >= 0 || obj->efile.st_ops_link_shndx >= 0;
}
static int bpf_object__init_btf(struct bpf_object *obj,
@@ -3450,6 +3472,9 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
} else if (strcmp(name, STRUCT_OPS_SEC) == 0) {
obj->efile.st_ops_data = data;
obj->efile.st_ops_shndx = idx;
+ } else if (strcmp(name, STRUCT_OPS_LINK_SEC) == 0) {
+ obj->efile.st_ops_link_data = data;
+ obj->efile.st_ops_link_shndx = idx;
} else {
pr_info("elf: skipping unrecognized data section(%d) %s\n",
idx, name);
@@ -3464,6 +3489,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
/* Only do relo for section with exec instructions */
if (!section_have_execinstr(obj, targ_sec_idx) &&
strcmp(name, ".rel" STRUCT_OPS_SEC) &&
+ strcmp(name, ".rel" STRUCT_OPS_LINK_SEC) &&
strcmp(name, ".rel" MAPS_ELF_SEC)) {
pr_info("elf: skipping relo section(%d) %s for section(%d) %s\n",
idx, name, targ_sec_idx,
@@ -6610,7 +6636,7 @@ static int bpf_object__collect_relos(struct bpf_object *obj)
return -LIBBPF_ERRNO__INTERNAL;
}
- if (idx == obj->efile.st_ops_shndx)
+ if (idx == obj->efile.st_ops_shndx || idx == obj->efile.st_ops_link_shndx)
err = bpf_object__collect_st_ops_relos(obj, shdr, data);
else if (idx == obj->efile.btf_maps_shndx)
err = bpf_object__collect_map_relos(obj, shdr, data);
@@ -7686,6 +7712,37 @@ static int bpf_object__resolve_externs(struct bpf_object *obj,
return 0;
}
+static void bpf_map_prepare_vdata(const struct bpf_map *map)
+{
+ struct bpf_struct_ops *st_ops;
+ __u32 i;
+
+ st_ops = map->st_ops;
+ for (i = 0; i < btf_vlen(st_ops->type); i++) {
+ struct bpf_program *prog = st_ops->progs[i];
+ void *kern_data;
+ int prog_fd;
+
+ if (!prog)
+ continue;
+
+ prog_fd = bpf_program__fd(prog);
+ kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
+ *(unsigned long *)kern_data = prog_fd;
+ }
+}
+
+static int bpf_object_prepare_struct_ops(struct bpf_object *obj)
+{
+ int i;
+
+ for (i = 0; i < obj->nr_maps; i++)
+ if (bpf_map__is_struct_ops(&obj->maps[i]))
+ bpf_map_prepare_vdata(&obj->maps[i]);
+
+ return 0;
+}
+
static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path)
{
int err, i;
@@ -7711,6 +7768,7 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch
err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path);
err = err ? : bpf_object__load_progs(obj, extra_log_level);
err = err ? : bpf_object_init_prog_arrays(obj);
+ err = err ? : bpf_object_prepare_struct_ops(obj);
if (obj->gen_loader) {
/* reset FDs */
@@ -8820,6 +8878,7 @@ const char *libbpf_bpf_prog_type_str(enum bpf_prog_type t)
}
static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj,
+ int sec_idx,
size_t offset)
{
struct bpf_map *map;
@@ -8829,7 +8888,8 @@ static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj,
map = &obj->maps[i];
if (!bpf_map__is_struct_ops(map))
continue;
- if (map->sec_offset <= offset &&
+ if (map->sec_idx == sec_idx &&
+ map->sec_offset <= offset &&
offset - map->sec_offset < map->def.value_size)
return map;
}
@@ -8871,7 +8931,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
}
name = elf_sym_str(obj, sym->st_name) ?: "<?>";
- map = find_struct_ops_map_by_offset(obj, rel->r_offset);
+ map = find_struct_ops_map_by_offset(obj, shdr->sh_info, rel->r_offset);
if (!map) {
pr_warn("struct_ops reloc: cannot find map at rel->r_offset %zu\n",
(size_t)rel->r_offset);
@@ -8938,8 +8998,9 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
}
/* struct_ops BPF prog can be re-used between multiple
- * .struct_ops as long as it's the same struct_ops struct
- * definition and the same function pointer field
+ * .struct_ops & .struct_ops.link as long as it's the
+ * same struct_ops struct definition and the same
+ * function pointer field
*/
if (prog->attach_btf_id != st_ops->type_id ||
prog->expected_attach_type != member_idx) {
@@ -11579,22 +11640,30 @@ struct bpf_link *bpf_program__attach(const struct bpf_program *prog)
return link;
}
+struct bpf_link_struct_ops {
+ struct bpf_link link;
+ int map_fd;
+};
+
static int bpf_link__detach_struct_ops(struct bpf_link *link)
{
+ struct bpf_link_struct_ops *st_link;
__u32 zero = 0;
- if (bpf_map_delete_elem(link->fd, &zero))
- return -errno;
+ st_link = container_of(link, struct bpf_link_struct_ops, link);
- return 0;
+ if (st_link->map_fd < 0)
+ /* w/o a real link */
+ return bpf_map_delete_elem(link->fd, &zero);
+
+ return close(link->fd);
}
struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
{
- struct bpf_struct_ops *st_ops;
- struct bpf_link *link;
- __u32 i, zero = 0;
- int err;
+ struct bpf_link_struct_ops *link;
+ __u32 zero = 0;
+ int err, fd;
if (!bpf_map__is_struct_ops(map) || map->fd == -1)
return libbpf_err_ptr(-EINVAL);
@@ -11603,31 +11672,72 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
if (!link)
return libbpf_err_ptr(-EINVAL);
- st_ops = map->st_ops;
- for (i = 0; i < btf_vlen(st_ops->type); i++) {
- struct bpf_program *prog = st_ops->progs[i];
- void *kern_data;
- int prog_fd;
+ /* kern_vdata should be prepared during the loading phase. */
+ err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0);
+ /* It can be EBUSY if the map has been used to create or
+ * update a link before. We don't allow updating the value of
+ * a struct_ops once it is set. That ensures that the value
+ * never changes. So, it is safe to skip EBUSY.
+ */
+ if (err && (!(map->def.map_flags & BPF_F_LINK) || err != -EBUSY)) {
+ free(link);
+ return libbpf_err_ptr(err);
+ }
- if (!prog)
- continue;
+ link->link.detach = bpf_link__detach_struct_ops;
- prog_fd = bpf_program__fd(prog);
- kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
- *(unsigned long *)kern_data = prog_fd;
+ if (!(map->def.map_flags & BPF_F_LINK)) {
+ /* w/o a real link */
+ link->link.fd = map->fd;
+ link->map_fd = -1;
+ return &link->link;
}
- err = bpf_map_update_elem(map->fd, &zero, st_ops->kern_vdata, 0);
- if (err) {
- err = -errno;
+ fd = bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, NULL);
+ if (fd < 0) {
free(link);
- return libbpf_err_ptr(err);
+ return libbpf_err_ptr(fd);
}
- link->detach = bpf_link__detach_struct_ops;
- link->fd = map->fd;
+ link->link.fd = fd;
+ link->map_fd = map->fd;
- return link;
+ return &link->link;
+}
+
+/*
+ * Swap the backing struct_ops map of a link with a new struct_ops map.
+ */
+int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map)
+{
+ struct bpf_link_struct_ops *st_ops_link;
+ __u32 zero = 0;
+ int err;
+
+ if (!bpf_map__is_struct_ops(map) || map->fd < 0)
+ return -EINVAL;
+
+ st_ops_link = container_of(link, struct bpf_link_struct_ops, link);
+ /* Ensure the type of a link is correct */
+ if (st_ops_link->map_fd < 0)
+ return -EINVAL;
+
+ err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0);
+ /* It can be EBUSY if the map has been used to create or
+ * update a link before. We don't allow updating the value of
+ * a struct_ops once it is set. That ensures that the value
+ * never changes. So, it is safe to skip EBUSY.
+ */
+ if (err && err != -EBUSY)
+ return err;
+
+ err = bpf_link_update(link->fd, map->fd, NULL);
+ if (err < 0)
+ return err;
+
+ st_ops_link->map_fd = map->fd;
+
+ return 0;
}
typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr,
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index db4992a036f8b6..1615e55e2e7907 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -719,6 +719,7 @@ bpf_program__attach_freplace(const struct bpf_program *prog,
struct bpf_map;
LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map);
+LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map);
struct bpf_iter_attach_opts {
size_t sz; /* size of this struct for forward/backward compatibility */
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 50dde1f6521ef5..a5aa3a383d694f 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -386,6 +386,7 @@ LIBBPF_1.1.0 {
LIBBPF_1.2.0 {
global:
bpf_btf_get_info_by_fd;
+ bpf_link__update_map;
bpf_link_get_info_by_fd;
bpf_map_get_info_by_fd;
bpf_prog_get_info_by_fd;
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
index e980188d41246f..a53c254c605807 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -8,6 +8,7 @@
#include "bpf_dctcp.skel.h"
#include "bpf_cubic.skel.h"
#include "bpf_tcp_nogpl.skel.h"
+#include "tcp_ca_update.skel.h"
#include "bpf_dctcp_release.skel.h"
#include "tcp_ca_write_sk_pacing.skel.h"
#include "tcp_ca_incompl_cong_ops.skel.h"
@@ -381,6 +382,155 @@ static void test_unsupp_cong_op(void)
libbpf_set_print(old_print_fn);
}
+static void test_update_ca(void)
+{
+ struct tcp_ca_update *skel;
+ struct bpf_link *link;
+ int saved_ca1_cnt;
+ int err;
+
+ skel = tcp_ca_update__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "open"))
+ return;
+
+ link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
+ ASSERT_OK_PTR(link, "attach_struct_ops");
+
+ do_test("tcp_ca_update", NULL);
+ saved_ca1_cnt = skel->bss->ca1_cnt;
+ ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt");
+
+ err = bpf_link__update_map(link, skel->maps.ca_update_2);
+ ASSERT_OK(err, "update_map");
+
+ do_test("tcp_ca_update", NULL);
+ ASSERT_EQ(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt");
+ ASSERT_GT(skel->bss->ca2_cnt, 0, "ca2_ca2_cnt");
+
+ bpf_link__destroy(link);
+ tcp_ca_update__destroy(skel);
+}
+
+static void test_update_wrong(void)
+{
+ struct tcp_ca_update *skel;
+ struct bpf_link *link;
+ int saved_ca1_cnt;
+ int err;
+
+ skel = tcp_ca_update__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "open"))
+ return;
+
+ link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
+ ASSERT_OK_PTR(link, "attach_struct_ops");
+
+ do_test("tcp_ca_update", NULL);
+ saved_ca1_cnt = skel->bss->ca1_cnt;
+ ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt");
+
+ err = bpf_link__update_map(link, skel->maps.ca_wrong);
+ ASSERT_ERR(err, "update_map");
+
+ do_test("tcp_ca_update", NULL);
+ ASSERT_GT(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt");
+
+ bpf_link__destroy(link);
+ tcp_ca_update__destroy(skel);
+}
+
+static void test_mixed_links(void)
+{
+ struct tcp_ca_update *skel;
+ struct bpf_link *link, *link_nl;
+ int err;
+
+ skel = tcp_ca_update__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "open"))
+ return;
+
+ link_nl = bpf_map__attach_struct_ops(skel->maps.ca_no_link);
+ ASSERT_OK_PTR(link_nl, "attach_struct_ops_nl");
+
+ link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
+ ASSERT_OK_PTR(link, "attach_struct_ops");
+
+ do_test("tcp_ca_update", NULL);
+ ASSERT_GT(skel->bss->ca1_cnt, 0, "ca1_ca1_cnt");
+
+ err = bpf_link__update_map(link, skel->maps.ca_no_link);
+ ASSERT_ERR(err, "update_map");
+
+ bpf_link__destroy(link);
+ bpf_link__destroy(link_nl);
+ tcp_ca_update__destroy(skel);
+}
+
+static void test_multi_links(void)
+{
+ struct tcp_ca_update *skel;
+ struct bpf_link *link;
+
+ skel = tcp_ca_update__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "open"))
+ return;
+
+ link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
+ ASSERT_OK_PTR(link, "attach_struct_ops_1st");
+ bpf_link__destroy(link);
+
+ /* A map should be able to be used to create links multiple
+ * times.
+ */
+ link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
+ ASSERT_OK_PTR(link, "attach_struct_ops_2nd");
+ bpf_link__destroy(link);
+
+ tcp_ca_update__destroy(skel);
+}
+
+static void test_link_replace(void)
+{
+ DECLARE_LIBBPF_OPTS(bpf_link_update_opts, opts);
+ struct tcp_ca_update *skel;
+ struct bpf_link *link;
+ int err;
+
+ skel = tcp_ca_update__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "open"))
+ return;
+
+ link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
+ ASSERT_OK_PTR(link, "attach_struct_ops_1st");
+ bpf_link__destroy(link);
+
+ link = bpf_map__attach_struct_ops(skel->maps.ca_update_2);
+ ASSERT_OK_PTR(link, "attach_struct_ops_2nd");
+
+ /* BPF_F_REPLACE with a wrong old map fd. It should fail!
+ *
+ * With BPF_F_REPLACE, the link should be updated only if the
+ * old map fd given here matches the map backing the link.
+ */
+ opts.old_map_fd = bpf_map__fd(skel->maps.ca_update_1);
+ opts.flags = BPF_F_REPLACE;
+ err = bpf_link_update(bpf_link__fd(link),
+ bpf_map__fd(skel->maps.ca_update_1),
+ &opts);
+ ASSERT_ERR(err, "bpf_link_update_fail");
+
+ /* BPF_F_REPLACE with a correct old map fd. It should succeed! */
+ opts.old_map_fd = bpf_map__fd(skel->maps.ca_update_2);
+ err = bpf_link_update(bpf_link__fd(link),
+ bpf_map__fd(skel->maps.ca_update_1),
+ &opts);
+ ASSERT_OK(err, "bpf_link_update_success");
+
+ bpf_link__destroy(link);
+
+ tcp_ca_update__destroy(skel);
+}
+
void test_bpf_tcp_ca(void)
{
if (test__start_subtest("dctcp"))
@@ -399,4 +549,14 @@ void test_bpf_tcp_ca(void)
test_incompl_cong_ops();
if (test__start_subtest("unsupp_cong_op"))
test_unsupp_cong_op();
+ if (test__start_subtest("update_ca"))
+ test_update_ca();
+ if (test__start_subtest("update_wrong"))
+ test_update_wrong();
+ if (test__start_subtest("mixed_links"))
+ test_mixed_links();
+ if (test__start_subtest("multi_links"))
+ test_multi_links();
+ if (test__start_subtest("link_replace"))
+ test_link_replace();
}
diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
new file mode 100644
index 00000000000000..b93a0ed3305780
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+int ca1_cnt = 0;
+int ca2_cnt = 0;
+
+static inline struct tcp_sock *tcp_sk(const struct sock *sk)
+{
+ return (struct tcp_sock *)sk;
+}
+
+SEC("struct_ops/ca_update_1_init")
+void BPF_PROG(ca_update_1_init, struct sock *sk)
+{
+ ca1_cnt++;
+}
+
+SEC("struct_ops/ca_update_2_init")
+void BPF_PROG(ca_update_2_init, struct sock *sk)
+{
+ ca2_cnt++;
+}
+
+SEC("struct_ops/ca_update_cong_control")
+void BPF_PROG(ca_update_cong_control, struct sock *sk,
+ const struct rate_sample *rs)
+{
+}
+
+SEC("struct_ops/ca_update_ssthresh")
+__u32 BPF_PROG(ca_update_ssthresh, struct sock *sk)
+{
+ return tcp_sk(sk)->snd_ssthresh;
+}
+
+SEC("struct_ops/ca_update_undo_cwnd")
+__u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk)
+{
+ return tcp_sk(sk)->snd_cwnd;
+}
+
+SEC(".struct_ops.link")
+struct tcp_congestion_ops ca_update_1 = {
+ .init = (void *)ca_update_1_init,
+ .cong_control = (void *)ca_update_cong_control,
+ .ssthresh = (void *)ca_update_ssthresh,
+ .undo_cwnd = (void *)ca_update_undo_cwnd,
+ .name = "tcp_ca_update",
+};
+
+SEC(".struct_ops.link")
+struct tcp_congestion_ops ca_update_2 = {
+ .init = (void *)ca_update_2_init,
+ .cong_control = (void *)ca_update_cong_control,
+ .ssthresh = (void *)ca_update_ssthresh,
+ .undo_cwnd = (void *)ca_update_undo_cwnd,
+ .name = "tcp_ca_update",
+};
+
+SEC(".struct_ops.link")
+struct tcp_congestion_ops ca_wrong = {
+ .cong_control = (void *)ca_update_cong_control,
+ .ssthresh = (void *)ca_update_ssthresh,
+ .undo_cwnd = (void *)ca_update_undo_cwnd,
+ .name = "tcp_ca_wrong",
+};
+
+SEC(".struct_ops")
+struct tcp_congestion_ops ca_no_link = {
+ .cong_control = (void *)ca_update_cong_control,
+ .ssthresh = (void *)ca_update_ssthresh,
+ .undo_cwnd = (void *)ca_update_undo_cwnd,
+ .name = "tcp_ca_no_link",
+};