linux/net/mptcp/pm_kernel.c
Matthieu Baerts (NGI0) 539f6b9de3 mptcp: pm: in-kernel: add laminar endpoints
Currently, upon the reception of an ADD_ADDR (and when the fullmesh flag
is not used), the in-kernel PM will create new subflows using whatever
local address the routing configuration picks.

It would be easier to pick local addresses from a selected list of
endpoints, and use each of them only once, rather than relying on the
routing rules.

Use case: both the client (C) and the server (S) have two addresses (a
and b). The client establishes the connection between C(a) and S(a).
Once established, the server announces its additional address S(b). Once
received, the client connects to it using its second address C(b).
Compared to a situation without the 'laminar' endpoint for C(b), the
client did not use this address C(b) to establish a subflow to the
server's primary address S(a). So in the end, we have:

   C        S
  C(a) --- S(a)
  C(b) --- S(b)

In case of a 3rd address on each side (C(c) and S(c)), upon the
reception of an ADD_ADDR with S(c), the client should not pick C(b)
because it has already been used. C(c) should then be used.

Note that this situation is currently possible if C doesn't add any
endpoint, but configures the routing so that C(b) is picked for the
route to S(b), and C(c) for the route to S(c). That doesn't sound very
practical, because it means knowing in advance the IP addresses that
will be used and announced by the server.

'laminar', like the idea of laminar flows: the different subflows don't
mix with each other on an endpoint, unlike the "turbulent" way traffic
is mixed by 'fullmesh'.

In the code, the new endpoint type is added. Similar to the other
subflow types, an MPTCP_INFO counter is added. While at it, the holes in
struct mptcp_info are now commented, to remember next time that these
holes can no longer be used.
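
For reference, these MPTCP_INFO counters are exposed to applications via
getsockopt(). A minimal sketch, using only pre-existing fields of
struct mptcp_info (the new counter added by this patch would be read
from the same structure):

/* Minimal sketch: dump a few MPTCP_INFO counters from an application.
 * Only long-standing fields are used here.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/mptcp.h>

#ifndef SOL_MPTCP
#define SOL_MPTCP 284
#endif

static void dump_pm_counters(int mptcp_fd)
{
	struct mptcp_info info;
	socklen_t len = sizeof(info);

	memset(&info, 0, sizeof(info));
	if (getsockopt(mptcp_fd, SOL_MPTCP, MPTCP_INFO, &info, &len) < 0) {
		perror("getsockopt(MPTCP_INFO)");
		return;
	}

	printf("subflows: %u (max %u), add_addr accepted: %u (max %u)\n",
	       info.mptcpi_subflows, info.mptcpi_subflows_max,
	       info.mptcpi_add_addr_accepted,
	       info.mptcpi_add_addr_accepted_max);
}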

Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/503
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-15-ad126cc47c6b@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-09-26 17:44:05 -07:00


// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
*
* Copyright (c) 2025, Matthieu Baerts.
*/
#define pr_fmt(fmt) "MPTCP: " fmt
#include <net/netns/generic.h>
#include "protocol.h"
#include "mib.h"
#include "mptcp_pm_gen.h"
static int pm_nl_pernet_id;
struct pm_nl_pernet {
/* protects pernet updates */
spinlock_t lock;
struct list_head endp_list;
u8 endpoints;
u8 endp_signal_max;
u8 endp_subflow_max;
u8 endp_laminar_max;
u8 limit_add_addr_accepted;
u8 limit_extra_subflows;
u8 next_id;
DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
};
#define MPTCP_PM_ADDR_MAX 8
static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net)
{
return net_generic(net, pm_nl_pernet_id);
}
static struct pm_nl_pernet *
pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk)
{
return pm_nl_get_pernet(sock_net((struct sock *)msk));
}
static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
{
return pm_nl_get_pernet(genl_info_net(info));
}
u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk)
{
const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
return READ_ONCE(pernet->endp_signal_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max);
u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
return READ_ONCE(pernet->endp_subflow_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max);
u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
return READ_ONCE(pernet->endp_laminar_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_laminar_max);
u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
return READ_ONCE(pernet->limit_add_addr_accepted);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_add_addr_accepted);
u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
return READ_ONCE(pernet->limit_extra_subflows);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_extra_subflows);
static bool lookup_subflow_by_daddr(const struct list_head *list,
const struct mptcp_addr_info *daddr)
{
struct mptcp_subflow_context *subflow;
struct mptcp_addr_info cur;
list_for_each_entry(subflow, list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (!((1 << inet_sk_state_load(ssk)) &
(TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV)))
continue;
mptcp_remote_address((struct sock_common *)ssk, &cur);
if (mptcp_addresses_equal(&cur, daddr, daddr->port))
return true;
}
return false;
}
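/* Pick the first 'subflow' endpoint whose ID is still marked as
 * available in this msk's id_avail_bitmap.
 */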
static bool
select_local_address(const struct pm_nl_pernet *pernet,
const struct mptcp_sock *msk,
struct mptcp_pm_local *new_local)
{
struct mptcp_pm_addr_entry *entry;
bool found = false;
msk_owned_by_me(msk);
rcu_read_lock();
list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
continue;
if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
continue;
new_local->addr = entry->addr;
new_local->flags = entry->flags;
new_local->ifindex = entry->ifindex;
found = true;
break;
}
rcu_read_unlock();
return found;
}
static bool
select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk,
struct mptcp_pm_local *new_local)
{
struct mptcp_pm_addr_entry *entry;
bool found = false;
rcu_read_lock();
/* do not keep any additional per socket state, just signal
* the address list in order.
* Note: removal from the local address list during the msk life-cycle
* can lead to additional addresses not being announced.
*/
list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
continue;
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
continue;
new_local->addr = entry->addr;
new_local->flags = entry->flags;
new_local->ifindex = entry->ifindex;
found = true;
break;
}
rcu_read_unlock();
return found;
}
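/* Non-fullmesh case: only target the remote address of the initial
 * subflow, unless the peer denied joins towards its ID 0 address.
 */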
static unsigned int
fill_remote_addr(struct mptcp_sock *msk, struct mptcp_addr_info *local,
struct mptcp_addr_info *addrs)
{
bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0);
struct mptcp_addr_info remote = { 0 };
struct sock *sk = (struct sock *)msk;
if (deny_id0)
return 0;
mptcp_remote_address((struct sock_common *)sk, &remote);
if (!mptcp_pm_addr_families_match(sk, local, &remote))
return 0;
msk->pm.extra_subflows++;
*addrs = remote;
return 1;
}
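/* Fullmesh case: from this local address, target all the remote
 * addresses already in use by other subflows, skipping the pairs that
 * already exist.
 */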
static unsigned int
fill_remote_addresses_fullmesh(struct mptcp_sock *msk,
struct mptcp_addr_info *local,
struct mptcp_addr_info *addrs)
{
u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0);
DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
struct sock *sk = (struct sock *)msk, *ssk;
struct mptcp_subflow_context *subflow;
int i = 0;
/* Forbid creation of new subflows matching existing ones, possibly
* already created by incoming ADD_ADDR
*/
bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
mptcp_for_each_subflow(msk, subflow)
if (READ_ONCE(subflow->local_id) == local->id)
__set_bit(subflow->remote_id, unavail_id);
mptcp_for_each_subflow(msk, subflow) {
ssk = mptcp_subflow_tcp_sock(subflow);
mptcp_remote_address((struct sock_common *)ssk, &addrs[i]);
addrs[i].id = READ_ONCE(subflow->remote_id);
if (deny_id0 && !addrs[i].id)
continue;
if (test_bit(addrs[i].id, unavail_id))
continue;
if (!mptcp_pm_addr_families_match(sk, local, &addrs[i]))
continue;
/* forbid creating multiple address towards this id */
__set_bit(addrs[i].id, unavail_id);
msk->pm.extra_subflows++;
i++;
if (msk->pm.extra_subflows >= limit_extra_subflows)
break;
}
return i;
}
/* Fill all the remote addresses into the array addrs[],
* and return the array size.
*/
static unsigned int
fill_remote_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *local,
bool fullmesh, struct mptcp_addr_info *addrs)
{
/* Non-fullmesh: fill in the single entry corresponding to the primary
* MPC subflow remote address, and return 1, corresponding to 1 entry.
*/
if (!fullmesh)
return fill_remote_addr(msk, local, addrs);
/* Fullmesh endpoint: fill all possible remote addresses */
return fill_remote_addresses_fullmesh(msk, local, addrs);
}
static struct mptcp_pm_addr_entry *
__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
{
struct mptcp_pm_addr_entry *entry;
list_for_each_entry_rcu(entry, &pernet->endp_list, list,
lockdep_is_held(&pernet->lock)) {
if (entry->addr.id == id)
return entry;
}
return NULL;
}
static struct mptcp_pm_addr_entry *
__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info)
{
struct mptcp_pm_addr_entry *entry;
list_for_each_entry_rcu(entry, &pernet->endp_list, list,
lockdep_is_held(&pernet->lock)) {
if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port))
return entry;
}
return NULL;
}
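/* The endpoint attached to the initial (MPC) subflow is always exposed
 * on the wire as address ID 0.
 */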
static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr)
{
return msk->mpc_endpoint_id == addr->id ? 0 : addr->id;
}
/* Set mpc_endpoint_id, and send MP_PRIO for ID0 if needed */
static void mptcp_mpc_endpoint_setup(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
struct mptcp_pm_addr_entry *entry;
struct mptcp_addr_info mpc_addr;
struct pm_nl_pernet *pernet;
bool backup = false;
/* do lazy endpoint usage accounting for the MPC subflows */
if (likely(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED)) ||
!msk->first)
return;
subflow = mptcp_subflow_ctx(msk->first);
pernet = pm_nl_get_pernet_from_msk(msk);
mptcp_local_address((struct sock_common *)msk->first, &mpc_addr);
rcu_read_lock();
entry = __lookup_addr(pernet, &mpc_addr);
if (entry) {
__clear_bit(entry->addr.id, msk->pm.id_avail_bitmap);
msk->mpc_endpoint_id = entry->addr.id;
backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
}
rcu_read_unlock();
/* Send MP_PRIO */
if (backup)
mptcp_pm_send_ack(msk, subflow, true, backup);
msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED);
}
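/* Announce the next available 'signal' endpoint, then create new
 * subflows from the 'subflow' endpoints, within the configured limits.
 */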
static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
{
u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk);
u8 endp_signal_max = mptcp_pm_get_endp_signal_max(msk);
struct sock *sk = (struct sock *)msk;
bool signal_and_subflow = false;
struct mptcp_pm_local local;
mptcp_mpc_endpoint_setup(msk);
pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
msk->pm.local_addr_used, endp_subflow_max,
msk->pm.add_addr_signaled, endp_signal_max,
msk->pm.extra_subflows, limit_extra_subflows);
/* check first for announce */
if (msk->pm.add_addr_signaled < endp_signal_max) {
/* due to racing events on both ends we can reach here while
* previous add address is still running: if we invoke now
* mptcp_pm_announce_addr(), that will fail and the
* corresponding id will be marked as used.
* Instead let the PM machinery reschedule us when the
* current address announce will be completed.
*/
if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL))
return;
if (!select_signal_address(pernet, msk, &local))
goto subflow;
/* If the alloc fails, we are on memory pressure, not worth
* continuing, and trying to create subflows.
*/
if (!mptcp_pm_alloc_anno_list(msk, &local.addr))
return;
__clear_bit(local.addr.id, msk->pm.id_avail_bitmap);
msk->pm.add_addr_signaled++;
/* Special case for ID0: set the correct ID */
if (local.addr.id == msk->mpc_endpoint_id)
local.addr.id = 0;
mptcp_pm_announce_addr(msk, &local.addr, false);
mptcp_pm_addr_send_ack(msk);
if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
signal_and_subflow = true;
}
subflow:
/* check if should create a new subflow */
while (msk->pm.local_addr_used < endp_subflow_max &&
msk->pm.extra_subflows < limit_extra_subflows) {
struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
bool fullmesh;
int i, nr;
if (signal_and_subflow)
signal_and_subflow = false;
else if (!select_local_address(pernet, msk, &local))
break;
fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH);
__clear_bit(local.addr.id, msk->pm.id_avail_bitmap);
/* Special case for ID0: set the correct ID */
if (local.addr.id == msk->mpc_endpoint_id)
local.addr.id = 0;
else /* local_addr_used is not decr for ID 0 */
msk->pm.local_addr_used++;
nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs);
if (nr == 0)
continue;
spin_unlock_bh(&msk->pm.lock);
for (i = 0; i < nr; i++)
__mptcp_subflow_connect(sk, &local, &addrs[i]);
spin_lock_bh(&msk->pm.lock);
}
mptcp_pm_nl_check_work_pending(msk);
}
static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk)
{
mptcp_pm_create_subflow_or_signal_addr(msk);
}
static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
{
mptcp_pm_create_subflow_or_signal_addr(msk);
}
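/* Use all matching 'fullmesh' endpoints as local addresses for subflows
 * towards the new remote address, up to the extra subflows limit.
 */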
static unsigned int
fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk,
struct mptcp_addr_info *remote,
struct mptcp_pm_local *locals,
bool c_flag_case)
{
u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
struct sock *sk = (struct sock *)msk;
struct mptcp_pm_addr_entry *entry;
struct mptcp_pm_local *local;
int i = 0;
rcu_read_lock();
list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
bool is_id0;
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH))
continue;
if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
continue;
local = &locals[i];
local->addr = entry->addr;
local->flags = entry->flags;
local->ifindex = entry->ifindex;
is_id0 = local->addr.id == msk->mpc_endpoint_id;
if (c_flag_case &&
(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) {
__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
if (!is_id0)
msk->pm.local_addr_used++;
}
/* Special case for ID0: set the correct ID */
if (is_id0)
local->addr.id = 0;
msk->pm.extra_subflows++;
i++;
if (msk->pm.extra_subflows >= limit_extra_subflows)
break;
}
rcu_read_unlock();
return i;
}
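/* Laminar case: pick at most one 'laminar' endpoint that is not already
 * used by an existing subflow, so that each local address only connects
 * to a single remote address.
 */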
static unsigned int
fill_local_laminar_endp(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
struct mptcp_pm_local *locals)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
struct mptcp_pm_addr_entry *entry;
struct mptcp_pm_local *local;
int found = 0;
/* Forbid creation of new subflows matching existing ones, possibly
* already created by 'subflow' endpoints
*/
bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if ((1 << inet_sk_state_load(ssk)) &
(TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING |
TCPF_CLOSE))
continue;
__set_bit(subflow_get_local_id(subflow), unavail_id);
}
rcu_read_lock();
list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR))
continue;
if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
continue;
if (test_bit(mptcp_endp_get_local_id(msk, &entry->addr),
unavail_id))
continue;
local = &locals[0];
local->addr = entry->addr;
local->flags = entry->flags;
local->ifindex = entry->ifindex;
if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
if (local->addr.id != msk->mpc_endpoint_id)
msk->pm.local_addr_used++;
}
msk->pm.extra_subflows++;
found = 1;
break;
}
rcu_read_unlock();
return found;
}
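/* C flag case: use the 'subflow' endpoints as local addresses for the
 * subflows created towards the announced address.
 */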
static unsigned int
fill_local_addresses_vec_c_flag(struct mptcp_sock *msk,
struct mptcp_addr_info *remote,
struct mptcp_pm_local *locals)
{
u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk);
struct sock *sk = (struct sock *)msk;
struct mptcp_pm_local *local;
int i = 0;
while (msk->pm.local_addr_used < endp_subflow_max) {
local = &locals[i];
if (!select_local_address(pernet, msk, local))
break;
__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
if (!mptcp_pm_addr_families_match(sk, &local->addr, remote))
continue;
if (local->addr.id == msk->mpc_endpoint_id)
continue;
msk->pm.local_addr_used++;
msk->pm.extra_subflows++;
i++;
if (msk->pm.extra_subflows >= limit_extra_subflows)
break;
}
return i;
}
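/* Default case: leave the local address unspecified and let the routing
 * configuration pick the source address.
 */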
static unsigned int
fill_local_address_any(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
struct mptcp_pm_local *local)
{
struct sock *sk = (struct sock *)msk;
memset(local, 0, sizeof(*local));
local->addr.family =
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
remote->family == AF_INET6 &&
ipv6_addr_v4mapped(&remote->addr6) ? AF_INET :
#endif
remote->family;
if (!mptcp_pm_addr_families_match(sk, &local->addr, remote))
return 0;
msk->pm.extra_subflows++;
return 1;
}
/* Fill all the local addresses into the array addrs[],
* and return the array size.
*/
static unsigned int
fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
struct mptcp_pm_local *locals)
{
bool c_flag_case = remote->id && mptcp_pm_add_addr_c_flag_case(msk);
int i;
/* If there is at least one MPTCP endpoint with a fullmesh flag */
i = fill_local_addresses_vec_fullmesh(msk, remote, locals, c_flag_case);
if (i)
return i;
/* If there is at least one MPTCP endpoint with a laminar flag */
if (mptcp_pm_get_endp_laminar_max(msk))
return fill_local_laminar_endp(msk, remote, locals);
/* Special case: peer sets the C flag, accept one ADD_ADDR if default
* limits are used -- accepting no ADD_ADDR -- and use subflow endpoints
*/
if (c_flag_case)
return fill_local_addresses_vec_c_flag(msk, remote, locals);
/* No special case: fill in the single 'IPADDRANY' local address */
return fill_local_address_any(msk, remote, &locals[0]);
}
static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
{
u8 limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk);
u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX];
struct sock *sk = (struct sock *)msk;
struct mptcp_addr_info remote;
bool sf_created = false;
int i, nr;
pr_debug("accepted %d:%d remote family %d\n",
msk->pm.add_addr_accepted, limit_add_addr_accepted,
msk->pm.remote.family);
remote = msk->pm.remote;
mptcp_pm_announce_addr(msk, &remote, true);
mptcp_pm_addr_send_ack(msk);
mptcp_mpc_endpoint_setup(msk);
if (lookup_subflow_by_daddr(&msk->conn_list, &remote))
return;
/* pick id 0 port, if none is provided the remote address */
if (!remote.port)
remote.port = sk->sk_dport;
/* connect to the specified remote address, using whatever
* local address the routing configuration will pick.
*/
nr = fill_local_addresses_vec(msk, &remote, locals);
if (nr == 0)
return;
spin_unlock_bh(&msk->pm.lock);
for (i = 0; i < nr; i++)
if (__mptcp_subflow_connect(sk, &locals[i], &remote) == 0)
sf_created = true;
spin_lock_bh(&msk->pm.lock);
if (sf_created) {
/* add_addr_accepted is not decr for ID 0 */
if (remote.id)
msk->pm.add_addr_accepted++;
if (msk->pm.add_addr_accepted >= limit_add_addr_accepted ||
msk->pm.extra_subflows >= limit_extra_subflows)
WRITE_ONCE(msk->pm.accept_addr, false);
}
}
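/* The peer removed an announced address: if the ADD_ADDR limit had been
 * reached, accept new announces again.
 */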
void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id)
{
if (rm_id && !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) {
u8 limit_add_addr_accepted =
mptcp_pm_get_limit_add_addr_accepted(msk);
/* Note: if the subflow has been closed before, this
* add_addr_accepted counter will not be decremented.
*/
if (--msk->pm.add_addr_accepted < limit_add_addr_accepted)
WRITE_ONCE(msk->pm.accept_addr, true);
}
}
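/* Only endpoints flagged 'signal' and not 'subflow' may use a specific
 * port.
 */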
static bool address_use_port(struct mptcp_pm_addr_entry *entry)
{
return (entry->flags &
(MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) ==
MPTCP_PM_ADDR_FLAG_SIGNAL;
}
/* caller must ensure the RCU grace period is already elapsed */
static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry)
{
if (entry->lsk)
sock_release(entry->lsk);
kfree(entry);
}
static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
struct mptcp_pm_addr_entry *entry,
bool needs_id, bool replace)
{
struct mptcp_pm_addr_entry *cur, *del_entry = NULL;
int ret = -EINVAL;
u8 addr_max;
spin_lock_bh(&pernet->lock);
/* to keep the code simple, don't do IDR-like allocation for address ID,
* just bail when we exceed limits
*/
if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID)
pernet->next_id = 1;
if (pernet->endpoints >= MPTCP_PM_ADDR_MAX) {
ret = -ERANGE;
goto out;
}
if (test_bit(entry->addr.id, pernet->id_bitmap)) {
ret = -EBUSY;
goto out;
}
/* do not insert duplicate address, differentiate on port only
* singled addresses
*/
if (!address_use_port(entry))
entry->addr.port = 0;
list_for_each_entry(cur, &pernet->endp_list, list) {
if (mptcp_addresses_equal(&cur->addr, &entry->addr,
cur->addr.port || entry->addr.port)) {
/* allow replacing the exiting endpoint only if such
* endpoint is an implicit one and the user-space
* did not provide an endpoint id
*/
if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) {
ret = -EEXIST;
goto out;
}
if (entry->addr.id)
goto out;
/* allow callers that only need to look up the local
* addr's id to skip replacement. This allows them to
* avoid calling synchronize_rcu in the packet recv
* path.
*/
if (!replace) {
kfree(entry);
ret = cur->addr.id;
goto out;
}
pernet->endpoints--;
entry->addr.id = cur->addr.id;
list_del_rcu(&cur->list);
del_entry = cur;
break;
}
}
if (!entry->addr.id && needs_id) {
find_next:
entry->addr.id = find_next_zero_bit(pernet->id_bitmap,
MPTCP_PM_MAX_ADDR_ID + 1,
pernet->next_id);
if (!entry->addr.id && pernet->next_id != 1) {
pernet->next_id = 1;
goto find_next;
}
}
if (!entry->addr.id && needs_id)
goto out;
__set_bit(entry->addr.id, pernet->id_bitmap);
if (entry->addr.id > pernet->next_id)
pernet->next_id = entry->addr.id;
if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
addr_max = pernet->endp_signal_max;
WRITE_ONCE(pernet->endp_signal_max, addr_max + 1);
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
addr_max = pernet->endp_subflow_max;
WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1);
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) {
addr_max = pernet->endp_laminar_max;
WRITE_ONCE(pernet->endp_laminar_max, addr_max + 1);
}
pernet->endpoints++;
if (!entry->addr.port)
list_add_tail_rcu(&entry->list, &pernet->endp_list);
else
list_add_rcu(&entry->list, &pernet->endp_list);
ret = entry->addr.id;
out:
spin_unlock_bh(&pernet->lock);
/* just replaced an existing entry, free it */
if (del_entry) {
synchronize_rcu();
__mptcp_pm_release_addr_entry(del_entry);
}
return ret;
}
static struct lock_class_key mptcp_slock_keys[2];
static struct lock_class_key mptcp_keys[2];
static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
struct mptcp_pm_addr_entry *entry)
{
bool is_ipv6 = sk->sk_family == AF_INET6;
int addrlen = sizeof(struct sockaddr_in);
struct sockaddr_storage addr;
struct sock *newsk, *ssk;
int backlog = 1024;
int err;
err = sock_create_kern(sock_net(sk), entry->addr.family,
SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk);
if (err)
return err;
newsk = entry->lsk->sk;
if (!newsk)
return -EINVAL;
/* The subflow socket lock is acquired in a nested to the msk one
* in several places, even by the TCP stack, and this msk is a kernel
* socket: lockdep complains. Instead of propagating the _nested
* modifiers in several places, re-init the lock class for the msk
* socket to an mptcp specific one.
*/
sock_lock_init_class_and_name(newsk,
is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET",
&mptcp_slock_keys[is_ipv6],
is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET",
&mptcp_keys[is_ipv6]);
lock_sock(newsk);
ssk = __mptcp_nmpc_sk(mptcp_sk(newsk));
release_sock(newsk);
if (IS_ERR(ssk))
return PTR_ERR(ssk);
mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
if (entry->addr.family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
if (ssk->sk_family == AF_INET)
err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else if (ssk->sk_family == AF_INET6)
err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen);
#endif
if (err)
return err;
/* We don't use mptcp_set_state() here because it needs to be called
* under the msk socket lock. For the moment, that will not bring
* anything more than only calling inet_sk_state_store(), because the
* old status is known (TCP_CLOSE).
*/
inet_sk_state_store(newsk, TCP_LISTEN);
lock_sock(ssk);
WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true);
err = __inet_listen_sk(ssk, backlog);
if (!err)
mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED);
release_sock(ssk);
return err;
}
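/* Return the address ID of the endpoint matching the given local
 * address, adding a new entry to the pernet list if it is not known yet.
 */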
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk,
struct mptcp_pm_addr_entry *skc)
{
struct mptcp_pm_addr_entry *entry;
struct pm_nl_pernet *pernet;
int ret;
pernet = pm_nl_get_pernet_from_msk(msk);
rcu_read_lock();
entry = __lookup_addr(pernet, &skc->addr);
ret = entry ? entry->addr.id : -1;
rcu_read_unlock();
if (ret >= 0)
return ret;
/* address not found, add to local list */
entry = kmemdup(skc, sizeof(*skc), GFP_ATOMIC);
if (!entry)
return -ENOMEM;
entry->addr.port = 0;
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true, false);
if (ret < 0)
kfree(entry);
return ret;
}
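/* Tell whether the given local address belongs to an endpoint flagged as
 * 'backup'.
 */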
bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
struct mptcp_pm_addr_entry *entry;
bool backup;
rcu_read_lock();
entry = __lookup_addr(pernet, skc);
backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
rcu_read_unlock();
return backup;
}
static int mptcp_nl_add_subflow_or_signal_addr(struct net *net,
struct mptcp_addr_info *addr)
{
struct mptcp_sock *msk;
long s_slot = 0, s_num = 0;
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
struct mptcp_addr_info mpc_addr;
if (!READ_ONCE(msk->fully_established) ||
mptcp_pm_is_userspace(msk))
goto next;
/* if the endp linked to the init sf is re-added with a != ID */
mptcp_local_address((struct sock_common *)msk, &mpc_addr);
lock_sock(sk);
spin_lock_bh(&msk->pm.lock);
if (mptcp_addresses_equal(addr, &mpc_addr, addr->port))
msk->mpc_endpoint_id = addr->id;
mptcp_pm_create_subflow_or_signal_addr(msk);
spin_unlock_bh(&msk->pm.lock);
release_sock(sk);
next:
sock_put(sk);
cond_resched();
}
return 0;
}
static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr,
struct genl_info *info)
{
struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1];
if (!nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr,
mptcp_pm_address_nl_policy, info->extack) &&
tb[MPTCP_PM_ADDR_ATTR_ID])
return true;
return false;
}
/* Add an MPTCP endpoint */
int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info)
{
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
struct mptcp_pm_addr_entry addr, *entry;
struct nlattr *attr;
int ret;
if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR))
return -EINVAL;
attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
ret = mptcp_pm_parse_entry(attr, info, true, &addr);
if (ret < 0)
return ret;
if (addr.addr.port && !address_use_port(&addr)) {
NL_SET_ERR_MSG_ATTR(info->extack, attr,
"flags must have signal and not subflow when using port");
return -EINVAL;
}
if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL &&
addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) {
NL_SET_ERR_MSG_ATTR(info->extack, attr,
"flags mustn't have both signal and fullmesh");
return -EINVAL;
}
if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) {
NL_SET_ERR_MSG_ATTR(info->extack, attr,
"can't create IMPLICIT endpoint");
return -EINVAL;
}
entry = kmemdup(&addr, sizeof(addr), GFP_KERNEL_ACCOUNT);
if (!entry) {
GENL_SET_ERR_MSG(info, "can't allocate addr");
return -ENOMEM;
}
if (entry->addr.port) {
ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry);
if (ret) {
GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret);
goto out_free;
}
}
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry,
!mptcp_pm_has_addr_attr_id(attr, info),
true);
if (ret < 0) {
GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret);
goto out_free;
}
mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk), &entry->addr);
return 0;
out_free:
__mptcp_pm_release_addr_entry(entry);
return ret;
}
static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr,
bool force)
{
struct mptcp_rm_list list = { .nr = 0 };
bool ret;
list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr);
ret = mptcp_remove_anno_list_by_saddr(msk, addr);
if (ret || force) {
spin_lock_bh(&msk->pm.lock);
if (ret) {
__set_bit(addr->id, msk->pm.id_avail_bitmap);
msk->pm.add_addr_signaled--;
}
mptcp_pm_remove_addr(msk, &list);
spin_unlock_bh(&msk->pm.lock);
}
return ret;
}
static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id)
{
/* If it was marked as used, and not ID 0, decrement local_addr_used */
if (!__test_and_set_bit(id ? : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap) &&
id && !WARN_ON_ONCE(msk->pm.local_addr_used == 0))
msk->pm.local_addr_used--;
}
static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
const struct mptcp_pm_addr_entry *entry)
{
const struct mptcp_addr_info *addr = &entry->addr;
struct mptcp_rm_list list = { .nr = 1 };
long s_slot = 0, s_num = 0;
struct mptcp_sock *msk;
pr_debug("remove_id=%d\n", addr->id);
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
bool remove_subflow;
if (mptcp_pm_is_userspace(msk))
goto next;
lock_sock(sk);
remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr);
mptcp_pm_remove_anno_addr(msk, addr, remove_subflow &&
!(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT));
list.ids[0] = mptcp_endp_get_local_id(msk, addr);
if (remove_subflow) {
spin_lock_bh(&msk->pm.lock);
mptcp_pm_rm_subflow(msk, &list);
spin_unlock_bh(&msk->pm.lock);
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
spin_lock_bh(&msk->pm.lock);
__mark_subflow_endp_available(msk, list.ids[0]);
spin_unlock_bh(&msk->pm.lock);
}
if (msk->mpc_endpoint_id == entry->addr.id)
msk->mpc_endpoint_id = 0;
release_sock(sk);
next:
sock_put(sk);
cond_resched();
}
return 0;
}
static int mptcp_nl_remove_id_zero_address(struct net *net,
struct mptcp_addr_info *addr)
{
struct mptcp_rm_list list = { .nr = 0 };
long s_slot = 0, s_num = 0;
struct mptcp_sock *msk;
list.ids[list.nr++] = 0;
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
struct mptcp_addr_info msk_local;
if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
goto next;
mptcp_local_address((struct sock_common *)msk, &msk_local);
if (!mptcp_addresses_equal(&msk_local, addr, addr->port))
goto next;
lock_sock(sk);
spin_lock_bh(&msk->pm.lock);
mptcp_pm_remove_addr(msk, &list);
mptcp_pm_rm_subflow(msk, &list);
__mark_subflow_endp_available(msk, 0);
spin_unlock_bh(&msk->pm.lock);
release_sock(sk);
next:
sock_put(sk);
cond_resched();
}
return 0;
}
/* Remove an MPTCP endpoint */
int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info)
{
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
struct mptcp_pm_addr_entry addr, *entry;
struct nlattr *attr;
u8 addr_max;
int ret;
if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR))
return -EINVAL;
attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
ret = mptcp_pm_parse_entry(attr, info, false, &addr);
if (ret < 0)
return ret;
/* the zero id address is special: the first address used by the msk
* always gets such an id, so different subflows can have different zero
* id addresses. Additionally zero id is not accounted for in id_bitmap.
* Let's use an 'mptcp_rm_list' instead of the common remove code.
*/
if (addr.addr.id == 0)
return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr);
spin_lock_bh(&pernet->lock);
entry = __lookup_addr_by_id(pernet, addr.addr.id);
if (!entry) {
NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found");
spin_unlock_bh(&pernet->lock);
return -EINVAL;
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
addr_max = pernet->endp_signal_max;
WRITE_ONCE(pernet->endp_signal_max, addr_max - 1);
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
addr_max = pernet->endp_subflow_max;
WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1);
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) {
addr_max = pernet->endp_laminar_max;
WRITE_ONCE(pernet->endp_laminar_max, addr_max - 1);
}
pernet->endpoints--;
list_del_rcu(&entry->list);
__clear_bit(entry->addr.id, pernet->id_bitmap);
spin_unlock_bh(&pernet->lock);
mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry);
synchronize_rcu();
__mptcp_pm_release_addr_entry(entry);
return ret;
}
static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk,
struct list_head *rm_list)
{
struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 };
struct mptcp_pm_addr_entry *entry;
list_for_each_entry(entry, rm_list, list) {
if (slist.nr < MPTCP_RM_IDS_MAX &&
mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr))
slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr);
if (alist.nr < MPTCP_RM_IDS_MAX &&
mptcp_remove_anno_list_by_saddr(msk, &entry->addr))
alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr);
}
spin_lock_bh(&msk->pm.lock);
if (alist.nr) {
msk->pm.add_addr_signaled -= alist.nr;
mptcp_pm_remove_addr(msk, &alist);
}
if (slist.nr)
mptcp_pm_rm_subflow(msk, &slist);
/* Reset counters: maybe some subflows have been removed before */
bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
msk->pm.local_addr_used = 0;
spin_unlock_bh(&msk->pm.lock);
}
static void mptcp_nl_flush_addrs_list(struct net *net,
struct list_head *rm_list)
{
long s_slot = 0, s_num = 0;
struct mptcp_sock *msk;
if (list_empty(rm_list))
return;
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
if (!mptcp_pm_is_userspace(msk)) {
lock_sock(sk);
mptcp_pm_flush_addrs_and_subflows(msk, rm_list);
release_sock(sk);
}
sock_put(sk);
cond_resched();
}
}
/* caller must ensure the RCU grace period is already elapsed */
static void __flush_addrs(struct list_head *list)
{
while (!list_empty(list)) {
struct mptcp_pm_addr_entry *cur;
cur = list_entry(list->next,
struct mptcp_pm_addr_entry, list);
list_del_rcu(&cur->list);
__mptcp_pm_release_addr_entry(cur);
}
}
static void __reset_counters(struct pm_nl_pernet *pernet)
{
WRITE_ONCE(pernet->endp_signal_max, 0);
WRITE_ONCE(pernet->endp_subflow_max, 0);
WRITE_ONCE(pernet->endp_laminar_max, 0);
pernet->endpoints = 0;
}
int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info)
{
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
LIST_HEAD(free_list);
spin_lock_bh(&pernet->lock);
list_splice_init(&pernet->endp_list, &free_list);
__reset_counters(pernet);
pernet->next_id = 1;
bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
spin_unlock_bh(&pernet->lock);
mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list);
synchronize_rcu();
__flush_addrs(&free_list);
return 0;
}
int mptcp_pm_nl_get_addr(u8 id, struct mptcp_pm_addr_entry *addr,
struct genl_info *info)
{
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
struct mptcp_pm_addr_entry *entry;
int ret = -EINVAL;
rcu_read_lock();
entry = __lookup_addr_by_id(pernet, id);
if (entry) {
*addr = *entry;
ret = 0;
}
rcu_read_unlock();
return ret;
}
int mptcp_pm_nl_dump_addr(struct sk_buff *msg,
struct netlink_callback *cb)
{
struct net *net = sock_net(msg->sk);
struct mptcp_pm_addr_entry *entry;
struct pm_nl_pernet *pernet;
int id = cb->args[0];
int i;
pernet = pm_nl_get_pernet(net);
rcu_read_lock();
for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) {
if (test_bit(i, pernet->id_bitmap)) {
entry = __lookup_addr_by_id(pernet, i);
if (!entry)
break;
if (entry->addr.id <= id)
continue;
if (mptcp_pm_genl_fill_addr(msg, cb, entry) < 0)
break;
id = entry->addr.id;
}
}
rcu_read_unlock();
cb->args[0] = id;
return msg->len;
}
static int parse_limit(struct genl_info *info, int id, unsigned int *limit)
{
struct nlattr *attr = info->attrs[id];
if (!attr)
return 0;
*limit = nla_get_u32(attr);
if (*limit > MPTCP_PM_ADDR_MAX) {
NL_SET_ERR_MSG_ATTR_FMT(info->extack, attr,
"limit greater than maximum (%u)",
MPTCP_PM_ADDR_MAX);
return -EINVAL;
}
return 0;
}
int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info)
{
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
unsigned int rcv_addrs, subflows;
int ret;
spin_lock_bh(&pernet->lock);
rcv_addrs = pernet->limit_add_addr_accepted;
ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs);
if (ret)
goto unlock;
subflows = pernet->limit_extra_subflows;
ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows);
if (ret)
goto unlock;
WRITE_ONCE(pernet->limit_add_addr_accepted, rcv_addrs);
WRITE_ONCE(pernet->limit_extra_subflows, subflows);
unlock:
spin_unlock_bh(&pernet->lock);
return ret;
}
int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info)
{
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
struct sk_buff *msg;
void *reply;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
MPTCP_PM_CMD_GET_LIMITS);
if (!reply)
goto fail;
if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS,
READ_ONCE(pernet->limit_add_addr_accepted)))
goto fail;
if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS,
READ_ONCE(pernet->limit_extra_subflows)))
goto fail;
genlmsg_end(msg, reply);
return genlmsg_reply(msg, info);
fail:
GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
nlmsg_free(msg);
return -EMSGSIZE;
}
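/* The 'fullmesh' flag of an endpoint changed: remove the subflows linked
 * to it, then re-create them according to the new flags.
 */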
static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk,
struct mptcp_addr_info *addr)
{
struct mptcp_rm_list list = { .nr = 0 };
list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr);
spin_lock_bh(&msk->pm.lock);
mptcp_pm_rm_subflow(msk, &list);
__mark_subflow_endp_available(msk, list.ids[0]);
mptcp_pm_create_subflow_or_signal_addr(msk);
spin_unlock_bh(&msk->pm.lock);
}
static void mptcp_pm_nl_set_flags_all(struct net *net,
struct mptcp_pm_addr_entry *local,
u8 changed)
{
u8 is_subflow = !!(local->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW);
u8 bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
long s_slot = 0, s_num = 0;
struct mptcp_sock *msk;
if (changed == MPTCP_PM_ADDR_FLAG_FULLMESH && !is_subflow)
return;
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
goto next;
lock_sock(sk);
if (changed & MPTCP_PM_ADDR_FLAG_BACKUP)
mptcp_pm_mp_prio_send_ack(msk, &local->addr, NULL, bkup);
/* Subflows will only be recreated if the SUBFLOW flag is set */
if (is_subflow && (changed & MPTCP_PM_ADDR_FLAG_FULLMESH))
mptcp_pm_nl_fullmesh(msk, &local->addr);
release_sock(sk);
next:
sock_put(sk);
cond_resched();
}
}
int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local,
struct genl_info *info)
{
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP |
MPTCP_PM_ADDR_FLAG_FULLMESH;
struct net *net = genl_info_net(info);
struct mptcp_pm_addr_entry *entry;
struct pm_nl_pernet *pernet;
u8 lookup_by_id = 0;
pernet = pm_nl_get_pernet(net);
if (local->addr.family == AF_UNSPEC) {
lookup_by_id = 1;
if (!local->addr.id) {
NL_SET_ERR_MSG_ATTR(info->extack, attr,
"missing address ID");
return -EOPNOTSUPP;
}
}
spin_lock_bh(&pernet->lock);
entry = lookup_by_id ? __lookup_addr_by_id(pernet, local->addr.id) :
__lookup_addr(pernet, &local->addr);
if (!entry) {
spin_unlock_bh(&pernet->lock);
NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found");
return -EINVAL;
}
if ((local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) &&
(entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL |
MPTCP_PM_ADDR_FLAG_IMPLICIT))) {
spin_unlock_bh(&pernet->lock);
NL_SET_ERR_MSG_ATTR(info->extack, attr, "invalid addr flags");
return -EINVAL;
}
changed = (local->flags ^ entry->flags) & mask;
entry->flags = (entry->flags & ~mask) | (local->flags & mask);
*local = *entry;
spin_unlock_bh(&pernet->lock);
mptcp_pm_nl_set_flags_all(net, local, changed);
return 0;
}
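/* Stop scheduling PM work once the extra subflows limit has been reached
 * or no endpoint ID is available any more.
 */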
bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
if (msk->pm.extra_subflows == mptcp_pm_get_limit_extra_subflows(msk) ||
(find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap,
MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) {
WRITE_ONCE(msk->pm.work_pending, false);
return false;
}
return true;
}
/* Called under PM lock */
void __mptcp_pm_kernel_worker(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
mptcp_pm_nl_add_addr_received(msk);
}
if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
mptcp_pm_nl_fully_established(msk);
}
if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
mptcp_pm_nl_subflow_established(msk);
}
}
static int __net_init pm_nl_init_net(struct net *net)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
INIT_LIST_HEAD_RCU(&pernet->endp_list);
/* Cit. 2 subflows ought to be enough for anybody. */
pernet->limit_extra_subflows = 2;
pernet->next_id = 1;
spin_lock_init(&pernet->lock);
/* No need to initialize other pernet fields, the struct is zeroed at
* allocation time.
*/
return 0;
}
static void __net_exit pm_nl_exit_net(struct list_head *net_list)
{
struct net *net;
list_for_each_entry(net, net_list, exit_list) {
struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
/* net is removed from namespace list, can't race with
* other modifiers, also netns core already waited for a
* RCU grace period.
*/
__flush_addrs(&pernet->endp_list);
}
}
static struct pernet_operations mptcp_pm_pernet_ops = {
.init = pm_nl_init_net,
.exit_batch = pm_nl_exit_net,
.id = &pm_nl_pernet_id,
.size = sizeof(struct pm_nl_pernet),
};
struct mptcp_pm_ops mptcp_pm_kernel = {
.name = "kernel",
.owner = THIS_MODULE,
};
void __init mptcp_pm_kernel_register(void)
{
if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0)
panic("Failed to register MPTCP PM pernet subsystem.\n");
mptcp_pm_register(&mptcp_pm_kernel);
}