Files
linux/fs/pnode.c
Al Viro f0d0ba1998 Rewrite of propagate_umount()
The variant currently in the tree has problems; trying to prove
correctness has caught at least one class of bugs (reparenting
that ends up moving the visible location of reparented mount, due
to not excluding some of the counterparts on propagation that
should've been included).

I tried to prove that it's the only bug there; I'm still not sure
whether it is.  If anyone can reconstruct and write down an analysis
of the mainline implementation, I'll gladly review it; as it is,
I ended up doing a different implementation.  Candidate collection
phase is similar, but trimming the set down until it satisfies the
constraints turned out pretty different.

I hoped to do transformation as a massage series, but that turns out
to be too convoluted.  So it's a single patch replacing propagate_umount()
and friends in one go, with notes and analysis in D/f/propagate_umount.txt
(in addition to inline comments).

As far as I can tell, it is provably correct and provably linear in the number
of mounts we need to look at in order to decide what should be unmounted.
It even builds and seems to survive testing...

Another nice thing that fell out of that is that ->mnt_umounting is no longer
needed.

Compared to the first version:
	* explicit MNT_UMOUNT_CANDIDATE flag for is_candidate()
	* trim_ancestors() only clears that flag, leaving the suckers on list
	* trim_one() and handle_locked() take the stuff with flag cleared off
the list.  That allows to iterate with list_for_each_entry_safe() when calling
trim_one() - it removes at most one element from the list now.
	* no globals - I didn't bother with any kind of context, not worth it.

	* Notes updated accordingly; I have not touched the terms yet.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-06-29 18:13:41 -04:00

673 lines
17 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/fs/pnode.c
*
* (C) Copyright IBM Corporation 2005.
* Author : Ram Pai (linuxram@us.ibm.com)
*/
#include <linux/mnt_namespace.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include <linux/nsproxy.h>
#include <uapi/linux/mount.h>
#include "internal.h"
#include "pnode.h"
/* Return the mount that follows @p on its ->mnt_share (peer) list. */
static inline struct mount *next_peer(struct mount *p)
{
	struct list_head *link = p->mnt_share.next;

	return list_entry(link, struct mount, mnt_share);
}
/*
 * Return the mount at the head of @p's ->mnt_slave_list.  Entries of that
 * list are linked through their ->mnt_slave member.
 */
static inline struct mount *first_slave(struct mount *p)
{
	struct list_head *head_entry = p->mnt_slave_list.next;

	return list_entry(head_entry, struct mount, mnt_slave);
}
/* Return the mount that follows @p on the ->mnt_slave list it is linked into. */
static inline struct mount *next_slave(struct mount *p)
{
	struct list_head *link = p->mnt_slave.next;

	return list_entry(link, struct mount, mnt_slave);
}
/*
 * Walk the peers of @mnt, starting with @mnt itself, and return the first
 * one that belongs to @ns and for which is_path_reachable() succeeds
 * against @root.  Returns NULL once the walk gets back to @mnt without
 * finding a match.
 */
static struct mount *get_peer_under_root(struct mount *mnt,
					 struct mnt_namespace *ns,
					 const struct path *root)
{
	struct mount *cur = mnt;

	for (;;) {
		/* cheap namespace comparison first, reachability check second */
		if (cur->mnt_ns == ns &&
		    is_path_reachable(cur, cur->mnt.mnt_root, root))
			return cur;

		cur = next_peer(cur);
		if (cur == mnt)
			return NULL;
	}
}
/*
 * Get ID of closest dominating peer group having a representative
 * under the given root.
 *
 * The chain of masters of @mnt is followed upwards; the group id of the
 * first master whose peer group has a member accepted by
 * get_peer_under_root() is returned.  Returns 0 if no master qualifies.
 *
 * Caller must hold namespace_sem
 */
int get_dominating_id(struct mount *mnt, const struct path *root)
{
	struct mount *master = mnt->mnt_master;

	while (master != NULL) {
		struct mount *peer = get_peer_under_root(master, mnt->mnt_ns,
							 root);

		if (peer)
			return peer->mnt_group_id;
		master = master->mnt_master;
	}
	return 0;
}
/*
 * Turn @mnt into a slave: take it out of its peer group (if any) and hang
 * it, together with its own slaves, off a master.
 *
 *  - @mnt has peers: one of them becomes the master, preferably one with
 *    the same root dentry.
 *  - @mnt has no peers but has a master: @mnt stays a slave of that master
 *    and its slaves are handed over to the master as well.
 *  - @mnt has neither peers nor a master: its slaves are detached and left
 *    without a master; nothing else is done.
 *
 * Always returns 0.
 */
static int do_make_slave(struct mount *mnt)
{
struct mount *master, *slave_mnt;
if (list_empty(&mnt->mnt_share)) {
/* no peers: drop the shared state (and the group id) if it was set */
if (IS_MNT_SHARED(mnt)) {
mnt_release_group_id(mnt);
CLEAR_MNT_SHARED(mnt);
}
master = mnt->mnt_master;
if (!master) {
/* no master either: unlink every slave of mnt and stop here */
struct list_head *p = &mnt->mnt_slave_list;
while (!list_empty(p)) {
slave_mnt = list_first_entry(p,
struct mount, mnt_slave);
list_del_init(&slave_mnt->mnt_slave);
slave_mnt->mnt_master = NULL;
}
return 0;
}
} else {
struct mount *m;
/*
 * slave 'mnt' to a peer mount that has the
 * same root dentry. If none is available then
 * slave it to anything that is available.
 */
/* master defaults to the next peer; the loop only overrides it on a root match */
for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) {
if (m->mnt.mnt_root == mnt->mnt.mnt_root) {
master = m;
break;
}
}
/* leave the peer group */
list_del_init(&mnt->mnt_share);
mnt->mnt_group_id = 0;
CLEAR_MNT_SHARED(mnt);
}
/* re-point all of mnt's slaves at the chosen master */
list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
slave_mnt->mnt_master = master;
/* mnt itself goes to the front of master's slave list ... */
list_move(&mnt->mnt_slave, &master->mnt_slave_list);
/* ... and its former slaves are appended after the current last entry */
list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
/* list_splice() leaves the source head stale, so reset it */
INIT_LIST_HEAD(&mnt->mnt_slave_list);
mnt->mnt_master = master;
return 0;
}
/*
 * Switch @mnt to the propagation type @type.
 *
 * MS_SHARED only marks the mount shared.  Every other type first goes
 * through do_make_slave(); MS_SLAVE stops there, while the remaining types
 * also cut the link to the master.  MNT_UNBINDABLE is set for
 * MS_UNBINDABLE and cleared for any other non-shared, non-slave type.
 *
 * vfsmount lock must be held for write
 */
void change_mnt_propagation(struct mount *mnt, int type)
{
	if (type == MS_SHARED) {
		set_mnt_shared(mnt);
		return;
	}

	do_make_slave(mnt);
	if (type == MS_SLAVE)
		return;

	/* not a slave either: detach from the master */
	list_del_init(&mnt->mnt_slave);
	mnt->mnt_master = NULL;

	if (type == MS_UNBINDABLE)
		mnt->mnt.mnt_flags |= MNT_UNBINDABLE;
	else
		mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE;
}
/*
 * Step of the propagation walk started at @origin that does NOT descend
 * into the slaves of @m: move on to the next sibling on the slave list,
 * climbing to the master whenever a slave list is exhausted.  Once the
 * walk is back at the level of @origin (same master as @origin) it
 * continues with the next peer, and returns NULL when that peer is
 * @origin itself.
 */
static struct mount *__propagation_next(struct mount *m,
					struct mount *origin)
{
	for (;;) {
		struct mount *master = m->mnt_master;

		if (master == origin->mnt_master) {
			struct mount *peer = next_peer(m);

			if (peer == origin)
				return NULL;
			return peer;
		}

		if (m->mnt_slave.next != &master->mnt_slave_list)
			return next_slave(m);

		/* back at master */
		m = master;
	}
}
/*
 * get the next mount in the propagation tree.
 * @m: the mount seen last
 * @origin: the original mount from where the tree walk initiated
 *
 * If @m is not IS_MNT_NEW and has slaves, the walk descends to its first
 * slave; otherwise it continues via __propagation_next().
 *
 * Note that peer groups form contiguous segments of slave lists.
 * We rely on that in get_source() to be able to find out if
 * vfsmount found while iterating with propagation_next() is
 * a peer of one we'd found earlier.
 */
static struct mount *propagation_next(struct mount *m,
				      struct mount *origin)
{
	/* nothing to descend into? */
	if (IS_MNT_NEW(m) || list_empty(&m->mnt_slave_list))
		return __propagation_next(m, origin);

	return first_slave(m);
}
/*
 * Advance m past everything that gets propagation from it: step with
 * __propagation_next() (which never descends into slaves) and keep going
 * for as long as the result is still a peer of @m.  Returns the first
 * mount that is not a peer of @m, or NULL if the walk ends first.
 */
static struct mount *skip_propagation_subtree(struct mount *m,
					      struct mount *origin)
{
	struct mount *cur = __propagation_next(m, origin);

	for (;;) {
		if (!cur || !peers(m, cur))
			return cur;
		cur = __propagation_next(cur, origin);
	}
}
/*
 * Advance the propagation walk rooted at @origin from @m and return the
 * mount at which the next group to visit starts, or NULL when the walk is
 * over.  propagate_mnt() treats the returned mount as the entry point of a
 * slave group and goes through that mount's peers on its own.
 */
static struct mount *next_group(struct mount *m, struct mount *origin)
{
while (1) {
/*
 * First go through the peers of m: as soon as one of them (that is
 * not IS_MNT_NEW) has slaves of its own, its first slave is the answer.
 */
while (1) {
struct mount *next;
if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
return first_slave(m);
next = next_peer(m);
if (m->mnt_group_id == origin->mnt_group_id) {
/* within origin's own group: coming back to origin ends the walk */
if (next == origin)
return NULL;
} else if (m->mnt_slave.next != &next->mnt_slave)
/* the next peer is not m's neighbour on the slave list */
break;
m = next;
}
/* m is the last peer */
/*
 * Then climb: take the next entry on the master's slave list if
 * there is one, otherwise continue from the master.
 */
while (1) {
struct mount *master = m->mnt_master;
if (m->mnt_slave.next != &master->mnt_slave_list)
return next_slave(m);
m = next_peer(master);
/* reached origin's group: resume the peer scan from master's next peer */
if (master->mnt_group_id == origin->mnt_group_id)
break;
/* master's next peer follows it on the slave list: resume the peer scan there */
if (master->mnt_slave.next == &m->mnt_slave)
break;
m = master;
}
if (m == origin)
return NULL;
}
}
/*
 * State shared between propagate_mnt() and propagate_one();
 * all accesses are serialized by namespace_sem
 *
 * last_dest:    starts as dest_mnt, then the mount propagate_one() handled last
 * first_source: the source_mnt passed to propagate_mnt()
 * last_source:  starts as source_mnt, then the copy propagate_one() made last
 * dest_master:  dest_mnt->mnt_master
 * list:         the tree_list passed to propagate_mnt(); new copies are added to it
 */
static struct mount *last_dest, *first_source, *last_source, *dest_master;
static struct hlist_head *list;
/*
 * Propagate the mount being attached to a single recipient @m: make a copy
 * of the current source tree and set it up with @m as parent at @dest_mp.
 *
 * Returns 0 if @m is skipped, the error from copy_tree() if the copy
 * cannot be made, and otherwise the result of count_mounts() for the copy.
 * Reads and updates the file-scope state initialised by propagate_mnt()
 * (last_dest, last_source, list, ...).
 */
static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
{
struct mount *child;
int type;
/* skip ones added by this propagate_mnt() */
if (IS_MNT_NEW(m))
return 0;
/* skip if mountpoint isn't visible in m */
if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root))
return 0;
/* skip if m is in the anon_ns */
if (is_anon_ns(m->mnt_ns))
return 0;
if (peers(m, last_dest)) {
/* same peer group as the previous destination: copy as a shared peer */
type = CL_MAKE_SHARED;
} else {
struct mount *n, *p;
bool done;
/*
 * Climb m's chain of masters until the master (p) is either
 * dest_master or carries the mark; n is the mount right below p.
 */
for (n = m; ; n = p) {
p = n->mnt_master;
if (p == dest_master || IS_MNT_MARKED(p))
break;
}
/*
 * Pick the source to copy from by moving last_source up its own
 * chain of masters.  last_source is kept as is when it is a peer
 * of first_source, or when its parent has p as master and is a
 * peer of n; the loop also ends after the step taken once the
 * parent's master is p.
 */
do {
struct mount *parent = last_source->mnt_parent;
if (peers(last_source, first_source))
break;
done = parent->mnt_master == p;
if (done && peers(n, parent))
break;
last_source = last_source->mnt_master;
} while (!done);
type = CL_SLAVE;
/* beginning of peer group among the slaves? */
if (IS_MNT_SHARED(m))
type |= CL_MAKE_SHARED;
}
child = copy_tree(last_source, last_source->mnt.mnt_root, type);
if (IS_ERR(child))
return PTR_ERR(child);
read_seqlock_excl(&mount_lock);
mnt_set_mountpoint(m, dest_mp, child);
/* mark m's master; propagate_mnt() clears these marks before returning */
if (m->mnt_master != dest_master)
SET_MNT_MARK(m->mnt_master);
read_sequnlock_excl(&mount_lock);
/* the next call continues from this destination and this copy */
last_dest = m;
last_source = child;
hlist_add_head(&child->mnt_hash, list);
return count_mounts(m->mnt_ns, child);
}
/*
 * mount 'source_mnt' under the destination 'dest_mnt' at the mountpoint
 * 'dest_mp'.  And propagate that mount to all the peer and slave mounts
 * of 'dest_mnt'.
 * Link all the new mounts into a propagation tree headed at
 * source_mnt. Also link all the new mounts using ->mnt_list
 * headed at source_mnt's ->mnt_list
 *
 * @dest_mnt: destination mount.
 * @dest_mp: destination mountpoint.
 * @source_mnt: source mount.
 * @tree_list : list of heads of trees to be attached.
 *
 * Returns 0 on success or the first non-zero value returned by
 * propagate_one(); propagation stops at that point.  Marks set on masters
 * by propagate_one() are cleared before returning in either case.
 */
int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
		  struct mount *source_mnt, struct hlist_head *tree_list)
{
	struct mount *group, *peer;
	int ret = 0;

	/*
	 * we don't want to bother passing tons of arguments to
	 * propagate_one(); everything is serialized by namespace_sem,
	 * so globals will do just fine.
	 */
	last_dest = dest_mnt;
	first_source = source_mnt;
	last_source = source_mnt;
	list = tree_list;
	dest_master = dest_mnt->mnt_master;

	/* all peers of dest_mnt, except dest_mnt itself */
	peer = next_peer(dest_mnt);
	while (peer != dest_mnt) {
		ret = propagate_one(peer, dest_mp);
		if (ret)
			goto out;
		peer = next_peer(peer);
	}

	/* all slave groups */
	group = next_group(dest_mnt, dest_mnt);
	while (group) {
		/* everything in that slave group */
		peer = group;
		do {
			ret = propagate_one(peer, dest_mp);
			if (ret)
				goto out;
			peer = next_peer(peer);
		} while (peer != group);

		group = next_group(group, dest_mnt);
	}
out:
	/* undo the marks propagate_one() left on the masters of destinations */
	read_seqlock_excl(&mount_lock);
	hlist_for_each_entry(peer, tree_list, mnt_hash) {
		struct mount *parent = peer->mnt_parent;

		if (parent->mnt_master != dest_mnt->mnt_master)
			CLEAR_MNT_MARK(parent->mnt_master);
	}
	read_sequnlock_excl(&mount_lock);
	return ret;
}
/*
 * If there is exactly one mount covering @mnt completely return it:
 * @mnt must have a single child and that child must be mounted on
 * @mnt's root dentry.  Returns NULL otherwise.
 */
static struct mount *find_topper(struct mount *mnt)
{
	if (list_is_singular(&mnt->mnt_mounts)) {
		struct mount *only = list_first_entry(&mnt->mnt_mounts,
						      struct mount, mnt_child);

		if (only->mnt_mountpoint == mnt->mnt.mnt_root)
			return only;
	}
	return NULL;
}
/*
 * Return 1 if the refcount of @mnt is greater than @count, 0 otherwise.
 */
static inline int do_refcount_check(struct mount *mnt, int count)
{
	if (mnt_get_count(mnt) > count)
		return 1;
	return 0;
}
/**
 * propagation_would_overmount - check whether propagation from @from
 *                               would overmount @to
 * @from: shared mount
 * @to:   mount to check
 * @mp:   future mountpoint of @to on @from
 *
 * If @from propagates mounts to @to, @from and @to must either be peers
 * or one of the masters in the hierarchy of masters of @to must be a
 * peer of @from.
 *
 * If the root of the @to mount is equal to the future mountpoint @mp of
 * the @to mount on @from then @to will be overmounted by whatever is
 * propagated to it.
 *
 * Context: This function expects namespace_lock() to be held and that
 *          @mp is stable.
 * Return: If @from overmounts @to, true is returned, false if not.
 */
bool propagation_would_overmount(const struct mount *from,
				 const struct mount *to,
				 const struct mountpoint *mp)
{
	const struct mount *cur = to;

	/* @from must be shared and @to's root must be the mountpoint dentry */
	if (!IS_MNT_SHARED(from) || to->mnt.mnt_root != mp->m_dentry)
		return false;

	/* @to itself, then each of its masters in turn */
	while (cur) {
		if (peers(from, cur))
			return true;
		cur = cur->mnt_master;
	}
	return false;
}
/*
 * check if the mount 'mnt' can be unmounted successfully.
 * @mnt: the mount to be checked for unmount
 * @refcnt: the number of references 'mnt' is allowed to have
 * NOTE: unmounting 'mnt' would naturally propagate to all
 * other mounts its parent propagates to.
 * Check if any of these mounts that **do not have submounts**
 * have more references than expected. If so return busy.
 *
 * Returns non-zero if busy, 0 otherwise.
 *
 * vfsmount lock must be held for write
 */
int propagate_mount_busy(struct mount *mnt, int refcnt)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m;

	/* mnt is its own parent: only its own refcount is looked at */
	if (mnt == parent)
		return do_refcount_check(mnt, refcnt);

	/*
	 * quickly check if the current mount can be unmounted.
	 * If not, we don't have to go checking for all other
	 * mounts
	 */
	if (!list_empty(&mnt->mnt_mounts))
		return 1;
	if (do_refcount_check(mnt, refcnt))
		return 1;

	for (m = propagation_next(parent, parent); m;
	     m = propagation_next(m, parent)) {
		struct mount *child = __lookup_mnt(&m->mnt,
						   mnt->mnt_mountpoint);
		int allowed;

		if (!child)
			continue;

		/*
		 * Is there exactly one mount on the child that covers
		 * it completely whose reference should be ignored?
		 */
		if (find_topper(child))
			allowed = 2;
		else if (list_empty(&child->mnt_mounts))
			allowed = 1;
		else
			continue;	/* has other submounts: not checked */

		if (do_refcount_check(child, allowed))
			return 1;
	}
	return 0;
}
/*
 * Clear MNT_LOCKED when it can be shown to be safe.
 *
 * For every mount that propagation_next() yields for the parent of @mnt,
 * the mount found by __lookup_mnt() at @mnt's mountpoint (if any) gets
 * its MNT_LOCKED flag cleared.  @mnt must not be its own parent.
 *
 * mount_lock lock must be held for write
 */
void propagate_mount_unlock(struct mount *mnt)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m;

	BUG_ON(parent == mnt);

	m = propagation_next(parent, parent);
	while (m) {
		struct mount *child = __lookup_mnt(&m->mnt,
						   mnt->mnt_mountpoint);

		if (child)
			child->mnt.mnt_flags &= ~MNT_LOCKED;
		m = propagation_next(m, parent);
	}
}
/* Is @m currently flagged MNT_UMOUNT_CANDIDATE? */
static inline bool is_candidate(struct mount *m)
{
	return (m->mnt.mnt_flags & MNT_UMOUNT_CANDIDATE) != 0;
}
/* Is @m flagged MNT_UMOUNT, i.e. already committed to being unmounted? */
static inline bool will_be_unmounted(struct mount *m)
{
	return (m->mnt.mnt_flags & MNT_UMOUNT) != 0;
}
/*
 * Commit to unmounting @m: flag it MNT_UMOUNT, unlink its ->mnt_child
 * entry and hand it to move_from_ns() together with @to_umount.
 */
static void umount_one(struct mount *m, struct list_head *to_umount)
{
	struct list_head *child_link = &m->mnt_child;

	m->mnt.mnt_flags |= MNT_UMOUNT;
	list_del_init(child_link);
	move_from_ns(m, to_umount);
}
/*
 * Take @m off the list it is on via ->mnt_list and clear both the
 * MNT_MARKED and MNT_UMOUNT_CANDIDATE flags on it.
 */
static void remove_from_candidate_list(struct mount *m)
{
	struct list_head *link = &m->mnt_list;

	m->mnt.mnt_flags &= ~(MNT_MARKED | MNT_UMOUNT_CANDIDATE);
	list_del_init(link);
}
/*
 * Build the list of umount candidates for propagate_umount().
 *
 * For every mount m on @set, go through the mounts q that the propagation
 * walk yields for m's parent and look up (with __lookup_mnt()) a child of
 * q at m's mountpoint.  Each child found is flagged MNT_UMOUNT_CANDIDATE
 * and, unless it is already flagged MNT_UMOUNT, added to @candidates.
 *
 * Members of @set carry the candidate flag only while the scan is running;
 * it is cleared on all of them before returning.
 */
static void gather_candidates(struct list_head *set,
struct list_head *candidates)
{
struct mount *m, *p, *q;
list_for_each_entry(m, set, mnt_list) {
/* already flagged on an earlier iteration: nothing to do for it */
if (is_candidate(m))
continue;
m->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE;
p = m->mnt_parent;
q = propagation_next(p, p);
while (q) {
struct mount *child = __lookup_mnt(&q->mnt,
m->mnt_mountpoint);
if (child) {
/*
 * We might've already run into this one. That
 * must've happened on earlier iteration of the
 * outer loop; in that case we can skip those
 * parents that get propagation from q - there
 * will be nothing new on those as well.
 */
if (is_candidate(child)) {
/* q is advanced here, so bypass the step at the bottom of the loop */
q = skip_propagation_subtree(q, p);
continue;
}
child->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE;
/* mounts already committed to unmounting are flagged but not listed */
if (!will_be_unmounted(child))
list_add(&child->mnt_list, candidates);
}
q = propagation_next(q, p);
}
}
/* the flag on members of @set was only needed during the scan */
list_for_each_entry(m, set, mnt_list)
m->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE;
}
/*
 * We know that some child of @m can't be unmounted. In all places where the
 * chain of descent of @m has child not overmounting the root of parent,
 * the parent can't be unmounted either.
 *
 * Walks from @m towards the root for as long as the parent is still a
 * candidate.  Each mount passed on the way is marked (MNT_MARKED); hitting
 * an already marked mount ends the walk early.  Parents only have their
 * MNT_UMOUNT_CANDIDATE flag cleared here - they are not taken off any list.
 */
static void trim_ancestors(struct mount *m)
{
struct mount *p;
/* m is the current mount, p its parent; both move up one level per step */
for (p = m->mnt_parent; is_candidate(p); m = p, p = p->mnt_parent) {
if (IS_MNT_MARKED(m)) // all candidates beneath are overmounts
return;
SET_MNT_MARK(m);
/* p stays a candidate only if m is p's ->overmount */
if (m != p->overmount)
p->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE;
}
}
/*
 * Find and exclude all umount candidates forbidden by @m
 * (see Documentation/filesystems/propagate_umount.txt)
 * If we can immediately tell that @m is OK to unmount (unlocked
 * and all children are already committed to unmounting) commit
 * to unmounting it.
 * Only @m itself might be taken from the candidates list;
 * anything found by trim_ancestors() is marked non-candidate
 * and left on the list.
 */
static void trim_one(struct mount *m, struct list_head *to_umount)
{
	bool has_noncandidate = false;	/* some child is not a candidate */
	bool blocked = false;		/* ... and it is not m's overmount */
	struct mount *child;

	if (!is_candidate(m)) { // trim_ancestors() left it on list
		remove_from_candidate_list(m);
		return;
	}

	list_for_each_entry(child, &m->mnt_mounts, mnt_child) {
		if (is_candidate(child))
			continue;
		has_noncandidate = true;
		if (child != m->overmount) {
			blocked = true;
			break;
		}
	}

	if (has_noncandidate) {
		/* trim the ancestors first, then drop @m itself if it is blocked */
		trim_ancestors(m);
		if (blocked)
			remove_from_candidate_list(m);
		return;
	}

	/* undecided for now: locked, or still has children attached */
	if (IS_MNT_LOCKED(m) || !list_empty(&m->mnt_mounts))
		return;

	/* unlocked and childless: commit to unmounting right away */
	remove_from_candidate_list(m);
	umount_one(m, to_umount);
}
/*
 * Settle the fate of candidate @m and of the chain of candidates above it.
 *
 * @m and all of its consecutive candidate ancestors are taken off the
 * candidates list.  Then the lower part of that chain - from @m up to, but
 * not including, the cutoff - is committed to unmounting.  The cutoff is
 * the parent of the topmost unlocked mount in the chain, or the first
 * non-candidate ancestor if that one is itself going to be unmounted; if
 * neither applies the cutoff is @m and nothing gets unmounted.
 */
static void handle_locked(struct mount *m, struct list_head *to_umount)
{
struct mount *cutoff = m, *p;
if (!is_candidate(m)) { // trim_ancestors() left it on list
remove_from_candidate_list(m);
return;
}
/* climb the chain of candidates, remembering where the last unlocked one was */
for (p = m; is_candidate(p); p = p->mnt_parent) {
remove_from_candidate_list(p);
if (!IS_MNT_LOCKED(p))
cutoff = p->mnt_parent;
}
/* p is now the first non-candidate ancestor */
if (will_be_unmounted(p))
cutoff = p;
/* unmount everything from @m up to (excluding) the cutoff */
while (m != cutoff) {
umount_one(m, to_umount);
m = m->mnt_parent;
}
}
/*
 * @m is not going away, and it overmounts the top of a stack of mounts
 * that are going away. We know that all of those are fully overmounted
 * by the one above (@m being the topmost of the chain), so @m can be slid
 * in place where the bottom of the stack is attached.
 *
 * The stack is followed downwards from @m until a parent that is not
 * being unmounted is reached; @m is then moved onto that parent at the
 * mountpoint used by the bottom of the stack.
 *
 * NOTE: here we temporarily violate a constraint - two mounts end up with
 * the same parent and mountpoint; that will be remedied as soon as we
 * return from propagate_umount() - its caller (umount_tree()) will detach
 * the stack from the parent it (and now @m) is attached to. umount_tree()
 * might choose to keep unmounted pieces stuck to each other, but it always
 * detaches them from the mounts that remain in the tree.
 */
static void reparent(struct mount *m)
{
	struct mount *lower = m;
	struct mount *survivor;
	struct mountpoint *where;

	for (;;) {
		where = lower->mnt_mp;
		survivor = lower->mnt_parent;
		if (!will_be_unmounted(survivor))
			break;
		lower = survivor;
	}

	mnt_change_mountpoint(survivor, where, m);
	mnt_notify_add(m);
}
/**
 * propagate_umount - apply propagation rules to the set of mounts for umount()
 * @set: the list of mounts to be unmounted.
 *
 * Collect all mounts that receive propagation from the mount in @set and have
 * no obstacles to being unmounted.  Add these additional mounts to the set.
 *
 * See Documentation/filesystems/propagate_umount.txt if you do anything in
 * this area.
 *
 * Locks held:
 * mount_lock (write_seqlock), namespace_sem (exclusive).
 */
void propagate_umount(struct list_head *set)
{
	LIST_HEAD(to_umount);	// committed to unmounting
	LIST_HEAD(candidates);	// undecided umount candidates
	struct mount *mnt, *next;

	// collect all candidates
	gather_candidates(set, &candidates);

	// reduce the set until it's non-shifting
	// (trim_one() removes at most the current entry, so _safe is enough)
	list_for_each_entry_safe(mnt, next, &candidates, mnt_list)
		trim_one(mnt, &to_umount);

	// ... and non-revealing
	while (!list_empty(&candidates))
		handle_locked(list_first_entry(&candidates,
					       struct mount, mnt_list),
			      &to_umount);

	// now to_umount consists of all acceptable candidates
	// deal with reparenting of remaining overmounts on those
	list_for_each_entry(mnt, &to_umount, mnt_list) {
		struct mount *over = mnt->overmount;

		if (over)
			reparent(over);
	}

	// and fold them into the set
	list_splice_tail_init(&to_umount, set);
}