From ebcc78b555e792f1133e909dc03e21f5024c8ab3 Mon Sep 17 00:00:00 2001
From: Tvrtko Ursulin
Date: Wed, 22 May 2024 14:12:42 +0100
Subject: [PATCH] mm/mempolicy: Add MPOL_RANDOM

To help work around certain memory controller limitations or similar
constraints, add a random NUMA allocation memory policy, which allocates
memory from a node chosen at random out of the allowed set of nodes.

Signed-off-by: Tvrtko Ursulin
---
 include/uapi/linux/mempolicy.h |  1 +
 mm/mempolicy.c                 | 71 ++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)

diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 8fbbe613611a..81a17d24a8e8 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -24,6 +24,7 @@ enum {
 	MPOL_LOCAL,
 	MPOL_PREFERRED_MANY,
 	MPOL_WEIGHTED_INTERLEAVE,
+	MPOL_RANDOM,
 	MPOL_MAX,	/* always last member of enum */
 };
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 78bead2326c5..9d42dc6807e5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -41,6 +41,9 @@
  * preferred many Try a set of nodes first before normal fallback. This is
  *                similar to preferred without the special case.
  *
+ * random         Allocate memory from a random node out of the allowed
+ *                set of nodes.
+ *
  * default        Allocate on the local node first, or when on a VMA
  *                use the process policy. This is what Linux always did
  *                in a NUMA aware kernel and still does by, ahem, default.
@@ -592,6 +595,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 		.create = mpol_new_nodemask,
 		.rebind = mpol_rebind_nodemask,
 	},
+	[MPOL_RANDOM] = {
+		.create = mpol_new_nodemask,
+		.rebind = mpol_rebind_nodemask,
+	},
 };
 
 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
@@ -1045,6 +1052,7 @@ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
 	case MPOL_WEIGHTED_INTERLEAVE:
+	case MPOL_RANDOM:
 		*nodes = pol->nodes;
 		break;
 	case MPOL_LOCAL:
@@ -2062,6 +2070,27 @@ static unsigned int interleave_nodes(struct mempolicy *policy)
 	return nid;
 }
 
+static unsigned int read_once_policy_nodemask(struct mempolicy *pol, nodemask_t *mask);
+
+static unsigned int random_nodes(struct mempolicy *policy)
+{
+	unsigned int cpuset_mems_cookie;
+	nodemask_t nodemask;
+	unsigned int nid;
+	unsigned int r;
+
+	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
+	do {
+		cpuset_mems_cookie = read_mems_allowed_begin();
+		nid = first_node(policy->nodes);
+		r = get_random_u32_below(read_once_policy_nodemask(policy, &nodemask));
+		while (r--)
+			nid = next_node_in(nid, policy->nodes);
+	} while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+	return nid;
+}
+
 /*
  * Depending on the memory policy provide a node from which to allocate the
  * next slab entry.
@@ -2107,6 +2136,9 @@ unsigned int mempolicy_slab_node(void)
 	case MPOL_LOCAL:
 		return node;
 
+	case MPOL_RANDOM:
+		return random_nodes(policy);
+
 	default:
 		BUG();
 	}
@@ -2188,6 +2220,33 @@ static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
 	return nid;
 }
 
+static unsigned int random_nid(struct mempolicy *pol,
+			       struct vm_area_struct *vma,
+			       pgoff_t ilx)
+{
+	nodemask_t nodemask;
+	unsigned int r, nnodes;
+	int i, nid;
+
+	nnodes = read_once_policy_nodemask(pol, &nodemask);
+	if (!nnodes)
+		return numa_node_id();
+
+	/*
+	 * QQQ
+	 * Is a hash of vma+ilx sufficiently random, while remaining stable
+	 * in case stability is relied upon, as appears to be the case with
+	 * mpol_misplaced and interleaving?
+	 */
+	r = hash_long((unsigned long)vma + ilx,
+		      ilog2(roundup_pow_of_two(nnodes))) % nnodes;
+
+	nid = first_node(nodemask);
+	for (i = 0; i < r; i++)
+		nid = next_node(nid, nodemask);
+	return nid;
+}
+
 /*
  * Return a nodemask representing a mempolicy for filtering nodes for
  * page allocation, together with preferred node id (or the input node id).
@@ -2231,6 +2290,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
 			weighted_interleave_nodes(pol) :
 			weighted_interleave_nid(pol, ilx);
 		break;
+	case MPOL_RANDOM:
+		*nid = random_nodes(pol);
+		break;
 	}
 
 	return nodemask;
@@ -2299,6 +2361,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 	case MPOL_WEIGHTED_INTERLEAVE:
+	case MPOL_RANDOM:
 		*mask = mempolicy->nodes;
 		break;
 
@@ -2797,6 +2860,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
 	case MPOL_WEIGHTED_INTERLEAVE:
+	case MPOL_RANDOM:
 		return !!nodes_equal(a->nodes, b->nodes);
 	case MPOL_LOCAL:
 		return true;
@@ -2988,6 +3052,10 @@ int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
 		polnid = zonelist_node_idx(z);
 		break;
 
+	case MPOL_RANDOM:
+		polnid = random_nid(pol, vma, ilx);
+		break;
+
 	default:
 		BUG();
 	}
@@ -3333,6 +3401,7 @@ static const char * const policy_modes[] =
 	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
 	[MPOL_LOCAL]     = "local",
 	[MPOL_PREFERRED_MANY]  = "prefer (many)",
+	[MPOL_RANDOM] = "random",
 };
 
 /**
@@ -3395,6 +3464,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
 		break;
 	case MPOL_INTERLEAVE:
 	case MPOL_WEIGHTED_INTERLEAVE:
+	case MPOL_RANDOM:
 		/*
 		 * Default to online nodes with memory if no nodelist
 		 */
@@ -3539,6 +3609,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 	case MPOL_WEIGHTED_INTERLEAVE:
+	case MPOL_RANDOM:
 		nodes = pol->nodes;
 		break;
 	default:
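For reference, below is a minimal userspace sketch (not part of the patch) of
how the new policy could be exercised through set_mempolicy(2). It assumes the
patch is applied and the test machine has at least nodes 0 and 1; MPOL_RANDOM
is defined locally as 7, the value implied by its position in the uapi enum
above, in case installed headers predate this change. The file name is
illustrative; build with something like "gcc mpol_random_test.c -lnuma".

/* Hypothetical test program, not part of the patch. */
#include <errno.h>
#include <numaif.h>	/* set_mempolicy() and MPOL_* definitions (libnuma) */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef MPOL_RANDOM
#define MPOL_RANDOM 7	/* assumed value, from its position in the uapi enum */
#endif

int main(void)
{
	/* Allow allocations from nodes 0 and 1 (illustrative node numbers). */
	unsigned long nodemask = (1UL << 0) | (1UL << 1);
	size_t len = 64UL << 20;	/* 64 MiB */
	char *buf;

	if (set_mempolicy(MPOL_RANDOM, &nodemask, 8 * sizeof(nodemask) + 1)) {
		fprintf(stderr, "set_mempolicy: %s\n", strerror(errno));
		return EXIT_FAILURE;
	}

	buf = malloc(len);
	if (!buf)
		return EXIT_FAILURE;

	/* Touch the buffer so pages are faulted in under the new policy. */
	memset(buf, 1, len);

	puts("allocated and touched 64 MiB under MPOL_RANDOM");
	free(buf);
	return EXIT_SUCCESS;
}

The resulting per-node placement of the mapping can then be inspected via
/proc/self/numa_maps.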