Mirror of https://github.com/raspberrypi/linux.git
Pull more kvm updates from Paolo Bonzini:
Generic:
- Clean up locking of all vCPUs for a VM by using the *_nest_lock()
family of functions, and move duplicated code to virt/kvm/. The kernel/
patches were acked by Peter Zijlstra (a minimal sketch of the nest_lock
pattern follows the shortlog below)
- Add MGLRU support to the access tracking perf test
ARM fixes:
- Make the irqbypass hooks resilient to changes in the GSI<->MSI
routing, avoiding stale vLPI mappings being left behind. The fix is to
resolve the VGIC IRQ using the host IRQ (which is stable) and to nuke
the vLPI mapping upon a routing change
- Close another VGIC race where vCPU creation races with VGIC
creation, leading to in-flight vCPUs entering the kernel w/o
private IRQs allocated
- Fix a build issue triggered by the recently added workaround for
Ampere's AC04_CPU_23 erratum
- Correctly sign-extend the VA when emulating a TLBI instruction
potentially targeting a VNCR mapping
- Avoid dereferencing a NULL pointer in the VGIC debug code, which
can happen if the device doesn't have any mapping yet
s390:
- Fix interaction between some filesystems and Secure Execution
- Some cleanups and refactorings, preparing for an upcoming big
series
x86:
- Wait for target vCPU to ack KVM_REQ_UPDATE_PROTECTED_GUEST_STATE
to fix a race between AP destroy and VMRUN
- Decrypt and dump the VMSA in dump_vmcb() if debugging enabled for
the VM
- Refine and harden handling of spurious faults
- Add support for ALLOWED_SEV_FEATURES
- Add #VMGEXIT to the set of handlers special cased for
CONFIG_RETPOLINE=y
- Treat DEBUGCTL[5:2] as reserved to pave the way for virtualizing
features that utilize those bits
- Don't account temporary allocations in sev_send_update_data()
- Add support for KVM_CAP_X86_BUS_LOCK_EXIT on SVM, via Bus Lock
Threshold
- Unify virtualization of IBRS on nested VM-Exit, and cross-vCPU
IBPB, between SVM and VMX
- Advertise support to userspace for WRMSRNS and PREFETCHI
- Rescan I/O APIC routes after handling EOI that needed to be
intercepted due to the old/previous routing, but not the
new/current routing
- Add a module param to control and enumerate support for device
posted interrupts
- Fix a potential overflow with nested virt on Intel systems running
32-bit kernels
- Flush shadow VMCSes on emergency reboot
- Add support for SNP to the various SEV selftests
- Add a selftest to verify fastops instructions via forced emulation
- Refine and optimize KVM's software processing of the posted
interrupt bitmap, and share the harvesting code between KVM and the
kernel's Posted MSI handler"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (93 commits)
rtmutex_api: provide correct extern functions
KVM: arm64: vgic-debug: Avoid dereferencing NULL ITE pointer
KVM: arm64: vgic-init: Plug vCPU vs. VGIC creation race
KVM: arm64: Unmap vLPIs affected by changes to GSI routing information
KVM: arm64: Resolve vLPI by host IRQ in vgic_v4_unset_forwarding()
KVM: arm64: Protect vLPI translation with vgic_irq::irq_lock
KVM: arm64: Use lock guard in vgic_v4_set_forwarding()
KVM: arm64: Mask out non-VA bits from TLBI VA* on VNCR invalidation
arm64: sysreg: Drag linux/kconfig.h to work around vdso build issue
KVM: s390: Simplify and move pv code
KVM: s390: Refactor and split some gmap helpers
KVM: s390: Remove unneeded srcu lock
s390: Remove unneeded includes
s390/uv: Improve splitting of large folios that cannot be split while dirty
s390/uv: Always return 0 from s390_wiggle_split_folio() if successful
s390/uv: Don't return 0 from make_hva_secure() if the operation was not successful
rust: add helper for mutex_trylock
RISC-V: KVM: use kvm_trylock_all_vcpus when locking all vCPUs
KVM: arm64: use kvm_trylock_all_vcpus when locking all vCPUs
x86: KVM: SVM: use kvm_lock_all_vcpus instead of a custom implementation
...
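The locking cleanup in the first bullet replaces per-architecture, open-coded "lock every vCPU" loops with common helpers built on the *_nest_lock() primitives. The snippet below is only a minimal sketch of that pattern, not the actual virt/kvm/ helpers added by the series; the function names are hypothetical, and it assumes the caller already holds kvm->lock, which is what lets lockdep accept taking every vcpu->mutex of the same lock class.

#include <linux/kvm_host.h>

/* Sketch: take every vcpu->mutex under kvm->lock using the nest_lock annotation. */
static void sketch_lock_all_vcpus(struct kvm *kvm)
{
        struct kvm_vcpu *vcpu;
        unsigned long i;

        lockdep_assert_held(&kvm->lock);
        kvm_for_each_vcpu(i, vcpu, kvm)
                mutex_lock_nest_lock(&vcpu->mutex, &kvm->lock);
}

static void sketch_unlock_all_vcpus(struct kvm *kvm)
{
        struct kvm_vcpu *vcpu;
        unsigned long i;

        kvm_for_each_vcpu(i, vcpu, kvm)
                mutex_unlock(&vcpu->mutex);
}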
arch/s390/mm/pgalloc.c (478 lines, 12 KiB, C)
// SPDX-License-Identifier: GPL-2.0
/*
 * Page table allocation functions
 *
 * Copyright IBM Corp. 2016
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
        struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
        unsigned long *table;

        if (!ptdesc)
                return NULL;
        table = ptdesc_to_virt(ptdesc);
        __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
        return table;
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
        if (!table)
                return;
        pagetable_free(virt_to_ptdesc(table));
}

static void __crst_table_upgrade(void *arg)
{
        struct mm_struct *mm = arg;
        struct ctlreg asce;

        /* change all active ASCEs to avoid the creation of new TLBs */
        if (current->active_mm == mm) {
                asce.val = mm->context.asce;
                get_lowcore()->user_asce = asce;
                local_ctl_load(7, &asce);
                if (!test_thread_flag(TIF_ASCE_PRIMARY))
                        local_ctl_load(1, &asce);
        }
        __tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
        unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
        unsigned long asce_limit = mm->context.asce_limit;

        mmap_assert_write_locked(mm);

        /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
        VM_BUG_ON(asce_limit < _REGION2_SIZE);

        if (end <= asce_limit)
                return 0;

        if (asce_limit == _REGION2_SIZE) {
                p4d = crst_table_alloc(mm);
                if (unlikely(!p4d))
                        goto err_p4d;
                crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
                pagetable_p4d_ctor(virt_to_ptdesc(p4d));
        }
        if (end > _REGION1_SIZE) {
                pgd = crst_table_alloc(mm);
                if (unlikely(!pgd))
                        goto err_pgd;
                crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
                pagetable_pgd_ctor(virt_to_ptdesc(pgd));
        }

        spin_lock_bh(&mm->page_table_lock);

        if (p4d) {
                __pgd = (unsigned long *) mm->pgd;
                p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
                mm->pgd = (pgd_t *) p4d;
                mm->context.asce_limit = _REGION1_SIZE;
                mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
                                   _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
                mm_inc_nr_puds(mm);
        }
        if (pgd) {
                __pgd = (unsigned long *) mm->pgd;
                pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
                mm->pgd = (pgd_t *) pgd;
                mm->context.asce_limit = TASK_SIZE_MAX;
                mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
                                   _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
        }

        spin_unlock_bh(&mm->page_table_lock);

        on_each_cpu(__crst_table_upgrade, mm, 0);

        return 0;

err_pgd:
        pagetable_dtor(virt_to_ptdesc(p4d));
        crst_table_free(mm, p4d);
err_p4d:
        return -ENOMEM;
}
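
/*
 * Hedged usage sketch (added for illustration, not part of this file):
 * crst_table_upgrade() expects the mmap lock to be held for writing and
 * is intended to run before a mapping is created whose end lies above
 * the current asce_limit. The helper name and shape below are
 * hypothetical.
 */
static int sketch_extend_for_mapping(struct mm_struct *mm, unsigned long addr,
                                     unsigned long len)
{
        int rc = 0;

        mmap_assert_write_locked(mm);
        if (addr + len > mm->context.asce_limit && addr + len <= TASK_SIZE_MAX)
                rc = crst_table_upgrade(mm, addr + len);
        return rc;
}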

#ifdef CONFIG_PGSTE

struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm)
{
        struct ptdesc *ptdesc;
        u64 *table;

        ptdesc = pagetable_alloc(GFP_KERNEL, 0);
        if (ptdesc) {
                table = (u64 *)ptdesc_to_virt(ptdesc);
                __arch_set_page_dat(table, 1);
                memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
                memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
        }
        return ptdesc;
}

void page_table_free_pgste(struct ptdesc *ptdesc)
{
        pagetable_free(ptdesc);
}

#endif /* CONFIG_PGSTE */

unsigned long *page_table_alloc(struct mm_struct *mm)
{
        struct ptdesc *ptdesc;
        unsigned long *table;

        ptdesc = pagetable_alloc(GFP_KERNEL, 0);
        if (!ptdesc)
                return NULL;
        if (!pagetable_pte_ctor(mm, ptdesc)) {
                pagetable_free(ptdesc);
                return NULL;
        }
        table = ptdesc_to_virt(ptdesc);
        __arch_set_page_dat(table, 1);
        memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
        memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
        return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(table);

        pagetable_dtor_free(ptdesc);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void pte_free_now(struct rcu_head *head)
{
        struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);

        pagetable_dtor_free(ptdesc);
}

void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);

        call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
        /*
         * THPs are not allowed for KVM guests. Warn if pgste ever reaches here.
         * Turn to the generic pte_free_defer() version once gmap is removed.
         */
        WARN_ON_ONCE(mm_has_pgste(mm));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Base infrastructure required to generate basic asces, region, segment,
 * and page tables that do not make use of enhanced features like EDAT1.
 */

static struct kmem_cache *base_pgt_cache;

static unsigned long *base_pgt_alloc(void)
{
        unsigned long *table;

        table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
        if (table)
                memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
        return table;
}

static void base_pgt_free(unsigned long *table)
{
        kmem_cache_free(base_pgt_cache, table);
}

static unsigned long *base_crst_alloc(unsigned long val)
{
        unsigned long *table;
        struct ptdesc *ptdesc;

        ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
        if (!ptdesc)
                return NULL;
        table = ptdesc_address(ptdesc);
        crst_table_init(table, val);
        return table;
}

static void base_crst_free(unsigned long *table)
{
        if (!table)
                return;
        pagetable_free(virt_to_ptdesc(table));
}

#define BASE_ADDR_END_FUNC(NAME, SIZE)                                  \
static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \
                                                   unsigned long end)  \
{                                                                       \
        unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1);           \
                                                                        \
        return (next - 1) < (end - 1) ? next : end;                     \
}

BASE_ADDR_END_FUNC(page, PAGE_SIZE)
BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)
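
/*
 * Worked example (added note, not in the original source): with 4 KiB pages,
 * base_page_addr_end(0x12345678, 0x20000000) rounds addr up to the next
 * PAGE_SIZE boundary, 0x12346000, which still lies before end, so that
 * boundary is returned. The "- 1" on both sides keeps the comparison correct
 * even if the rounded-up boundary wraps to 0 at the very top of the address
 * space.
 */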

static inline unsigned long base_lra(unsigned long address)
{
        unsigned long real;

        asm volatile(
                "       lra     %0,0(%1)\n"
                : "=d" (real) : "a" (address) : "cc");
        return real;
}

static int base_page_walk(unsigned long *origin, unsigned long addr,
                          unsigned long end, int alloc)
{
        unsigned long *pte, next;

        if (!alloc)
                return 0;
        pte = origin;
        pte += (addr & _PAGE_INDEX) >> PAGE_SHIFT;
        do {
                next = base_page_addr_end(addr, end);
                *pte = base_lra(addr);
        } while (pte++, addr = next, addr < end);
        return 0;
}

static int base_segment_walk(unsigned long *origin, unsigned long addr,
                             unsigned long end, int alloc)
{
        unsigned long *ste, next, *table;
        int rc;

        ste = origin;
        ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
        do {
                next = base_segment_addr_end(addr, end);
                if (*ste & _SEGMENT_ENTRY_INVALID) {
                        if (!alloc)
                                continue;
                        table = base_pgt_alloc();
                        if (!table)
                                return -ENOMEM;
                        *ste = __pa(table) | _SEGMENT_ENTRY;
                }
                table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
                rc = base_page_walk(table, addr, next, alloc);
                if (rc)
                        return rc;
                if (!alloc)
                        base_pgt_free(table);
                cond_resched();
        } while (ste++, addr = next, addr < end);
        return 0;
}

static int base_region3_walk(unsigned long *origin, unsigned long addr,
                             unsigned long end, int alloc)
{
        unsigned long *rtte, next, *table;
        int rc;

        rtte = origin;
        rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
        do {
                next = base_region3_addr_end(addr, end);
                if (*rtte & _REGION_ENTRY_INVALID) {
                        if (!alloc)
                                continue;
                        table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
                        if (!table)
                                return -ENOMEM;
                        *rtte = __pa(table) | _REGION3_ENTRY;
                }
                table = __va(*rtte & _REGION_ENTRY_ORIGIN);
                rc = base_segment_walk(table, addr, next, alloc);
                if (rc)
                        return rc;
                if (!alloc)
                        base_crst_free(table);
        } while (rtte++, addr = next, addr < end);
        return 0;
}

static int base_region2_walk(unsigned long *origin, unsigned long addr,
                             unsigned long end, int alloc)
{
        unsigned long *rste, next, *table;
        int rc;

        rste = origin;
        rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
        do {
                next = base_region2_addr_end(addr, end);
                if (*rste & _REGION_ENTRY_INVALID) {
                        if (!alloc)
                                continue;
                        table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
                        if (!table)
                                return -ENOMEM;
                        *rste = __pa(table) | _REGION2_ENTRY;
                }
                table = __va(*rste & _REGION_ENTRY_ORIGIN);
                rc = base_region3_walk(table, addr, next, alloc);
                if (rc)
                        return rc;
                if (!alloc)
                        base_crst_free(table);
        } while (rste++, addr = next, addr < end);
        return 0;
}

static int base_region1_walk(unsigned long *origin, unsigned long addr,
                             unsigned long end, int alloc)
{
        unsigned long *rfte, next, *table;
        int rc;

        rfte = origin;
        rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
        do {
                next = base_region1_addr_end(addr, end);
                if (*rfte & _REGION_ENTRY_INVALID) {
                        if (!alloc)
                                continue;
                        table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
                        if (!table)
                                return -ENOMEM;
                        *rfte = __pa(table) | _REGION1_ENTRY;
                }
                table = __va(*rfte & _REGION_ENTRY_ORIGIN);
                rc = base_region2_walk(table, addr, next, alloc);
                if (rc)
                        return rc;
                if (!alloc)
                        base_crst_free(table);
        } while (rfte++, addr = next, addr < end);
        return 0;
}

/**
 * base_asce_free - free asce and tables returned from base_asce_alloc()
 * @asce: asce to be freed
 *
 * Frees all region, segment, and page tables that were allocated with a
 * corresponding base_asce_alloc() call.
 */
void base_asce_free(unsigned long asce)
{
        unsigned long *table = __va(asce & _ASCE_ORIGIN);

        if (!asce)
                return;
        switch (asce & _ASCE_TYPE_MASK) {
        case _ASCE_TYPE_SEGMENT:
                base_segment_walk(table, 0, _REGION3_SIZE, 0);
                break;
        case _ASCE_TYPE_REGION3:
                base_region3_walk(table, 0, _REGION2_SIZE, 0);
                break;
        case _ASCE_TYPE_REGION2:
                base_region2_walk(table, 0, _REGION1_SIZE, 0);
                break;
        case _ASCE_TYPE_REGION1:
                base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
                break;
        }
        base_crst_free(table);
}

static int base_pgt_cache_init(void)
{
        static DEFINE_MUTEX(base_pgt_cache_mutex);
        unsigned long sz = _PAGE_TABLE_SIZE;

        if (base_pgt_cache)
                return 0;
        mutex_lock(&base_pgt_cache_mutex);
        if (!base_pgt_cache)
                base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
        mutex_unlock(&base_pgt_cache_mutex);
        return base_pgt_cache ? 0 : -ENOMEM;
}

/**
 * base_asce_alloc - create kernel mapping without enhanced DAT features
 * @addr: virtual start address of kernel mapping
 * @num_pages: number of consecutive pages
 *
 * Generate an asce, including all required region, segment and page tables,
 * that can be used to access the virtual kernel mapping. The difference is
 * that the returned asce does not make use of any enhanced DAT features like
 * e.g. large pages. This is required for some I/O functions that pass an
 * asce, like e.g. some service call requests.
 *
 * Note: the returned asce may NEVER be attached to any cpu. It may only be
 *       used for I/O requests. tlb entries that might result because the
 *       asce was attached to a cpu won't be cleared.
 */
unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
{
        unsigned long asce, *table, end;
        int rc;

        if (base_pgt_cache_init())
                return 0;
        end = addr + num_pages * PAGE_SIZE;
        if (end <= _REGION3_SIZE) {
                table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
                if (!table)
                        return 0;
                rc = base_segment_walk(table, addr, end, 1);
                asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
        } else if (end <= _REGION2_SIZE) {
                table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
                if (!table)
                        return 0;
                rc = base_region3_walk(table, addr, end, 1);
                asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
        } else if (end <= _REGION1_SIZE) {
                table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
                if (!table)
                        return 0;
                rc = base_region2_walk(table, addr, end, 1);
                asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
        } else {
                table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
                if (!table)
                        return 0;
                rc = base_region1_walk(table, addr, end, 1);
                asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
        }
        if (rc) {
                base_asce_free(asce);
                asce = 0;
        }
        return asce;
}
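
/*
 * Hedged usage sketch (added for illustration, not part of this file): an
 * I/O path that must hand the hardware an asce covering a kernel buffer
 * could pair base_asce_alloc() and base_asce_free() like this. The function
 * and parameter names are illustrative only.
 */
static int sketch_issue_request_with_asce(void *buf, unsigned long nr_pages)
{
        unsigned long asce;

        asce = base_asce_alloc((unsigned long)buf, nr_pages);
        if (!asce)
                return -ENOMEM;
        /* ... pass asce to the service call / I/O request here ... */
        base_asce_free(asce);
        return 0;
}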