Files
linux/drivers/gpu/drm/xe/xe_tile.c
Francois Dugast 91b2c42c21 drm/xe: Use fault injection infrastructure to find issues at probe time
The kernel fault injection infrastructure is used to test proper error
handling during probe. The return code of the functions using
ALLOW_ERROR_INJECTION() can be conditionally modified at runtime by
tuning some debugfs entries. This requires CONFIG_FUNCTION_ERROR_INJECTION
(among others).

One way to use fault injection at probe time by making each of those
functions fail one at a time is:

    FAILTYPE=fail_function
    DEVICE="0000:00:08.0" # depends on the system
    ERRNO=-12 # -ENOMEM, can depend on the function

    echo N > /sys/kernel/debug/$FAILTYPE/task-filter
    echo 100 > /sys/kernel/debug/$FAILTYPE/probability
    echo 0 > /sys/kernel/debug/$FAILTYPE/interval
    echo -1 > /sys/kernel/debug/$FAILTYPE/times
    echo 0 > /sys/kernel/debug/$FAILTYPE/space
    echo 1 > /sys/kernel/debug/$FAILTYPE/verbose

    modprobe xe
    echo $DEVICE > /sys/bus/pci/drivers/xe/unbind

    grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | \
    cut -d ' ' -f 1 | while read -r FUNCTION ; do
        echo "Injecting fault in $FUNCTION"
        echo "" > /sys/kernel/debug/$FAILTYPE/inject
        echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
        printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
        echo $DEVICE > /sys/bus/pci/drivers/xe/bind
    done

    rmmod xe

It will also be integrated into IGT for systematic execution by CI.

v2: Wrappers are not needed in the cases covered by this patch, so
    remove them and use ALLOW_ERROR_INJECTION() directly.

v3: Document the use of fault injection at probe time in xe_pci_probe
    and refer to it where ALLOW_ERROR_INJECTION() is used.

Signed-off-by: Francois Dugast <francois.dugast@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240927151207.399354-1-francois.dugast@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
2024-10-03 08:58:26 -04:00

188 lines
5.7 KiB
C

// SPDX-License-Identifier: MIT
/*
* Copyright © 2023 Intel Corporation
*/
#include <linux/fault-inject.h>
#include <drm/drm_managed.h>
#include "xe_device.h"
#include "xe_ggtt.h"
#include "xe_gt.h"
#include "xe_migrate.h"
#include "xe_pcode.h"
#include "xe_sa.h"
#include "xe_tile.h"
#include "xe_tile_sysfs.h"
#include "xe_ttm_vram_mgr.h"
#include "xe_wa.h"
/**
* DOC: Multi-tile Design
*
* Different vendors use the term "tile" a bit differently, but in the Intel
* world, a 'tile' is pretty close to what most people would think of as being
* a complete GPU. When multiple GPUs are placed behind a single PCI device,
* that's what is referred to as a "multi-tile device." In such cases, pretty
* much all hardware is replicated per-tile, although certain responsibilities
* like PCI communication, reporting of interrupts to the OS, etc. are handled
* solely by the "root tile." A multi-tile platform takes care of tying the
* tiles together in a way such that interrupt notifications from remote tiles
* are forwarded to the root tile, the per-tile vram is combined into a single
* address space, etc.
*
* In contrast, a "GT" (which officially stands for "Graphics Technology") is
* the subset of a GPU/tile that is responsible for implementing graphics
* and/or media operations. The GT is where a lot of the driver implementation
* happens since it's where the hardware engines, the execution units, and the
* GuC all reside.
*
* Historically most Intel devices were single-tile devices that contained a
* single GT. PVC is an example of an Intel platform built on a multi-tile
* design (i.e., multiple GPUs behind a single PCI device); each PVC tile only
* has a single GT. In contrast, platforms like MTL that have separate chips
* for render and media IP are still only a single logical GPU, but the
* graphics and media IP blocks are each exposed as a separate GT within that
* single GPU. This is important from a software perspective because multi-GT
* platforms like MTL only replicate a subset of the GPU hardware and behave
* differently than multi-tile platforms like PVC where nearly everything is
* replicated.
*
* Per-tile functionality (shared by all GTs within the tile):
* - Complete 4MB MMIO space (containing SGunit/SoC registers, GT
* registers, display registers, etc.)
* - Global GTT
* - VRAM (if discrete)
* - Interrupt flows
* - Migration context
* - kernel batchbuffer pool
* - Primary GT
* - Media GT (if media version >= 13)
*
* Per-GT functionality:
* - GuC
* - Hardware engines
* - Programmable hardware units (subslices, EUs)
* - GSI subset of registers (multiple copies of these registers reside
* within the complete MMIO space provided by the tile, but at different
* offsets --- 0 for render, 0x380000 for media)
* - Multicast register steering
* - TLBs to cache page table translations
* - Reset capability
* - Low-level power management (e.g., C6)
* - Clock frequency
* - MOCS and PAT programming
*/
/**
 * xe_tile_alloc - Allocate the per-tile software structures
 * @tile: Tile that will own the allocated objects
 *
 * Obtains zeroed storage for the tile's GGTT and VRAM manager objects through
 * drmm_kzalloc() on the parent DRM device, and points the new GGTT back at
 * @tile. Only memory is allocated here; no other helper is called.
 *
 * Return: 0 when both allocations succeed, -ENOMEM otherwise.
 */
static int xe_tile_alloc(struct xe_tile *tile)
{
	struct drm_device *dev = &tile_to_xe(tile)->drm;

	tile->mem.ggtt = drmm_kzalloc(dev, sizeof(*tile->mem.ggtt), GFP_KERNEL);
	if (!tile->mem.ggtt)
		return -ENOMEM;

	/* Give the GGTT a back-pointer to the tile it belongs to. */
	tile->mem.ggtt->tile = tile;

	tile->mem.vram_mgr = drmm_kzalloc(dev, sizeof(*tile->mem.vram_mgr),
					  GFP_KERNEL);

	return tile->mem.vram_mgr ? 0 : -ENOMEM;
}
/**
 * xe_tile_init_early - First-stage software setup of a tile
 * @tile: Tile being set up
 * @xe: Xe device the tile belongs to
 * @id: Identifier assigned to the tile
 *
 * Records the tile's parent device and ID, allocates the per-tile structures
 * via xe_tile_alloc(), allocates the primary GT and finally runs
 * xe_pcode_init(). This stage covers resources that need neither hardware
 * access nor knowledge of the Graphics/Media IP version.
 *
 * Return: 0 on success, negative error code on failure.
 */
int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
{
	int ret;

	/*
	 * Fill in the identity fields before calling any helper:
	 * xe_tile_alloc() looks the device up through tile_to_xe(tile).
	 */
	tile->xe = xe;
	tile->id = id;

	ret = xe_tile_alloc(tile);
	if (ret)
		return ret;

	/* xe_gt_alloc() reports failure as an ERR_PTR-encoded pointer. */
	tile->primary_gt = xe_gt_alloc(tile);
	if (IS_ERR(tile->primary_gt))
		return PTR_ERR(tile->primary_gt);

	xe_pcode_init(tile);

	return 0;
}
ALLOW_ERROR_INJECTION(xe_tile_init_early, ERRNO); /* See xe_pci_probe() */
/*
 * Set up the tile's TTM VRAM manager, if the tile has usable VRAM.
 *
 * When tile->mem.vram.usable_size is zero this is a no-op. Otherwise the
 * VRAM manager is initialized and bit (tile->id + 1) is set in the device's
 * mem_region_mask.
 *
 * Returns 0 on success or when there is nothing to do, otherwise the error
 * code from xe_ttm_vram_mgr_init().
 */
static int tile_ttm_mgr_init(struct xe_tile *tile)
{
	struct xe_device *xe = tile_to_xe(tile);
	int ret;

	/* Nothing to manage when the tile reports no usable VRAM. */
	if (!tile->mem.vram.usable_size)
		return 0;

	ret = xe_ttm_vram_mgr_init(tile, tile->mem.vram_mgr);
	if (ret)
		return ret;

	/* The tile's bit in the region mask sits one above its ID. */
	xe->info.mem_region_mask |= BIT(tile->id) << 1;

	return 0;
}
/**
 * xe_tile_init_noalloc - Init tile up to the point where allocations can happen.
 * @tile: The tile to initialize.
 *
 * This function prepares the tile to allow memory allocations to VRAM, but is
 * not allowed to allocate memory itself. This state is useful for display
 * readout, because the inherited display framebuffer will otherwise be
 * overwritten as it is usually put at the start of VRAM.
 *
 * Note that since this is tile initialization, it should not perform any
 * GT-specific operations, and thus does not need to hold GT forcewake.
 *
 * The steps run in this order: tile_ttm_mgr_init(), creation of the kernel
 * batch-buffer pool with xe_sa_bo_manager_init(), application of the tile
 * workarounds, and xe_tile_sysfs_init(). The first step that reports an
 * error stops the sequence and its error code is returned.
 *
 * Returns: 0 on success, negative error code on error.
 */
int xe_tile_init_noalloc(struct xe_tile *tile)
{
	int err;

	err = tile_ttm_mgr_init(tile);
	if (err)
		return err;

	/* xe_sa_bo_manager_init() reports failure as an ERR_PTR-encoded pointer. */
	tile->mem.kernel_bb_pool = xe_sa_bo_manager_init(tile, SZ_1M, 16);
	if (IS_ERR(tile->mem.kernel_bb_pool))
		return PTR_ERR(tile->mem.kernel_bb_pool);

	xe_wa_apply_tile_workarounds(tile);

	/*
	 * Hand the sysfs setup result back to the caller; it used to be
	 * stored and then dropped by an unconditional "return 0".
	 */
	return xe_tile_sysfs_init(tile);
}
/**
 * xe_tile_migrate_wait - Call xe_migrate_wait() for a tile
 * @tile: Tile whose migrate object is passed along
 *
 * Thin wrapper that forwards @tile->migrate to xe_migrate_wait().
 *
 * NOTE(review): presumably this blocks until outstanding work on the tile's
 * migration context has completed - confirm against xe_migrate_wait().
 */
void xe_tile_migrate_wait(struct xe_tile *tile)
{
xe_migrate_wait(tile->migrate);
}