mirror of
https://github.com/raspberrypi/linux.git
synced 2025-12-08 02:49:48 +00:00
drm/xe: Introduce the wedged_mode debugfs
So, the wedged mode can be selected per device at runtime,
before the tests or before reproducing the issue.
v2: - s/busted/wedged
- some locking consistency
v3: - remove mutex
- toggle guc reset policy on any mode change
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Alan Previn <alan.previn.teres.alexis@intel.com>
Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240423221817.1285081-4-rodrigo.vivi@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
This commit is contained in:
@@ -12,6 +12,8 @@
|
|||||||
#include "xe_bo.h"
|
#include "xe_bo.h"
|
||||||
#include "xe_device.h"
|
#include "xe_device.h"
|
||||||
#include "xe_gt_debugfs.h"
|
#include "xe_gt_debugfs.h"
|
||||||
|
#include "xe_gt_printk.h"
|
||||||
|
#include "xe_guc_ads.h"
|
||||||
#include "xe_pm.h"
|
#include "xe_pm.h"
|
||||||
#include "xe_sriov.h"
|
#include "xe_sriov.h"
|
||||||
#include "xe_step.h"
|
#include "xe_step.h"
|
||||||
@@ -117,6 +119,56 @@ static const struct file_operations forcewake_all_fops = {
|
|||||||
.release = forcewake_release,
|
.release = forcewake_release,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static ssize_t wedged_mode_show(struct file *f, char __user *ubuf,
|
||||||
|
size_t size, loff_t *pos)
|
||||||
|
{
|
||||||
|
struct xe_device *xe = file_inode(f)->i_private;
|
||||||
|
char buf[32];
|
||||||
|
int len = 0;
|
||||||
|
|
||||||
|
len = scnprintf(buf, sizeof(buf), "%d\n", xe->wedged.mode);
|
||||||
|
|
||||||
|
return simple_read_from_buffer(ubuf, size, pos, buf, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
static ssize_t wedged_mode_set(struct file *f, const char __user *ubuf,
|
||||||
|
size_t size, loff_t *pos)
|
||||||
|
{
|
||||||
|
struct xe_device *xe = file_inode(f)->i_private;
|
||||||
|
struct xe_gt *gt;
|
||||||
|
u32 wedged_mode;
|
||||||
|
ssize_t ret;
|
||||||
|
u8 id;
|
||||||
|
|
||||||
|
ret = kstrtouint_from_user(ubuf, size, 0, &wedged_mode);
|
||||||
|
if (ret)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
if (wedged_mode > 2)
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
if (xe->wedged.mode == wedged_mode)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
xe->wedged.mode = wedged_mode;
|
||||||
|
|
||||||
|
for_each_gt(gt, xe, id) {
|
||||||
|
ret = xe_guc_ads_scheduler_policy_toggle_reset(&gt->uc.guc.ads);
|
||||||
|
if (ret) {
|
||||||
|
xe_gt_err(gt, "Failed to update GuC ADS scheduler policy. GuC may still cause engine reset even with wedged_mode=2\n");
|
||||||
|
return -EIO;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const struct file_operations wedged_mode_fops = {
|
||||||
|
.owner = THIS_MODULE,
|
||||||
|
.read = wedged_mode_show,
|
||||||
|
.write = wedged_mode_set,
|
||||||
|
};
|
||||||
|
|
||||||
void xe_debugfs_register(struct xe_device *xe)
|
void xe_debugfs_register(struct xe_device *xe)
|
||||||
{
|
{
|
||||||
struct ttm_device *bdev = &xe->ttm;
|
struct ttm_device *bdev = &xe->ttm;
|
||||||
@@ -134,6 +186,9 @@ void xe_debugfs_register(struct xe_device *xe)
|
|||||||
debugfs_create_file("forcewake_all", 0400, root, xe,
|
debugfs_create_file("forcewake_all", 0400, root, xe,
|
||||||
&forcewake_all_fops);
|
&forcewake_all_fops);
|
||||||
|
|
||||||
|
debugfs_create_file("wedged_mode", 0400, root, xe,
|
||||||
|
&wedged_mode_fops);
|
||||||
|
|
||||||
for (mem_type = XE_PL_VRAM0; mem_type <= XE_PL_VRAM1; ++mem_type) {
|
for (mem_type = XE_PL_VRAM0; mem_type <= XE_PL_VRAM1; ++mem_type) {
|
||||||
man = ttm_manager_type(bdev, mem_type);
|
man = ttm_manager_type(bdev, mem_type);
|
||||||
|
|
||||||
|
|||||||
@@ -506,6 +506,8 @@ int xe_device_probe_early(struct xe_device *xe)
|
|||||||
if (err)
|
if (err)
|
||||||
return err;
|
return err;
|
||||||
|
|
||||||
|
xe->wedged.mode = xe_modparam.wedged_mode;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -769,7 +771,7 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
|
|||||||
* xe_device_declare_wedged - Declare device wedged
|
* xe_device_declare_wedged - Declare device wedged
|
||||||
* @xe: xe device instance
|
* @xe: xe device instance
|
||||||
*
|
*
|
||||||
* This is a final state that can only be cleared with a module
|
* This is a final state that can only be cleared with a module
|
||||||
* re-probe (unbind + bind).
|
* re-probe (unbind + bind).
|
||||||
* In this state every IOCTL will be blocked so the GT cannot be used.
|
* In this state every IOCTL will be blocked so the GT cannot be used.
|
||||||
* In general it will be called upon any critical error such as gt reset
|
* In general it will be called upon any critical error such as gt reset
|
||||||
@@ -781,10 +783,12 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
|
|||||||
*/
|
*/
|
||||||
void xe_device_declare_wedged(struct xe_device *xe)
|
void xe_device_declare_wedged(struct xe_device *xe)
|
||||||
{
|
{
|
||||||
if (xe_modparam.wedged_mode == 0)
|
if (xe->wedged.mode == 0) {
|
||||||
|
drm_dbg(&xe->drm, "Wedged mode is forcebly disabled\n");
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (!atomic_xchg(&xe->wedged, 1)) {
|
if (!atomic_xchg(&xe->wedged.flag, 1)) {
|
||||||
xe->needs_flr_on_fini = true;
|
xe->needs_flr_on_fini = true;
|
||||||
drm_err(&xe->drm,
|
drm_err(&xe->drm,
|
||||||
"CRITICAL: Xe has declared device %s as wedged.\n"
|
"CRITICAL: Xe has declared device %s as wedged.\n"
|
||||||
|
|||||||
@@ -169,7 +169,7 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address);
|
|||||||
|
|
||||||
static inline bool xe_device_wedged(struct xe_device *xe)
|
static inline bool xe_device_wedged(struct xe_device *xe)
|
||||||
{
|
{
|
||||||
return atomic_read(&xe->wedged);
|
return atomic_read(&xe->wedged.flag);
|
||||||
}
|
}
|
||||||
|
|
||||||
void xe_device_declare_wedged(struct xe_device *xe);
|
void xe_device_declare_wedged(struct xe_device *xe);
|
||||||
|
|||||||
@@ -459,8 +459,13 @@ struct xe_device {
|
|||||||
/** @needs_flr_on_fini: requests function-reset on fini */
|
/** @needs_flr_on_fini: requests function-reset on fini */
|
||||||
bool needs_flr_on_fini;
|
bool needs_flr_on_fini;
|
||||||
|
|
||||||
/** @wedged: Xe device faced a critical error and is now blocked. */
|
/** @wedged: Struct to control Wedged States and mode */
|
||||||
atomic_t wedged;
|
struct {
|
||||||
|
/** @wedged.flag: Xe device faced a critical error and is now blocked. */
|
||||||
|
atomic_t flag;
|
||||||
|
/** @wedged.mode: Mode controlled by kernel parameter and debugfs */
|
||||||
|
int mode;
|
||||||
|
} wedged;
|
||||||
|
|
||||||
/* private: */
|
/* private: */
|
||||||
|
|
||||||
|
|||||||
@@ -9,6 +9,7 @@
|
|||||||
|
|
||||||
#include <generated/xe_wa_oob.h>
|
#include <generated/xe_wa_oob.h>
|
||||||
|
|
||||||
|
#include "abi/guc_actions_abi.h"
|
||||||
#include "regs/xe_engine_regs.h"
|
#include "regs/xe_engine_regs.h"
|
||||||
#include "regs/xe_gt_regs.h"
|
#include "regs/xe_gt_regs.h"
|
||||||
#include "regs/xe_guc_regs.h"
|
#include "regs/xe_guc_regs.h"
|
||||||
@@ -16,11 +17,11 @@
|
|||||||
#include "xe_gt.h"
|
#include "xe_gt.h"
|
||||||
#include "xe_gt_ccs_mode.h"
|
#include "xe_gt_ccs_mode.h"
|
||||||
#include "xe_guc.h"
|
#include "xe_guc.h"
|
||||||
|
#include "xe_guc_ct.h"
|
||||||
#include "xe_hw_engine.h"
|
#include "xe_hw_engine.h"
|
||||||
#include "xe_lrc.h"
|
#include "xe_lrc.h"
|
||||||
#include "xe_map.h"
|
#include "xe_map.h"
|
||||||
#include "xe_mmio.h"
|
#include "xe_mmio.h"
|
||||||
#include "xe_module.h"
|
|
||||||
#include "xe_platform_types.h"
|
#include "xe_platform_types.h"
|
||||||
#include "xe_wa.h"
|
#include "xe_wa.h"
|
||||||
|
|
||||||
@@ -441,6 +442,7 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads)
|
|||||||
|
|
||||||
static void guc_policies_init(struct xe_guc_ads *ads)
|
static void guc_policies_init(struct xe_guc_ads *ads)
|
||||||
{
|
{
|
||||||
|
struct xe_device *xe = ads_to_xe(ads);
|
||||||
u32 global_flags = 0;
|
u32 global_flags = 0;
|
||||||
|
|
||||||
ads_blob_write(ads, policies.dpc_promote_time,
|
ads_blob_write(ads, policies.dpc_promote_time,
|
||||||
@@ -448,7 +450,7 @@ static void guc_policies_init(struct xe_guc_ads *ads)
|
|||||||
ads_blob_write(ads, policies.max_num_work_items,
|
ads_blob_write(ads, policies.max_num_work_items,
|
||||||
GLOBAL_POLICY_MAX_NUM_WI);
|
GLOBAL_POLICY_MAX_NUM_WI);
|
||||||
|
|
||||||
if (xe_modparam.wedged_mode == 2)
|
if (xe->wedged.mode == 2)
|
||||||
global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
|
global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
|
||||||
|
|
||||||
ads_blob_write(ads, policies.global_flags, global_flags);
|
ads_blob_write(ads, policies.global_flags, global_flags);
|
||||||
@@ -806,3 +808,57 @@ void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads)
|
|||||||
{
|
{
|
||||||
guc_populate_golden_lrc(ads);
|
guc_populate_golden_lrc(ads);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int guc_ads_action_update_policies(struct xe_guc_ads *ads, u32 policy_offset)
|
||||||
|
{
|
||||||
|
struct xe_guc_ct *ct = &ads_to_guc(ads)->ct;
|
||||||
|
u32 action[] = {
|
||||||
|
XE_GUC_ACTION_GLOBAL_SCHED_POLICY_CHANGE,
|
||||||
|
policy_offset
|
||||||
|
};
|
||||||
|
|
||||||
|
return xe_guc_ct_send(ct, action, ARRAY_SIZE(action), 0, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* xe_guc_ads_scheduler_policy_toggle_reset - Toggle reset policy
|
||||||
|
* @ads: Additional data structures object
|
||||||
|
*
|
||||||
|
* This function updates the GuC's engine reset policy based on wedged.mode.
|
||||||
|
*
|
||||||
|
* Return: 0 on success, and negative error code otherwise.
|
||||||
|
*/
|
||||||
|
int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads)
|
||||||
|
{
|
||||||
|
struct xe_device *xe = ads_to_xe(ads);
|
||||||
|
struct xe_gt *gt = ads_to_gt(ads);
|
||||||
|
struct xe_tile *tile = gt_to_tile(gt);
|
||||||
|
struct guc_policies *policies;
|
||||||
|
struct xe_bo *bo;
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
policies = kmalloc(sizeof(*policies), GFP_KERNEL);
|
||||||
|
if (!policies)
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
|
policies->dpc_promote_time = ads_blob_read(ads, policies.dpc_promote_time);
|
||||||
|
policies->max_num_work_items = ads_blob_read(ads, policies.max_num_work_items);
|
||||||
|
policies->is_valid = 1;
|
||||||
|
if (xe->wedged.mode == 2)
|
||||||
|
policies->global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
|
||||||
|
else
|
||||||
|
policies->global_flags &= ~GLOBAL_POLICY_DISABLE_ENGINE_RESET;
|
||||||
|
|
||||||
|
bo = xe_managed_bo_create_from_data(xe, tile, policies, sizeof(struct guc_policies),
|
||||||
|
XE_BO_FLAG_VRAM_IF_DGFX(tile) |
|
||||||
|
XE_BO_FLAG_GGTT);
|
||||||
|
if (IS_ERR(bo)) {
|
||||||
|
ret = PTR_ERR(bo);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = guc_ads_action_update_policies(ads, xe_bo_ggtt_addr(bo));
|
||||||
|
out:
|
||||||
|
kfree(policies);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|||||||
@@ -13,5 +13,6 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads);
|
|||||||
void xe_guc_ads_populate(struct xe_guc_ads *ads);
|
void xe_guc_ads_populate(struct xe_guc_ads *ads);
|
||||||
void xe_guc_ads_populate_minimal(struct xe_guc_ads *ads);
|
void xe_guc_ads_populate_minimal(struct xe_guc_ads *ads);
|
||||||
void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads);
|
void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads);
|
||||||
|
int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -35,7 +35,6 @@
|
|||||||
#include "xe_macros.h"
|
#include "xe_macros.h"
|
||||||
#include "xe_map.h"
|
#include "xe_map.h"
|
||||||
#include "xe_mocs.h"
|
#include "xe_mocs.h"
|
||||||
#include "xe_module.h"
|
|
||||||
#include "xe_ring_ops_types.h"
|
#include "xe_ring_ops_types.h"
|
||||||
#include "xe_sched_job.h"
|
#include "xe_sched_job.h"
|
||||||
#include "xe_trace.h"
|
#include "xe_trace.h"
|
||||||
@@ -868,26 +867,38 @@ static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
|
|||||||
xe_sched_tdr_queue_imm(&q->guc->sched);
|
xe_sched_tdr_queue_imm(&q->guc->sched);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void guc_submit_wedged(struct xe_guc *guc)
|
static bool guc_submit_hint_wedged(struct xe_guc *guc)
|
||||||
{
|
{
|
||||||
|
struct xe_device *xe = guc_to_xe(guc);
|
||||||
struct xe_exec_queue *q;
|
struct xe_exec_queue *q;
|
||||||
unsigned long index;
|
unsigned long index;
|
||||||
int err;
|
int err;
|
||||||
|
|
||||||
xe_device_declare_wedged(guc_to_xe(guc));
|
if (xe->wedged.mode != 2)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (xe_device_wedged(xe))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
xe_device_declare_wedged(xe);
|
||||||
|
|
||||||
xe_guc_submit_reset_prepare(guc);
|
xe_guc_submit_reset_prepare(guc);
|
||||||
xe_guc_ct_stop(&guc->ct);
|
xe_guc_ct_stop(&guc->ct);
|
||||||
|
|
||||||
err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
|
err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
|
||||||
guc_submit_wedged_fini, guc);
|
guc_submit_wedged_fini, guc);
|
||||||
if (err)
|
if (err) {
|
||||||
return;
|
drm_err(&xe->drm, "Failed to register xe_guc_submit clean-up on wedged.mode=2. Although device is wedged.\n");
|
||||||
|
return true; /* Device is wedged anyway */
|
||||||
|
}
|
||||||
|
|
||||||
mutex_lock(&guc->submission_state.lock);
|
mutex_lock(&guc->submission_state.lock);
|
||||||
xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
|
xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
|
||||||
if (xe_exec_queue_get_unless_zero(q))
|
if (xe_exec_queue_get_unless_zero(q))
|
||||||
set_exec_queue_wedged(q);
|
set_exec_queue_wedged(q);
|
||||||
mutex_unlock(&guc->submission_state.lock);
|
mutex_unlock(&guc->submission_state.lock);
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
|
static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
|
||||||
@@ -898,15 +909,12 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
|
|||||||
struct xe_guc *guc = exec_queue_to_guc(q);
|
struct xe_guc *guc = exec_queue_to_guc(q);
|
||||||
struct xe_device *xe = guc_to_xe(guc);
|
struct xe_device *xe = guc_to_xe(guc);
|
||||||
struct xe_gpu_scheduler *sched = &ge->sched;
|
struct xe_gpu_scheduler *sched = &ge->sched;
|
||||||
bool wedged = xe_device_wedged(xe);
|
bool wedged;
|
||||||
|
|
||||||
xe_assert(xe, xe_exec_queue_is_lr(q));
|
xe_assert(xe, xe_exec_queue_is_lr(q));
|
||||||
trace_xe_exec_queue_lr_cleanup(q);
|
trace_xe_exec_queue_lr_cleanup(q);
|
||||||
|
|
||||||
if (!wedged && xe_modparam.wedged_mode == 2) {
|
wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
|
||||||
guc_submit_wedged(exec_queue_to_guc(q));
|
|
||||||
wedged = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Kill the run_job / process_msg entry points */
|
/* Kill the run_job / process_msg entry points */
|
||||||
xe_sched_submission_stop(sched);
|
xe_sched_submission_stop(sched);
|
||||||
@@ -957,7 +965,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
|
|||||||
struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
|
struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
|
||||||
int err = -ETIME;
|
int err = -ETIME;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
bool wedged = xe_device_wedged(xe);
|
bool wedged;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* TDR has fired before free job worker. Common if exec queue
|
* TDR has fired before free job worker. Common if exec queue
|
||||||
@@ -981,10 +989,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
|
|||||||
|
|
||||||
trace_xe_sched_job_timedout(job);
|
trace_xe_sched_job_timedout(job);
|
||||||
|
|
||||||
if (!wedged && xe_modparam.wedged_mode == 2) {
|
wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
|
||||||
guc_submit_wedged(exec_queue_to_guc(q));
|
|
||||||
wedged = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Kill the run_job entry point */
|
/* Kill the run_job entry point */
|
||||||
xe_sched_submission_stop(sched);
|
xe_sched_submission_stop(sched);
|
||||||
|
|||||||
Reference in New Issue
Block a user