mirror of
https://github.com/raspberrypi/linux.git
synced 2025-12-14 05:49:55 +00:00
Adding uretprobe syscall instead of trap to speed up return probe.
At the moment the uretprobe setup/path is:
- install entry uprobe
- when the uprobe is hit, it overwrites probed function's return address
on stack with address of the trampoline that contains breakpoint
instruction
- the breakpoint trap code handles the uretprobe consumers execution and
jumps back to original return address
This patch replaces the above trampoline's breakpoint instruction with a new
uretprobe syscall call. This syscall does exactly the same job as the trap,
with some extra work:
- syscall trampoline must save original value for rax/r11/rcx registers
on stack - rax is set to syscall number and r11/rcx are changed and
used by syscall instruction
- the syscall code reads the original values of those registers and
restores those values in the task's pt_regs area
- only a call from the trampoline exposed in '[uprobes]' is allowed,
the process will receive a SIGILL signal otherwise
Even with some extra work, using the uretprobes syscall shows speed
improvement (compared to using standard breakpoint):
On Intel (11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz)
current:
uretprobe-nop : 1.498 ± 0.000M/s
uretprobe-push : 1.448 ± 0.001M/s
uretprobe-ret : 0.816 ± 0.001M/s
with the fix:
uretprobe-nop : 1.969 ± 0.002M/s < 31% speed up
uretprobe-push : 1.910 ± 0.000M/s < 31% speed up
uretprobe-ret : 0.934 ± 0.000M/s < 14% speed up
On AMD (AMD Ryzen 7 5700U)
current:
uretprobe-nop : 0.778 ± 0.001M/s
uretprobe-push : 0.744 ± 0.001M/s
uretprobe-ret : 0.540 ± 0.001M/s
with the fix:
uretprobe-nop : 0.860 ± 0.001M/s < 10% speed up
uretprobe-push : 0.818 ± 0.001M/s < 10% speed up
uretprobe-ret : 0.578 ± 0.000M/s < 7% speed up
The performance test spawns a thread that runs a loop which triggers a
uprobe with an attached bpf program that increments the counter that
gets printed in the results above.
The uprobe (and uretprobe) kind is determined by which instruction
is being patched with breakpoint instruction. That's also important
for uretprobes, because uprobe is installed for each uretprobe.
The performance test is part of bpf selftests:
tools/testing/selftests/bpf/run_bench_uprobes.sh
Note that at the moment the uretprobe syscall is supported only for native
64-bit processes; compat processes still use the standard breakpoint.
Note that when shadow stack is enabled the uretprobe syscall returns
via iret, which is slower than returning via sysret, but won't cause a
shadow stack violation.
Link: https://lore.kernel.org/all/20240611112158.40795-4-jolsa@kernel.org/
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
43 lines
1.4 KiB
C
43 lines
1.4 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _ASM_X86_SHSTK_H
|
|
#define _ASM_X86_SHSTK_H
|
|
|
|
#ifndef __ASSEMBLY__
|
|
#include <linux/types.h>
|
|
|
|
struct task_struct;
|
|
struct ksignal;
|
|
|
|
#ifdef CONFIG_X86_USER_SHADOW_STACK
|
|
struct thread_shstk {
|
|
u64 base;
|
|
u64 size;
|
|
};
|
|
|
|
/*
 * Shadow stack entry points available when CONFIG_X86_USER_SHADOW_STACK is
 * set. Only the declarations are given here; the implementations live
 * outside this header.
 */
long shstk_prctl(struct task_struct *task, int option, unsigned long arg2);
void reset_thread_features(void);
unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags,
				       unsigned long stack_size);
void shstk_free(struct task_struct *p);
int setup_signal_shadow_stack(struct ksignal *ksig);
int restore_signal_shadow_stack(void);
int shstk_update_last_frame(unsigned long val);
bool shstk_is_enabled(void);
|
|
#else
|
|
/*
 * Stub used when CONFIG_X86_USER_SHADOW_STACK is not set: the arguments are
 * ignored and every request is rejected with -EINVAL.
 */
static inline long shstk_prctl(struct task_struct *task, int option,
			       unsigned long arg2)
{
	return -EINVAL;
}
|
|
/* Stub for !CONFIG_X86_USER_SHADOW_STACK: intentionally a no-op. */
static inline void reset_thread_features(void)
{
}
|
|
/*
 * Stub used when CONFIG_X86_USER_SHADOW_STACK is not set: nothing is
 * allocated, the arguments are ignored and 0 is always returned.
 */
static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p,
						     unsigned long clone_flags,
						     unsigned long stack_size)
{
	return 0;
}
|
|
/* Stub for !CONFIG_X86_USER_SHADOW_STACK: ignores @p and does nothing. */
static inline void shstk_free(struct task_struct *p)
{
}
|
|
/*
 * Stub for !CONFIG_X86_USER_SHADOW_STACK: ignores @ksig and always
 * returns 0.
 */
static inline int setup_signal_shadow_stack(struct ksignal *ksig)
{
	return 0;
}
|
|
/* Stub for !CONFIG_X86_USER_SHADOW_STACK: always returns 0. */
static inline int restore_signal_shadow_stack(void)
{
	return 0;
}
|
|
/*
 * Stub for !CONFIG_X86_USER_SHADOW_STACK: ignores @val and always
 * returns 0.
 */
static inline int shstk_update_last_frame(unsigned long val)
{
	return 0;
}
|
|
/* Stub for !CONFIG_X86_USER_SHADOW_STACK: always reports false. */
static inline bool shstk_is_enabled(void)
{
	return false;
}
|
|
#endif /* CONFIG_X86_USER_SHADOW_STACK */
|
|
|
|
#endif /* __ASSEMBLY__ */
|
|
|
|
#endif /* _ASM_X86_SHSTK_H */
|