mirror of
https://github.com/raspberrypi/linux.git
synced 2025-12-19 00:04:18 +00:00
According to: https://gcc.gnu.org/onlinedocs/gcc/Size-of-an-asm.html the usage of asm pseudo directives in the asm template can confuse the compiler to wrongly estimate the size of the generated code. The LOCK_PREFIX macro expands to several asm pseudo directives, so its usage in atomic locking insns causes instruction length estimates to fail significantly (the specially instrumented compiler reports the estimated length of these asm templates to be 6 instructions long). This incorrect estimate further causes unoptimal inlining decisions, un-optimal instruction scheduling and un-optimal code block alignments for functions that use these locking primitives. Use asm_inline instead: https://gcc.gnu.org/pipermail/gcc-patches/2018-December/512349.html which is a feature that makes GCC pretend some inline assembler code is tiny (while it would think it is huge), instead of just asm. For code size estimation, the size of the asm is then taken as the minimum size of one instruction, ignoring how many instructions compiler thinks it is. bloat-o-meter reports the following code size increase (x86_64 defconfig, gcc-14.2.1): add/remove: 82/283 grow/shrink: 870/372 up/down: 76272/-43618 (32654) Total: Before=22770320, After=22802974, chg +0.14% with top grows (>500 bytes): Function old new delta ---------------------------------------------------------------- copy_process 6465 10191 +3726 balance_dirty_pages_ratelimited_flags 237 2949 +2712 icl_plane_update_noarm 5800 7969 +2169 samsung_input_mapping 3375 5170 +1795 ext4_do_update_inode.isra - 1526 +1526 __schedule 2416 3472 +1056 __i915_vma_resource_unhold - 946 +946 sched_mm_cid_after_execve 175 1097 +922 __do_sys_membarrier - 862 +862 filemap_fault 2666 3462 +796 nl80211_send_wiphy 11185 11874 +689 samsung_input_mapping.cold 900 1500 +600 virtio_gpu_queue_fenced_ctrl_buffer 839 1410 +571 ilk_update_pipe_csc 1201 1735 +534 enable_step - 525 +525 icl_color_commit_noarm 1334 1847 +513 tg3_read_bc_ver - 501 +501 and top shrinks (>500 bytes): Function old new delta ---------------------------------------------------------------- nl80211_send_iftype_data 580 - -580 samsung_gamepad_input_mapping.isra.cold 604 - -604 virtio_gpu_queue_ctrl_sgs 724 - -724 tg3_get_invariants 9218 8376 -842 __i915_vma_resource_unhold.part 899 - -899 ext4_mark_iloc_dirty 1735 106 -1629 samsung_gamepad_input_mapping.isra 2046 - -2046 icl_program_input_csc 2203 - -2203 copy_mm 2242 - -2242 balance_dirty_pages 2657 - -2657 These code size changes can be grouped into 4 groups: a) some functions now include once-called functions in full or in part. These are: Function old new delta ---------------------------------------------------------------- copy_process 6465 10191 +3726 balance_dirty_pages_ratelimited_flags 237 2949 +2712 icl_plane_update_noarm 5800 7969 +2169 samsung_input_mapping 3375 5170 +1795 ext4_do_update_inode.isra - 1526 +1526 that now include: Function old new delta ---------------------------------------------------------------- copy_mm 2242 - -2242 balance_dirty_pages 2657 - -2657 icl_program_input_csc 2203 - -2203 samsung_gamepad_input_mapping.isra 2046 - -2046 ext4_mark_iloc_dirty 1735 106 -1629 b) ISRA [interprocedural scalar replacement of aggregates, interprocedural pass that removes unused function return values (turning functions returning a value which is never used into void functions) and removes unused function parameters. It can also replace an aggregate parameter by a set of other parameters representing part of the original, turning those passed by reference into new ones which pass the value directly.] Top grows and shrinks of this group are listed below: Function old new delta ---------------------------------------------------------------- ext4_do_update_inode.isra - 1526 +1526 nfs4_begin_drain_session.isra - 249 +249 nfs4_end_drain_session.isra - 168 +168 __guc_action_register_multi_lrc_v70.isra 335 500 +165 __i915_gem_free_objects.isra - 144 +144 ... membarrier_register_private_expedited.isra 108 - -108 syncobj_eventfd_entry_func.isra 445 314 -131 __ext4_sb_bread_gfp.isra 140 - -140 class_preempt_notrace_destructor.isra 145 - -145 p9_fid_put.isra 151 - -151 __mm_cid_try_get.isra 238 - -238 membarrier_global_expedited.isra 294 - -294 mm_cid_get.isra 295 - -295 samsung_gamepad_input_mapping.isra.cold 604 - -604 samsung_gamepad_input_mapping.isra 2046 - -2046 c) different split points of hot/cold split that just move code around: Top grows and shrinks of this group are listed below: Function old new delta ---------------------------------------------------------------- samsung_input_mapping.cold 900 1500 +600 __i915_request_reset.cold 311 389 +78 nfs_update_inode.cold 77 153 +76 __do_sys_swapon.cold 404 455 +51 copy_process.cold - 45 +45 tg3_get_invariants.cold 73 115 +42 ... hibernate.cold 671 643 -28 copy_mm.cold 31 - -31 software_resume.cold 249 207 -42 io_poll_wake.cold 106 54 -52 samsung_gamepad_input_mapping.isra.cold 604 - -604 c) full inline of small functions with locking insn (~150 cases). These bring in most of the code size increase because the removed function code is now inlined in multiple places. E.g.: 0000000000a50e10 <release_devnum>: a50e10: 48 63 07 movslq (%rdi),%rax a50e13: 85 c0 test %eax,%eax a50e15: 7e 10 jle a50e27 <release_devnum+0x17> a50e17: 48 8b 4f 50 mov 0x50(%rdi),%rcx a50e1b: f0 48 0f b3 41 50 lock btr %rax,0x50(%rcx) a50e21: c7 07 ff ff ff ff movl $0xffffffff,(%rdi) a50e27: e9 00 00 00 00 jmp a50e2c <release_devnum+0x1c> a50e28: R_X86_64_PLT32 __x86_return_thunk-0x4 a50e2c: 0f 1f 40 00 nopl 0x0(%rax) is now fully inlined into the caller function. This is desirable due to the per function overhead of CPU bug mitigations like retpolines. FTR a) with -Os (where generated code size really matters) x86_64 defconfig object file decreases by 24.388 kbytes, representing 0.1% code size decrease: text data bss dec hex filename 23883860 4617284 814212 29315356 1bf511c vmlinux-old.o 23859472 4615404 814212 29289088 1beea80 vmlinux-new.o FTR b) clang recognizes "asm inline", but there was no difference in code sizes: text data bss dec hex filename 27577163 4503078 807732 32887973 1f5d4a5 vmlinux-clang-patched.o 27577181 4503078 807732 32887991 1f5d4b7 vmlinux-clang-unpatched.o The performance impact of the patch was assessed by recompiling fedora-41 6.13.5 kernel and running lmbench with old and new kernel. The most noticeable improvements were: Process fork+exit: 270.0952 microseconds Process fork+execve: 2620.3333 microseconds Process fork+/bin/sh -c: 6781.0000 microseconds File /usr/tmp/XXX write bandwidth: 1780350 KB/sec Pagefaults on /usr/tmp/XXX: 0.3875 microseconds to: Process fork+exit: 298.6842 microseconds Process fork+execve: 1662.7500 microseconds Process fork+/bin/sh -c: 2127.6667 microseconds File /usr/tmp/XXX write bandwidth: 1950077 KB/sec Pagefaults on /usr/tmp/XXX: 0.1958 microseconds and from: Socket bandwidth using localhost 0.000001 2.52 MB/sec 0.000064 163.02 MB/sec 0.000128 321.70 MB/sec 0.000256 630.06 MB/sec 0.000512 1207.07 MB/sec 0.001024 2004.06 MB/sec 0.001437 2475.43 MB/sec 10.000000 5817.34 MB/sec Avg xfer: 3.2KB, 41.8KB in 1.2230 millisecs, 34.15 MB/sec AF_UNIX sock stream bandwidth: 9850.01 MB/sec Pipe bandwidth: 4631.28 MB/sec to: Socket bandwidth using localhost 0.000001 3.13 MB/sec 0.000064 187.08 MB/sec 0.000128 324.12 MB/sec 0.000256 618.51 MB/sec 0.000512 1137.13 MB/sec 0.001024 1962.95 MB/sec 0.001437 2458.27 MB/sec 10.000000 6168.08 MB/sec Avg xfer: 3.2KB, 41.8KB in 1.0060 millisecs, 41.52 MB/sec AF_UNIX sock stream bandwidth: 9921.68 MB/sec Pipe bandwidth: 4649.96 MB/sec [ mingo: Prettified the changelog a bit. ] Signed-off-by: Uros Bizjak <ubizjak@gmail.com> Signed-off-by: Ingo Molnar <mingo@kernel.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Brian Gerst <brgerst@gmail.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Link: https://lore.kernel.org/r/20250309170955.48919-1-ubizjak@gmail.com
442 lines
11 KiB
C
442 lines
11 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _ASM_X86_BITOPS_H
|
|
#define _ASM_X86_BITOPS_H
|
|
|
|
/*
|
|
* Copyright 1992, Linus Torvalds.
|
|
*
|
|
* Note: inlines with more than a single statement should be marked
|
|
* __always_inline to avoid problems with older gcc's inlining heuristics.
|
|
*/
|
|
|
|
#ifndef _LINUX_BITOPS_H
|
|
#error only <linux/bitops.h> can be included directly
|
|
#endif
|
|
|
|
#include <linux/compiler.h>
|
|
#include <asm/alternative.h>
|
|
#include <asm/rmwcc.h>
|
|
#include <asm/barrier.h>
|
|
|
|
#if BITS_PER_LONG == 32
|
|
# define _BITOPS_LONG_SHIFT 5
|
|
#elif BITS_PER_LONG == 64
|
|
# define _BITOPS_LONG_SHIFT 6
|
|
#else
|
|
# error "Unexpected BITS_PER_LONG"
|
|
#endif
|
|
|
|
#define BIT_64(n) (U64_C(1) << (n))
|
|
|
|
/*
|
|
* These have to be done with inline assembly: that way the bit-setting
|
|
* is guaranteed to be atomic. All bit operations return 0 if the bit
|
|
* was cleared before the operation and != 0 if it was not.
|
|
*
|
|
* bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
|
|
*/
|
|
|
|
#define RLONG_ADDR(x) "m" (*(volatile long *) (x))
|
|
#define WBYTE_ADDR(x) "+m" (*(volatile char *) (x))
|
|
|
|
#define ADDR RLONG_ADDR(addr)
|
|
|
|
/*
|
|
* We do the locked ops that don't return the old value as
|
|
* a mask operation on a byte.
|
|
*/
|
|
#define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3))
|
|
#define CONST_MASK(nr) (1 << ((nr) & 7))
|
|
|
|
static __always_inline void
|
|
arch_set_bit(long nr, volatile unsigned long *addr)
|
|
{
|
|
if (__builtin_constant_p(nr)) {
|
|
asm_inline volatile(LOCK_PREFIX "orb %b1,%0"
|
|
: CONST_MASK_ADDR(nr, addr)
|
|
: "iq" (CONST_MASK(nr))
|
|
: "memory");
|
|
} else {
|
|
asm_inline volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
|
|
: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
|
|
}
|
|
}
|
|
|
|
static __always_inline void
|
|
arch___set_bit(unsigned long nr, volatile unsigned long *addr)
|
|
{
|
|
asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
|
|
}
|
|
|
|
static __always_inline void
|
|
arch_clear_bit(long nr, volatile unsigned long *addr)
|
|
{
|
|
if (__builtin_constant_p(nr)) {
|
|
asm_inline volatile(LOCK_PREFIX "andb %b1,%0"
|
|
: CONST_MASK_ADDR(nr, addr)
|
|
: "iq" (~CONST_MASK(nr)));
|
|
} else {
|
|
asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
|
|
: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
|
|
}
|
|
}
|
|
|
|
static __always_inline void
|
|
arch_clear_bit_unlock(long nr, volatile unsigned long *addr)
|
|
{
|
|
barrier();
|
|
arch_clear_bit(nr, addr);
|
|
}
|
|
|
|
static __always_inline void
|
|
arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
|
|
{
|
|
asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
|
|
}
|
|
|
|
static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask,
|
|
volatile unsigned long *addr)
|
|
{
|
|
bool negative;
|
|
asm_inline volatile(LOCK_PREFIX "xorb %2,%1"
|
|
CC_SET(s)
|
|
: CC_OUT(s) (negative), WBYTE_ADDR(addr)
|
|
: "iq" ((char)mask) : "memory");
|
|
return negative;
|
|
}
|
|
#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte
|
|
|
|
static __always_inline void
|
|
arch___clear_bit_unlock(long nr, volatile unsigned long *addr)
|
|
{
|
|
arch___clear_bit(nr, addr);
|
|
}
|
|
|
|
static __always_inline void
|
|
arch___change_bit(unsigned long nr, volatile unsigned long *addr)
|
|
{
|
|
asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
|
|
}
|
|
|
|
static __always_inline void
|
|
arch_change_bit(long nr, volatile unsigned long *addr)
|
|
{
|
|
if (__builtin_constant_p(nr)) {
|
|
asm_inline volatile(LOCK_PREFIX "xorb %b1,%0"
|
|
: CONST_MASK_ADDR(nr, addr)
|
|
: "iq" (CONST_MASK(nr)));
|
|
} else {
|
|
asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
|
|
: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
|
|
}
|
|
}
|
|
|
|
static __always_inline bool
|
|
arch_test_and_set_bit(long nr, volatile unsigned long *addr)
|
|
{
|
|
return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr);
|
|
}
|
|
|
|
static __always_inline bool
|
|
arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr)
|
|
{
|
|
return arch_test_and_set_bit(nr, addr);
|
|
}
|
|
|
|
static __always_inline bool
|
|
arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
|
|
{
|
|
bool oldbit;
|
|
|
|
asm(__ASM_SIZE(bts) " %2,%1"
|
|
CC_SET(c)
|
|
: CC_OUT(c) (oldbit)
|
|
: ADDR, "Ir" (nr) : "memory");
|
|
return oldbit;
|
|
}
|
|
|
|
static __always_inline bool
|
|
arch_test_and_clear_bit(long nr, volatile unsigned long *addr)
|
|
{
|
|
return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr);
|
|
}
|
|
|
|
/*
|
|
* Note: the operation is performed atomically with respect to
|
|
* the local CPU, but not other CPUs. Portable code should not
|
|
* rely on this behaviour.
|
|
* KVM relies on this behaviour on x86 for modifying memory that is also
|
|
* accessed from a hypervisor on the same CPU if running in a VM: don't change
|
|
* this without also updating arch/x86/kernel/kvm.c
|
|
*/
|
|
static __always_inline bool
|
|
arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
|
|
{
|
|
bool oldbit;
|
|
|
|
asm volatile(__ASM_SIZE(btr) " %2,%1"
|
|
CC_SET(c)
|
|
: CC_OUT(c) (oldbit)
|
|
: ADDR, "Ir" (nr) : "memory");
|
|
return oldbit;
|
|
}
|
|
|
|
static __always_inline bool
|
|
arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
|
|
{
|
|
bool oldbit;
|
|
|
|
asm volatile(__ASM_SIZE(btc) " %2,%1"
|
|
CC_SET(c)
|
|
: CC_OUT(c) (oldbit)
|
|
: ADDR, "Ir" (nr) : "memory");
|
|
|
|
return oldbit;
|
|
}
|
|
|
|
static __always_inline bool
|
|
arch_test_and_change_bit(long nr, volatile unsigned long *addr)
|
|
{
|
|
return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr);
|
|
}
|
|
|
|
static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
|
|
{
|
|
return ((1UL << (nr & (BITS_PER_LONG-1))) &
|
|
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
|
|
}
|
|
|
|
static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr)
|
|
{
|
|
bool oldbit;
|
|
|
|
asm volatile("testb %2,%1"
|
|
CC_SET(nz)
|
|
: CC_OUT(nz) (oldbit)
|
|
: "m" (((unsigned char *)addr)[nr >> 3]),
|
|
"i" (1 << (nr & 7))
|
|
:"memory");
|
|
|
|
return oldbit;
|
|
}
|
|
|
|
static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
|
|
{
|
|
bool oldbit;
|
|
|
|
asm volatile(__ASM_SIZE(bt) " %2,%1"
|
|
CC_SET(c)
|
|
: CC_OUT(c) (oldbit)
|
|
: "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
|
|
|
|
return oldbit;
|
|
}
|
|
|
|
static __always_inline bool
|
|
arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
|
|
{
|
|
return __builtin_constant_p(nr) ? constant_test_bit(nr, addr) :
|
|
variable_test_bit(nr, addr);
|
|
}
|
|
|
|
static __always_inline bool
|
|
arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
|
|
{
|
|
return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) :
|
|
variable_test_bit(nr, addr);
|
|
}
|
|
|
|
static __always_inline unsigned long variable__ffs(unsigned long word)
|
|
{
|
|
asm("rep; bsf %1,%0"
|
|
: "=r" (word)
|
|
: ASM_INPUT_RM (word));
|
|
return word;
|
|
}
|
|
|
|
/**
|
|
* __ffs - find first set bit in word
|
|
* @word: The word to search
|
|
*
|
|
* Undefined if no bit exists, so code should check against 0 first.
|
|
*/
|
|
#define __ffs(word) \
|
|
(__builtin_constant_p(word) ? \
|
|
(unsigned long)__builtin_ctzl(word) : \
|
|
variable__ffs(word))
|
|
|
|
static __always_inline unsigned long variable_ffz(unsigned long word)
|
|
{
|
|
asm("rep; bsf %1,%0"
|
|
: "=r" (word)
|
|
: "r" (~word));
|
|
return word;
|
|
}
|
|
|
|
/**
|
|
* ffz - find first zero bit in word
|
|
* @word: The word to search
|
|
*
|
|
* Undefined if no zero exists, so code should check against ~0UL first.
|
|
*/
|
|
#define ffz(word) \
|
|
(__builtin_constant_p(word) ? \
|
|
(unsigned long)__builtin_ctzl(~word) : \
|
|
variable_ffz(word))
|
|
|
|
/*
|
|
* __fls: find last set bit in word
|
|
* @word: The word to search
|
|
*
|
|
* Undefined if no set bit exists, so code should check against 0 first.
|
|
*/
|
|
static __always_inline unsigned long __fls(unsigned long word)
|
|
{
|
|
if (__builtin_constant_p(word))
|
|
return BITS_PER_LONG - 1 - __builtin_clzl(word);
|
|
|
|
asm("bsr %1,%0"
|
|
: "=r" (word)
|
|
: ASM_INPUT_RM (word));
|
|
return word;
|
|
}
|
|
|
|
#undef ADDR
|
|
|
|
#ifdef __KERNEL__
|
|
static __always_inline int variable_ffs(int x)
|
|
{
|
|
int r;
|
|
|
|
#ifdef CONFIG_X86_64
|
|
/*
|
|
* AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
|
|
* dest reg is undefined if x==0, but their CPU architect says its
|
|
* value is written to set it to the same as before, except that the
|
|
* top 32 bits will be cleared.
|
|
*
|
|
* We cannot do this on 32 bits because at the very least some
|
|
* 486 CPUs did not behave this way.
|
|
*/
|
|
asm("bsfl %1,%0"
|
|
: "=r" (r)
|
|
: ASM_INPUT_RM (x), "0" (-1));
|
|
#elif defined(CONFIG_X86_CMOV)
|
|
asm("bsfl %1,%0\n\t"
|
|
"cmovzl %2,%0"
|
|
: "=&r" (r) : "rm" (x), "r" (-1));
|
|
#else
|
|
asm("bsfl %1,%0\n\t"
|
|
"jnz 1f\n\t"
|
|
"movl $-1,%0\n"
|
|
"1:" : "=r" (r) : "rm" (x));
|
|
#endif
|
|
return r + 1;
|
|
}
|
|
|
|
/**
|
|
* ffs - find first set bit in word
|
|
* @x: the word to search
|
|
*
|
|
* This is defined the same way as the libc and compiler builtin ffs
|
|
* routines, therefore differs in spirit from the other bitops.
|
|
*
|
|
* ffs(value) returns 0 if value is 0 or the position of the first
|
|
* set bit if value is nonzero. The first (least significant) bit
|
|
* is at position 1.
|
|
*/
|
|
#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x))
|
|
|
|
/**
|
|
* fls - find last set bit in word
|
|
* @x: the word to search
|
|
*
|
|
* This is defined in a similar way as the libc and compiler builtin
|
|
* ffs, but returns the position of the most significant set bit.
|
|
*
|
|
* fls(value) returns 0 if value is 0 or the position of the last
|
|
* set bit if value is nonzero. The last (most significant) bit is
|
|
* at position 32.
|
|
*/
|
|
static __always_inline int fls(unsigned int x)
|
|
{
|
|
int r;
|
|
|
|
if (__builtin_constant_p(x))
|
|
return x ? 32 - __builtin_clz(x) : 0;
|
|
|
|
#ifdef CONFIG_X86_64
|
|
/*
|
|
* AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
|
|
* dest reg is undefined if x==0, but their CPU architect says its
|
|
* value is written to set it to the same as before, except that the
|
|
* top 32 bits will be cleared.
|
|
*
|
|
* We cannot do this on 32 bits because at the very least some
|
|
* 486 CPUs did not behave this way.
|
|
*/
|
|
asm("bsrl %1,%0"
|
|
: "=r" (r)
|
|
: ASM_INPUT_RM (x), "0" (-1));
|
|
#elif defined(CONFIG_X86_CMOV)
|
|
asm("bsrl %1,%0\n\t"
|
|
"cmovzl %2,%0"
|
|
: "=&r" (r) : "rm" (x), "rm" (-1));
|
|
#else
|
|
asm("bsrl %1,%0\n\t"
|
|
"jnz 1f\n\t"
|
|
"movl $-1,%0\n"
|
|
"1:" : "=r" (r) : "rm" (x));
|
|
#endif
|
|
return r + 1;
|
|
}
|
|
|
|
/**
|
|
* fls64 - find last set bit in a 64-bit word
|
|
* @x: the word to search
|
|
*
|
|
* This is defined in a similar way as the libc and compiler builtin
|
|
* ffsll, but returns the position of the most significant set bit.
|
|
*
|
|
* fls64(value) returns 0 if value is 0 or the position of the last
|
|
* set bit if value is nonzero. The last (most significant) bit is
|
|
* at position 64.
|
|
*/
|
|
#ifdef CONFIG_X86_64
|
|
static __always_inline int fls64(__u64 x)
|
|
{
|
|
int bitpos = -1;
|
|
|
|
if (__builtin_constant_p(x))
|
|
return x ? 64 - __builtin_clzll(x) : 0;
|
|
/*
|
|
* AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
|
|
* dest reg is undefined if x==0, but their CPU architect says its
|
|
* value is written to set it to the same as before.
|
|
*/
|
|
asm("bsrq %1,%q0"
|
|
: "+r" (bitpos)
|
|
: ASM_INPUT_RM (x));
|
|
return bitpos + 1;
|
|
}
|
|
#else
|
|
#include <asm-generic/bitops/fls64.h>
|
|
#endif
|
|
|
|
#include <asm-generic/bitops/sched.h>
|
|
|
|
#include <asm/arch_hweight.h>
|
|
|
|
#include <asm-generic/bitops/const_hweight.h>
|
|
|
|
#include <asm-generic/bitops/instrumented-atomic.h>
|
|
#include <asm-generic/bitops/instrumented-non-atomic.h>
|
|
#include <asm-generic/bitops/instrumented-lock.h>
|
|
|
|
#include <asm-generic/bitops/le.h>
|
|
|
|
#include <asm-generic/bitops/ext2-atomic-setbit.h>
|
|
|
|
#endif /* __KERNEL__ */
|
|
#endif /* _ASM_X86_BITOPS_H */
|