Files
linux/tools/testing/selftests/drivers/net/stats.py
Jakub Kicinski 31eae6d995 selftests: drv-net: test drivers sleeping in ndo_get_stats64
Most of our tests use rtnetlink to read device stats, so they
don't expose the drivers much to paths in which device stats
are read under RCU. Add tests which hammer procfs reads to
make sure drivers:
 - don't sleep while reporting stats,
 - can handle parallel reads,
 - can handle device going down while reading.

Set ifname on the env class in NetDrvEnv, we already do that
in NetDrvEpEnv.

  KTAP version 1
  1..7
  ok 1 stats.check_pause
  ok 2 stats.check_fec
  ok 3 stats.pkt_byte_sum
  ok 4 stats.qstat_by_ifindex
  ok 5 stats.check_down
  ok 6 stats.procfs_hammer
  # completed up/down cycles: 6
  ok 7 stats.procfs_downup_hammer
  # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250107022932.2087744-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-01-08 19:36:46 -08:00

275 lines
9.0 KiB
Python
Executable File

#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
import errno
import subprocess
import time
from lib.py import ksft_run, ksft_exit, ksft_pr
from lib.py import ksft_ge, ksft_eq, ksft_is, ksft_in, ksft_lt, ksft_true, ksft_raises
from lib.py import KsftSkipEx, KsftXfailEx
from lib.py import ksft_disruptive
from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError
from lib.py import NetDrvEnv
from lib.py import cmd, ip, defer
# Module-level family objects, instantiated once at import time and shared
# by the test cases below (ethtool, netdev and rtnl families respectively).
ethnl = EthtoolFamily()
netfam = NetdevFamily()
rtnl = RtnlFamily()
def check_pause(cfg) -> None:
    """
    Check that a device which answers the pause GET also reports pause stats.

    Raises KsftXfailEx when the plain pause GET fails with EOPNOTSUPP;
    any other NlError is propagated unchanged.
    """
    hdr = {"dev-index": cfg.ifindex}

    # Probe without the stats flag first, so that "pause not supported"
    # can be told apart from "pause supported, but no stats".
    try:
        ethnl.pause_get({"header": hdr})
    except NlError as e:
        if e.error != errno.EOPNOTSUPP:
            raise
        raise KsftXfailEx("pause not supported by the device")

    reply = ethnl.pause_get({"header": {**hdr, "flags": {'stats'}}})
    ksft_true(reply['stats'], "driver does not report stats")
def check_fec(cfg) -> None:
    """
    Check that a device which answers the FEC GET also reports FEC stats.

    Raises KsftXfailEx when the plain FEC GET fails with EOPNOTSUPP;
    any other NlError is propagated unchanged.
    """
    plain_req = {"header": {"dev-index": cfg.ifindex}}
    stats_req = {"header": {"dev-index": cfg.ifindex, "flags": {'stats'}}}

    # First request is only a capability probe, its reply is not used.
    try:
        ethnl.fec_get(plain_req)
    except NlError as e:
        unsupported = e.error == errno.EOPNOTSUPP
        if not unsupported:
            raise
        raise KsftXfailEx("FEC not supported by the device")

    fec = ethnl.fec_get(stats_req)
    ksft_true(fec['stats'], "driver does not report stats")
def pkt_byte_sum(cfg) -> None:
    """
    Cross-check the basic packet / byte counters from qstats against the
    rtnl stats64 of the same device.

    Raises KsftSkipEx when the qstats dump has no entry for the device.
    """
    basic_keys = ['tx-packets', 'tx-bytes', 'rx-packets', 'rx-bytes']

    def fetch_qstat(env):
        # Entry of the full qstats dump which belongs to the device under
        # test, or None if the dump is empty / has no such entry.
        dump = netfam.qstats_get({}, dump=True)
        return next((qs for qs in (dump or [])
                     if qs["ifindex"] == env.ifindex), None)

    def first_delta(rstat, qstat):
        # Difference (rtnl - qstat) of the first basic counter which
        # does not match, 0 if all of them match.
        for key in basic_keys:
            delta = rstat[key] - qstat[key]
            if delta:
                return delta
        return 0

    qstat = fetch_qstat(cfg)
    if qstat is None:
        raise KsftSkipEx("qstats not supported by the device")

    for key in basic_keys:
        ksft_in(key, qstat, "Drivers should always report basic keys")

    # rtnl stats and qstats must agree, but the interface may be up and
    # passing traffic. Alternate between the two sources and require that
    # whichever was fetched later is never behind the earlier one.
    for _ in range(10):
        rtstat = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64']
        if first_delta(rtstat, qstat) < 0:
            raise Exception("RTNL stats are lower, fetched later")

        qstat = fetch_qstat(cfg)
        if first_delta(rtstat, qstat) > 0:
            raise Exception("Qstats are lower, fetched later")
def qstat_by_ifindex(cfg) -> None:
    """
    Validate qstats GET by ifindex against full dumps, sanity check the
    per-queue dump, and exercise requests with invalid ifindexes.

    Raises KsftSkipEx when the full dump contains no device at all.
    """
    # Construct a map ifindex -> [dump, by-index, dump]
    ifindexes = {}
    for entry in netfam.qstats_get({}, dump=True):
        ifindexes[entry['ifindex']] = [entry, None, None]

    for ifindex in ifindexes:
        entry = netfam.qstats_get({"ifindex": ifindex}, dump=True)
        ksft_eq(len(entry), 1)
        ifindexes[entry[0]['ifindex']][1] = entry[0]

    for entry in netfam.qstats_get({}, dump=True):
        ifindexes[entry['ifindex']][2] = entry

    if not ifindexes:
        raise KsftSkipEx("No ifindex supports qstats")

    # Now make sure the stats match/make sense, the three readings were
    # taken in order so none of the values may go backwards.
    for first, by_index, last in ifindexes.values():
        all_keys = first.keys() | by_index.keys() | last.keys()

        for key in all_keys:
            ksft_ge(by_index[key], first[key], comment="bad key: " + key)
            ksft_ge(last[key], by_index[key], comment="bad key: " + key)

    # Sanity check the dumps
    queues = NetdevFamily(recv_size=4096).qstats_get({"scope": "queue"}, dump=True)
    # Reformat the output into {ifindex: {rx: [id, id, ...], tx: [id, id, ...]}}
    parsed = {}
    for entry in queues:
        per_dev = parsed.setdefault(entry["ifindex"], {"rx": [], "tx": []})
        per_dev[entry["queue-type"]].append(entry['queue-id'])
    # Now, validate
    for ids_by_type in parsed.values():
        for qtype in ['rx', 'tx']:
            ids = ids_by_type[qtype]
            ksft_eq(len(ids), len(set(ids)),
                    comment="repeated queue keys")
            # A device may have reported queues of only one type, in which
            # case the other list is empty; max() of an empty list would
            # raise ValueError, so fall back to -1 (i.e. "0 queues").
            ksft_eq(len(ids), max(ids, default=-1) + 1,
                    comment="missing queue keys")

    # Test invalid dumps
    # 0 is invalid
    with ksft_raises(NlError) as cm:
        netfam.qstats_get({"ifindex": 0}, dump=True)
    ksft_eq(cm.exception.nl_msg.error, -errno.ERANGE)
    ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex')

    # loopback has no stats
    with ksft_raises(NlError) as cm:
        netfam.qstats_get({"ifindex": 1}, dump=True)
    ksft_eq(cm.exception.nl_msg.error, -errno.EOPNOTSUPP)
    ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex')

    # Try to get stats for lowest unused ifindex but not 0
    devs = rtnl.getlink({}, dump=True)
    all_ifindexes = {dev["ifi-index"] for dev in devs}
    lowest = 2
    while lowest in all_ifindexes:
        lowest += 1

    with ksft_raises(NlError) as cm:
        netfam.qstats_get({"ifindex": lowest}, dump=True)
    ksft_eq(cm.exception.nl_msg.error, -errno.ENODEV)
    ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex')
@ksft_disruptive
def check_down(cfg) -> None:
    """
    Check that qstats do not go backwards when the device is brought down.

    Raises KsftSkipEx when the initial qstats GET fails with EOPNOTSUPP;
    any other NlError is propagated unchanged.
    """
    ifname = cfg.dev['ifname']

    try:
        before = netfam.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0]
    except NlError as e:
        if e.error != errno.EOPNOTSUPP:
            raise
        raise KsftSkipEx("qstats not supported by the device")

    # Take the link down for the rest of the test, the defer brings it
    # back up afterwards.
    ip(f"link set dev {ifname} down")
    defer(ip, f"link set dev {ifname} up")

    after = netfam.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0]
    for k, old in before.items():
        ksft_ge(after[k], old, comment=f"{k} went backwards on device down")

    # exercise per-queue API to make sure that "device down" state
    # is handled correctly and doesn't crash
    netfam.qstats_get({"ifindex": cfg.ifindex, "scope": "queue"}, dump=True)
def __run_inf_loop(body):
    """
    Start a shell which executes @body in an endless `while true` loop.

    @body is a shell command list; surrounding whitespace is stripped and
    a terminating ';' is appended if missing. Both stdout and stderr of the
    shell are pipes, the caller is responsible for killing the process.

    Returns the subprocess.Popen object of the shell.
    Raises ValueError if @body is empty or only whitespace.
    """
    body = body.strip()
    if not body:
        # "do ; done" is not valid shell, fail early with a clear message
        # rather than with an IndexError from indexing an empty string.
        raise ValueError("loop body must contain at least one command")
    if not body.endswith(';'):
        body += ';'

    return subprocess.Popen(f"while true; do {body} done", shell=True,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
def __stats_increase_sanely(old, new) -> None:
    """
    Check every counter present in @old against the same key in @new:
    it must not have decreased, and must have grown by less than 2^31.
    """
    wrap_limit = 1 << 31
    for name, before in old.items():
        after = new[name]
        ksft_ge(after, before)
        ksft_lt(after - before, wrap_limit, comment="likely wrapping error")
def procfs_hammer(cfg) -> None:
    """
    Reading stats via procfs only holds the RCU lock, which is not an exclusive
    lock, make sure drivers can handle parallel reads of stats.
    """
    # The loops' stdout is a pipe which nobody reads while the test runs.
    # Discard the output in the shell, otherwise the pipe fills up, cat
    # blocks on write and the "hammering" stops shortly after it started.
    reader = "cat /proc/net/dev > /dev/null"
    one = __run_inf_loop(reader)
    defer(one.kill)
    two = __run_inf_loop(reader)
    defer(two.kill)

    time.sleep(1)
    # Make sure the processes are running
    ksft_is(one.poll(), None)
    ksft_is(two.poll(), None)

    # Sample the rtnl stats twice while the readers are running
    rtstat1 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64']
    time.sleep(2)
    rtstat2 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64']
    __stats_increase_sanely(rtstat1, rtstat2)
    # defers will kill the loops
@ksft_disruptive
def procfs_downup_hammer(cfg) -> None:
    """
    Reading stats via procfs only holds the RCU lock, drivers often try
    to sleep when reading the stats, or don't protect against races.
    """
    # Max out the queues, we'll flip between max and 1
    channels = ethnl.channels_get({'header': {'dev-index': cfg.ifindex}})
    # NOTE(review): the reply presumably omits the combined attributes on
    # devices without combined channels, treat a missing count as 0
    # instead of failing with KeyError.
    if channels.get('combined-count', 0) == 0:
        rx_type = 'rx'
    else:
        rx_type = 'combined'
    cur_queue_cnt = channels[f'{rx_type}-count']
    max_queue_cnt = channels[f'{rx_type}-max']

    cmd(f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}")
    defer(cmd, f"ethtool -L {cfg.ifname} {rx_type} {cur_queue_cnt}")

    # Real test stats
    # Nobody reads this loop's stdout pipe, discard the output in the shell
    # so that cat does not block on a full pipe and stop reading the stats.
    stats = __run_inf_loop("cat /proc/net/dev > /dev/null")
    defer(stats.kill)

    ipset = f"ip link set dev {cfg.ifname}"
    # The loop may get killed while the link is down, bring it back up
    defer(ip, f"link set dev {cfg.ifname} up")
    # The "echo -n 1" lets us count iterations below
    updown = f"{ipset} down; sleep 0.05; {ipset} up; sleep 0.05; " + \
             f"ethtool -L {cfg.ifname} {rx_type} 1; " + \
             f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}; " + \
             "echo -n 1"
    updown = __run_inf_loop(updown)
    kill_updown = defer(updown.kill)

    time.sleep(1)
    # Make sure the processes are running
    ksft_is(stats.poll(), None)
    ksft_is(updown.poll(), None)

    rtstat1 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64']
    # We're looking for crashes, give it extra time
    time.sleep(9)
    rtstat2 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64']
    __stats_increase_sanely(rtstat1, rtstat2)

    # Stop the up/down loop now, so that its output can be collected;
    # one character was printed per completed cycle.
    kill_updown.exec()
    stdout, _ = updown.communicate(timeout=5)
    ksft_pr("completed up/down cycles:", len(stdout.decode('utf-8')))
def main() -> None:
    """
    Run all the stats test cases against the NetDrvEnv device, then exit
    via ksft_exit().
    """
    cases = [
        check_pause,
        check_fec,
        pkt_byte_sum,
        qstat_by_ifindex,
        check_down,
        procfs_hammer,
        procfs_downup_hammer,
    ]

    # Every test case receives the environment as its only argument
    with NetDrvEnv(__file__, queue_count=100) as cfg:
        ksft_run(cases, args=(cfg, ))
    ksft_exit()


if __name__ == "__main__":
    main()