Files
linux/net/sched/act_sample.c
Linus Torvalds 7e68dd7d07 Merge tag 'net-next-6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni:
 "Core:

   - Allow live renaming when an interface is up

   - Add retpoline wrappers for tc, improving considerably the
     performances of complex queue discipline configurations

   - Add inet drop monitor support

   - A few GRO performance improvements

   - Add infrastructure for atomic dev stats, addressing long standing
     data races

   - De-duplicate common code between OVS and conntrack offloading
     infrastructure

   - A bunch of UBSAN_BOUNDS/FORTIFY_SOURCE improvements

   - Netfilter: introduce packet parser for tunneled packets

   - Replace IPVS timer-based estimators with kthreads to scale up the
     workload with the number of available CPUs

   - Add the helper support for connection-tracking OVS offload

  BPF:

   - Support for user defined BPF objects: the use case is to allocate
     own objects, build own object hierarchies and use the building
     blocks to build own data structures flexibly, for example, linked
     lists in BPF

   - Make cgroup local storage available to non-cgroup attached BPF
     programs

   - Avoid unnecessary deadlock detection and failures wrt BPF task
     storage helpers

   - A relevant bunch of BPF verifier fixes and improvements

   - Veristat tool improvements to support custom filtering, sorting,
     and replay of results

   - Add LLVM disassembler as default library for dumping JITed code

   - Lots of new BPF documentation for various BPF maps

   - Add bpf_rcu_read_{,un}lock() support for sleepable programs

   - Add RCU grace period chaining to BPF to wait for the completion of
     access from both sleepable and non-sleepable BPF programs

   - Add support storing struct task_struct objects as kptrs in maps

   - Improve helper UAPI by explicitly defining BPF_FUNC_xxx integer
     values

   - Add libbpf *_opts API-variants for bpf_*_get_fd_by_id() functions

  Protocols:

   - TCP: implement Protective Load Balancing across switch links

   - TCP: allow dynamically disabling TCP-MD5 static key, reverting back
     to fast[er]-path

   - UDP: Introduce optional per-netns hash lookup table

   - IPv6: simplify and cleanup sockets disposal

   - Netlink: support different type policies for each generic netlink
     operation

   - MPTCP: add MSG_FASTOPEN and FastOpen listener side support

   - MPTCP: add netlink notification support for listener sockets events

   - SCTP: add VRF support, allowing sctp sockets binding to VRF devices

   - Add bridging MAC Authentication Bypass (MAB) support

   - Extensions for Ethernet VPN bridging implementation to better
     support multicast scenarios

   - More work for Wi-Fi 7 support, comprising conversion of all the
     existing drivers to internal TX queue usage

   - IPSec: introduce a new offload type (packet offload) allowing
     complete header processing and crypto offloading

   - IPSec: extended ack support for more descriptive XFRM error
     reporting

   - RXRPC: increase SACK table size and move processing into a
     per-local endpoint kernel thread, reducing considerably the
     required locking

   - IEEE 802154: synchronous send frame and extended filtering support,
     initial support for scanning available 15.4 networks

   - Tun: bump the link speed from 10Mbps to 10Gbps

   - Tun/VirtioNet: implement UDP segmentation offload support

  Driver API:

   - PHY/SFP: improve power level switching between standard level 1 and
     the higher power levels

   - New API for netdev <-> devlink_port linkage

   - PTP: convert existing drivers to new frequency adjustment
     implementation

   - DSA: add support for rx offloading

   - Autoload DSA tagging driver when dynamically changing protocol

   - Add new PCP and APPTRUST attributes to Data Center Bridging

   - Add configuration support for 800Gbps link speed

   - Add devlink port function attribute to enable/disable RoCE and
     migratable

   - Extend devlink-rate to support strict prioriry and weighted fair
     queuing

   - Add devlink support to directly reading from region memory

   - New device tree helper to fetch MAC address from nvmem

   - New big TCP helper to simplify temporary header stripping

  New hardware / drivers:

   - Ethernet:
      - Marvel Octeon CNF95N and CN10KB Ethernet Switches
      - Marvel Prestera AC5X Ethernet Switch
      - WangXun 10 Gigabit NIC
      - Motorcomm yt8521 Gigabit Ethernet
      - Microchip ksz9563 Gigabit Ethernet Switch
      - Microsoft Azure Network Adapter
      - Linux Automation 10Base-T1L adapter

   - PHY:
      - Aquantia AQR112 and AQR412
      - Motorcomm YT8531S

   - PTP:
      - Orolia ART-CARD

   - WiFi:
      - MediaTek Wi-Fi 7 (802.11be) devices
      - RealTek rtw8821cu, rtw8822bu, rtw8822cu and rtw8723du USB
        devices

   - Bluetooth:
      - Broadcom BCM4377/4378/4387 Bluetooth chipsets
      - Realtek RTL8852BE and RTL8723DS
      - Cypress.CYW4373A0 WiFi + Bluetooth combo device

  Drivers:

   - CAN:
      - gs_usb: bus error reporting support
      - kvaser_usb: listen only and bus error reporting support

   - Ethernet NICs:
      - Intel (100G):
         - extend action skbedit to RX queue mapping
         - implement devlink-rate support
         - support direct read from memory
      - nVidia/Mellanox (mlx5):
         - SW steering improvements, increasing rules update rate
         - Support for enhanced events compression
         - extend H/W offload packet manipulation capabilities
         - implement IPSec packet offload mode
      - nVidia/Mellanox (mlx4):
         - better big TCP support
      - Netronome Ethernet NICs (nfp):
         - IPsec offload support
         - add support for multicast filter
      - Broadcom:
         - RSS and PTP support improvements
      - AMD/SolarFlare:
         - netlink extened ack improvements
         - add basic flower matches to offload, and related stats
      - Virtual NICs:
         - ibmvnic: introduce affinity hint support
      - small / embedded:
         - FreeScale fec: add initial XDP support
         - Marvel mv643xx_eth: support MII/GMII/RGMII modes for Kirkwood
         - TI am65-cpsw: add suspend/resume support
         - Mediatek MT7986: add RX wireless wthernet dispatch support
         - Realtek 8169: enable GRO software interrupt coalescing per
           default

   - Ethernet high-speed switches:
      - Microchip (sparx5):
         - add support for Sparx5 TC/flower H/W offload via VCAP
      - Mellanox mlxsw:
         - add 802.1X and MAC Authentication Bypass offload support
         - add ip6gre support

   - Embedded Ethernet switches:
      - Mediatek (mtk_eth_soc):
         - improve PCS implementation, add DSA untag support
         - enable flow offload support
      - Renesas:
         - add rswitch R-Car Gen4 gPTP support
      - Microchip (lan966x):
         - add full XDP support
         - add TC H/W offload via VCAP
         - enable PTP on bridge interfaces
      - Microchip (ksz8):
         - add MTU support for KSZ8 series

   - Qualcomm 802.11ax WiFi (ath11k):
      - support configuring channel dwell time during scan

   - MediaTek WiFi (mt76):
      - enable Wireless Ethernet Dispatch (WED) offload support
      - add ack signal support
      - enable coredump support
      - remain_on_channel support

   - Intel WiFi (iwlwifi):
      - enable Wi-Fi 7 Extremely High Throughput (EHT) PHY capabilities
      - 320 MHz channels support

   - RealTek WiFi (rtw89):
      - new dynamic header firmware format support
      - wake-over-WLAN support"

* tag 'net-next-6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2002 commits)
  ipvs: fix type warning in do_div() on 32 bit
  net: lan966x: Remove a useless test in lan966x_ptp_add_trap()
  net: ipa: add IPA v4.7 support
  dt-bindings: net: qcom,ipa: Add SM6350 compatible
  bnxt: Use generic HBH removal helper in tx path
  IPv6/GRO: generic helper to remove temporary HBH/jumbo header in driver
  selftests: forwarding: Add bridge MDB test
  selftests: forwarding: Rename bridge_mdb test
  bridge: mcast: Support replacement of MDB port group entries
  bridge: mcast: Allow user space to specify MDB entry routing protocol
  bridge: mcast: Allow user space to add (*, G) with a source list and filter mode
  bridge: mcast: Add support for (*, G) with a source list and filter mode
  bridge: mcast: Avoid arming group timer when (S, G) corresponds to a source
  bridge: mcast: Add a flag for user installed source entries
  bridge: mcast: Expose __br_multicast_del_group_src()
  bridge: mcast: Expose br_multicast_new_group_src()
  bridge: mcast: Add a centralized error path
  bridge: mcast: Place netlink policy before validation functions
  bridge: mcast: Split (*, G) and (S, G) addition into different functions
  bridge: mcast: Do not derive entry type from its filter mode
  ...
2022-12-13 15:47:48 -08:00

348 lines
8.9 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/act_sample.c - Packet sampling tc action
* Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/gfp.h>
#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_sample.h>
#include <net/tc_act/tc_sample.h>
#include <net/psample.h>
#include <net/pkt_cls.h>
#include <net/tc_wrapper.h>
#include <linux/if_arp.h>
static struct tc_action_ops act_sample_ops;
static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
[TCA_SAMPLE_PARMS] = { .len = sizeof(struct tc_sample) },
[TCA_SAMPLE_RATE] = { .type = NLA_U32 },
[TCA_SAMPLE_TRUNC_SIZE] = { .type = NLA_U32 },
[TCA_SAMPLE_PSAMPLE_GROUP] = { .type = NLA_U32 },
};
static int tcf_sample_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
struct tcf_proto *tp,
u32 flags, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, act_sample_ops.net_id);
bool bind = flags & TCA_ACT_FLAGS_BIND;
struct nlattr *tb[TCA_SAMPLE_MAX + 1];
struct psample_group *psample_group;
u32 psample_group_num, rate, index;
struct tcf_chain *goto_ch = NULL;
struct tc_sample *parm;
struct tcf_sample *s;
bool exists = false;
int ret, err;
if (!nla)
return -EINVAL;
ret = nla_parse_nested_deprecated(tb, TCA_SAMPLE_MAX, nla,
sample_policy, NULL);
if (ret < 0)
return ret;
if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] ||
!tb[TCA_SAMPLE_PSAMPLE_GROUP])
return -EINVAL;
parm = nla_data(tb[TCA_SAMPLE_PARMS]);
index = parm->index;
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
if (exists && bind)
return 0;
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
&act_sample_ops, bind, true, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
} else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
tcf_idr_release(*a, bind);
return -EEXIST;
}
err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
if (err < 0)
goto release_idr;
rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
if (!rate) {
NL_SET_ERR_MSG(extack, "invalid sample rate");
err = -EINVAL;
goto put_chain;
}
psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
psample_group = psample_group_get(net, psample_group_num);
if (!psample_group) {
err = -ENOMEM;
goto put_chain;
}
s = to_sample(*a);
spin_lock_bh(&s->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
s->rate = rate;
s->psample_group_num = psample_group_num;
psample_group = rcu_replace_pointer(s->psample_group, psample_group,
lockdep_is_held(&s->tcf_lock));
if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
s->truncate = true;
s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
}
spin_unlock_bh(&s->tcf_lock);
if (psample_group)
psample_group_put(psample_group);
if (goto_ch)
tcf_chain_put_by_act(goto_ch);
return ret;
put_chain:
if (goto_ch)
tcf_chain_put_by_act(goto_ch);
release_idr:
tcf_idr_release(*a, bind);
return err;
}
static void tcf_sample_cleanup(struct tc_action *a)
{
struct tcf_sample *s = to_sample(a);
struct psample_group *psample_group;
/* last reference to action, no need to lock */
psample_group = rcu_dereference_protected(s->psample_group, 1);
RCU_INIT_POINTER(s->psample_group, NULL);
if (psample_group)
psample_group_put(psample_group);
}
static bool tcf_sample_dev_ok_push(struct net_device *dev)
{
switch (dev->type) {
case ARPHRD_TUNNEL:
case ARPHRD_TUNNEL6:
case ARPHRD_SIT:
case ARPHRD_IPGRE:
case ARPHRD_IP6GRE:
case ARPHRD_VOID:
case ARPHRD_NONE:
return false;
default:
return true;
}
}
TC_INDIRECT_SCOPE int tcf_sample_act(struct sk_buff *skb,
const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_sample *s = to_sample(a);
struct psample_group *psample_group;
struct psample_metadata md = {};
int retval;
tcf_lastuse_update(&s->tcf_tm);
bstats_update(this_cpu_ptr(s->common.cpu_bstats), skb);
retval = READ_ONCE(s->tcf_action);
psample_group = rcu_dereference_bh(s->psample_group);
/* randomly sample packets according to rate */
if (psample_group && (get_random_u32_below(s->rate) == 0)) {
if (!skb_at_tc_ingress(skb)) {
md.in_ifindex = skb->skb_iif;
md.out_ifindex = skb->dev->ifindex;
} else {
md.in_ifindex = skb->dev->ifindex;
}
/* on ingress, the mac header gets popped, so push it back */
if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
skb_push(skb, skb->mac_len);
md.trunc_size = s->truncate ? s->trunc_size : skb->len;
psample_sample_packet(psample_group, skb, s->rate, &md);
if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
skb_pull(skb, skb->mac_len);
}
return retval;
}
static void tcf_sample_stats_update(struct tc_action *a, u64 bytes, u64 packets,
u64 drops, u64 lastuse, bool hw)
{
struct tcf_sample *s = to_sample(a);
struct tcf_t *tm = &s->tcf_tm;
tcf_action_update_stats(a, bytes, packets, drops, hw);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_sample *s = to_sample(a);
struct tc_sample opt = {
.index = s->tcf_index,
.refcnt = refcount_read(&s->tcf_refcnt) - ref,
.bindcnt = atomic_read(&s->tcf_bindcnt) - bind,
};
struct tcf_t t;
spin_lock_bh(&s->tcf_lock);
opt.action = s->tcf_action;
if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
tcf_tm_dump(&t, &s->tcf_tm);
if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t, TCA_SAMPLE_PAD))
goto nla_put_failure;
if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate))
goto nla_put_failure;
if (s->truncate)
if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size))
goto nla_put_failure;
if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num))
goto nla_put_failure;
spin_unlock_bh(&s->tcf_lock);
return skb->len;
nla_put_failure:
spin_unlock_bh(&s->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
static void tcf_psample_group_put(void *priv)
{
struct psample_group *group = priv;
psample_group_put(group);
}
static struct psample_group *
tcf_sample_get_group(const struct tc_action *a,
tc_action_priv_destructor *destructor)
{
struct tcf_sample *s = to_sample(a);
struct psample_group *group;
group = rcu_dereference_protected(s->psample_group,
lockdep_is_held(&s->tcf_lock));
if (group) {
psample_group_take(group);
*destructor = tcf_psample_group_put;
}
return group;
}
static void tcf_offload_sample_get_group(struct flow_action_entry *entry,
const struct tc_action *act)
{
entry->sample.psample_group =
act->ops->get_psample_group(act, &entry->destructor);
entry->destructor_priv = entry->sample.psample_group;
}
static int tcf_sample_offload_act_setup(struct tc_action *act, void *entry_data,
u32 *index_inc, bool bind,
struct netlink_ext_ack *extack)
{
if (bind) {
struct flow_action_entry *entry = entry_data;
entry->id = FLOW_ACTION_SAMPLE;
entry->sample.trunc_size = tcf_sample_trunc_size(act);
entry->sample.truncate = tcf_sample_truncate(act);
entry->sample.rate = tcf_sample_rate(act);
tcf_offload_sample_get_group(entry, act);
*index_inc = 1;
} else {
struct flow_offload_action *fl_action = entry_data;
fl_action->id = FLOW_ACTION_SAMPLE;
}
return 0;
}
static struct tc_action_ops act_sample_ops = {
.kind = "sample",
.id = TCA_ID_SAMPLE,
.owner = THIS_MODULE,
.act = tcf_sample_act,
.stats_update = tcf_sample_stats_update,
.dump = tcf_sample_dump,
.init = tcf_sample_init,
.cleanup = tcf_sample_cleanup,
.get_psample_group = tcf_sample_get_group,
.offload_act_setup = tcf_sample_offload_act_setup,
.size = sizeof(struct tcf_sample),
};
static __net_init int sample_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, act_sample_ops.net_id);
return tc_action_net_init(net, tn, &act_sample_ops);
}
static void __net_exit sample_exit_net(struct list_head *net_list)
{
tc_action_net_exit(net_list, act_sample_ops.net_id);
}
static struct pernet_operations sample_net_ops = {
.init = sample_init_net,
.exit_batch = sample_exit_net,
.id = &act_sample_ops.net_id,
.size = sizeof(struct tc_action_net),
};
static int __init sample_init_module(void)
{
return tcf_register_action(&act_sample_ops, &sample_net_ops);
}
static void __exit sample_cleanup_module(void)
{
tcf_unregister_action(&act_sample_ops, &sample_net_ops);
}
module_init(sample_init_module);
module_exit(sample_cleanup_module);
MODULE_AUTHOR("Yotam Gigi <yotam.gi@gmail.com>");
MODULE_DESCRIPTION("Packet sampling action");
MODULE_LICENSE("GPL v2");