When running the igb with XDP/ZC in busy polling mode with deferral of hard
interrupts, interrupts still happen from time to time. They are caused by
the igb task watchdog, which triggers Rx interrupts periodically.
That mechanism was introduced to cope with skb/memory allocation failures
[1]: the Rx clean functions stop processing the Rx ring when such a failure
occurs, and the task watchdog triggers Rx interrupts periodically in the
hope that memory has become available in the meantime.
This behavior is undesirable for real-time applications, because the
driver-induced Rx interrupts also trigger softirq processing. However, all
real-time packets should be processed by the application itself, which uses
busy polling.
Therefore, trigger the Rx interrupts only on real allocation failures and
introduce a new flag to signal that condition.
Follow the same logic as in commit 8dcf2c2120 ("igc: Get rid of spurious
interrupts").
[1] - https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/?id=3be507547e6177e5c808544bd6a2efa2c7f1d436
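The mechanism, roughly: the zero-copy Rx clean path below sets
IGB_RING_FLAG_RX_ALLOC_FAILED when it cannot allocate an skb, and the task
watchdog (in igb_main.c, not part of this file) rearms the Rx interrupt only
for rings carrying that flag. A minimal sketch of such a watchdog-side check,
assuming igb's usual q_vector layout; the helper name igb_watchdog_rearm_rx
is illustrative and not part of the patch:

    /* Sketch only: rearm Rx interrupts solely for rings that reported an
     * skb/memory allocation failure since the last watchdog run.
     */
    static void igb_watchdog_rearm_rx(struct igb_adapter *adapter)
    {
            struct e1000_hw *hw = &adapter->hw;
            u32 eics = 0;
            int i;

            for (i = 0; i < adapter->num_q_vectors; i++) {
                    struct igb_q_vector *q_vector = adapter->q_vector[i];
                    struct igb_ring *rx_ring = q_vector->rx.ring;

                    if (!rx_ring)
                            continue;

                    /* Busy-polling rings without failures are left alone. */
                    if (test_and_clear_bit(IGB_RING_FLAG_RX_ALLOC_FAILED,
                                           &rx_ring->flags))
                            eics |= q_vector->eims_value;
            }

            /* Cause a software interrupt only when at least one ring
             * actually needs the memory-pressure kick.
             */
            if (eics)
                    wr32(E1000_EICS, eics);
    }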
Reviewed-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Sweta Kumari <sweta.kumari@intel.com>
Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2018 Intel Corporation. */

#include <linux/bpf_trace.h>
#include <net/xdp_sock_drv.h>
#include <net/xdp.h>

#include "e1000_hw.h"
#include "igb.h"

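/* Swap the ring's buffer-info array between the regular skb variant
 * (rx_buffer_info) and the zero-copy variant (rx_buffer_info_zc),
 * depending on whether an XSK pool is being attached or detached.
 */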
static int igb_realloc_rx_buffer_info(struct igb_ring *ring, bool pool_present)
{
	int size = pool_present ?
		sizeof(*ring->rx_buffer_info_zc) * ring->count :
		sizeof(*ring->rx_buffer_info) * ring->count;
	void *buff_info = vmalloc(size);

	if (!buff_info)
		return -ENOMEM;

	if (pool_present) {
		vfree(ring->rx_buffer_info);
		ring->rx_buffer_info = NULL;
		ring->rx_buffer_info_zc = buff_info;
	} else {
		vfree(ring->rx_buffer_info_zc);
		ring->rx_buffer_info_zc = NULL;
		ring->rx_buffer_info = buff_info;
	}

	return 0;
}

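/* Quiesce the Tx/Rx ring pair for queue @qid: stop the hardware queues,
 * disable the shared NAPI context, drop any buffers still on the rings
 * and reset the ring statistics.
 */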
static void igb_txrx_ring_disable(struct igb_adapter *adapter, u16 qid)
{
	struct igb_ring *tx_ring = adapter->tx_ring[qid];
	struct igb_ring *rx_ring = adapter->rx_ring[qid];
	struct e1000_hw *hw = &adapter->hw;

	set_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags);

	wr32(E1000_TXDCTL(tx_ring->reg_idx), 0);
	wr32(E1000_RXDCTL(rx_ring->reg_idx), 0);

	synchronize_net();

	/* Rx/Tx share the same napi context. */
	napi_disable(&rx_ring->q_vector->napi);

	igb_clean_tx_ring(tx_ring);
	igb_clean_rx_ring(rx_ring);

	memset(&rx_ring->rx_stats, 0, sizeof(rx_ring->rx_stats));
	memset(&tx_ring->tx_stats, 0, sizeof(tx_ring->tx_stats));
}

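/* Reconfigure and restart the Tx/Rx ring pair for queue @qid and refill
 * the Rx ring, either from the XSK pool or with regular Rx buffers.
 */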
static void igb_txrx_ring_enable(struct igb_adapter *adapter, u16 qid)
{
	struct igb_ring *tx_ring = adapter->tx_ring[qid];
	struct igb_ring *rx_ring = adapter->rx_ring[qid];

	igb_configure_tx_ring(adapter, tx_ring);
	igb_configure_rx_ring(adapter, rx_ring);

	synchronize_net();

	clear_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags);

	/* call igb_desc_unused which always leaves
	 * at least 1 descriptor unused to make sure
	 * next_to_use != next_to_clean
	 */
	if (rx_ring->xsk_pool)
		igb_alloc_rx_buffers_zc(rx_ring, rx_ring->xsk_pool,
					igb_desc_unused(rx_ring));
	else
		igb_alloc_rx_buffers(rx_ring, igb_desc_unused(rx_ring));

	/* Rx/Tx share the same napi context. */
	napi_enable(&rx_ring->q_vector->napi);
}

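/* Return the XSK pool bound to the ring's queue index, or NULL if XDP is
 * not enabled or no usable pool is attached.
 */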
struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter,
				   struct igb_ring *ring)
{
	int qid = ring->queue_index;
	struct xsk_buff_pool *pool;

	pool = xsk_get_pool_from_qid(adapter->netdev, qid);

	if (!igb_xdp_is_enabled(adapter))
		return NULL;

	return (pool && pool->dev) ? pool : NULL;
}

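/* Bind an XSK pool to queue @qid: DMA-map the pool and, if the interface is
 * running with XDP, restart the ring pair in zero-copy mode and kick NAPI so
 * that reception starts immediately.
 */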
static int igb_xsk_pool_enable(struct igb_adapter *adapter,
			       struct xsk_buff_pool *pool,
			       u16 qid)
{
	struct net_device *netdev = adapter->netdev;
	struct igb_ring *rx_ring;
	bool if_running;
	int err;

	if (qid >= adapter->num_rx_queues)
		return -EINVAL;

	if (qid >= netdev->real_num_rx_queues ||
	    qid >= netdev->real_num_tx_queues)
		return -EINVAL;

	err = xsk_pool_dma_map(pool, &adapter->pdev->dev, IGB_RX_DMA_ATTR);
	if (err)
		return err;

	rx_ring = adapter->rx_ring[qid];
	if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter);
	if (if_running)
		igb_txrx_ring_disable(adapter, qid);

	if (if_running) {
		err = igb_realloc_rx_buffer_info(rx_ring, true);
		if (!err) {
			igb_txrx_ring_enable(adapter, qid);
			/* Kick start the NAPI context so that receiving will start */
			err = igb_xsk_wakeup(adapter->netdev, qid, XDP_WAKEUP_RX);
		}

		if (err) {
			xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR);
			return err;
		}
	}

	return 0;
}

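/* Unbind the XSK pool from queue @qid: unmap its DMA mappings and, if the
 * interface is running, switch the ring pair back to regular Rx buffers.
 */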
static int igb_xsk_pool_disable(struct igb_adapter *adapter, u16 qid)
{
	struct xsk_buff_pool *pool;
	struct igb_ring *rx_ring;
	bool if_running;
	int err;

	pool = xsk_get_pool_from_qid(adapter->netdev, qid);
	if (!pool)
		return -EINVAL;

	rx_ring = adapter->rx_ring[qid];
	if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter);
	if (if_running)
		igb_txrx_ring_disable(adapter, qid);

	xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR);

	if (if_running) {
		err = igb_realloc_rx_buffer_info(rx_ring, false);
		if (err)
			return err;

		igb_txrx_ring_enable(adapter, qid);
	}

	return 0;
}

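/* Attach @pool to queue @qid, or detach the currently bound pool when
 * @pool is NULL.
 */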
int igb_xsk_pool_setup(struct igb_adapter *adapter,
		       struct xsk_buff_pool *pool,
		       u16 qid)
{
	return pool ? igb_xsk_pool_enable(adapter, pool, qid) :
		igb_xsk_pool_disable(adapter, qid);
}

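/* Allocate up to @count buffers from the XSK pool and program their DMA
 * addresses into the advanced Rx descriptors starting at @rx_desc.
 * Returns the number of descriptors actually filled.
 */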
static u16 igb_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp,
			     union e1000_adv_rx_desc *rx_desc, u16 count)
{
	dma_addr_t dma;
	u16 buffs;
	int i;

	/* nothing to do */
	if (!count)
		return 0;

	buffs = xsk_buff_alloc_batch(pool, xdp, count);
	for (i = 0; i < buffs; i++) {
		dma = xsk_buff_xdp_get_dma(*xdp);
		rx_desc->read.pkt_addr = cpu_to_le64(dma);
		rx_desc->wb.upper.length = 0;

		rx_desc++;
		xdp++;
	}

	return buffs;
}

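/* Refill @count Rx descriptors from the XSK pool, handling the wrap-around
 * at the end of the ring, and notify the hardware via the tail register.
 * Returns true if all requested buffers could be allocated.
 */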
bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring,
			     struct xsk_buff_pool *xsk_pool, u16 count)
{
	u32 nb_buffs_extra = 0, nb_buffs = 0;
	union e1000_adv_rx_desc *rx_desc;
	u16 ntu = rx_ring->next_to_use;
	u16 total_count = count;
	struct xdp_buff **xdp;

	rx_desc = IGB_RX_DESC(rx_ring, ntu);
	xdp = &rx_ring->rx_buffer_info_zc[ntu];

	if (ntu + count >= rx_ring->count) {
		nb_buffs_extra = igb_fill_rx_descs(xsk_pool, xdp, rx_desc,
						   rx_ring->count - ntu);
		if (nb_buffs_extra != rx_ring->count - ntu) {
			ntu += nb_buffs_extra;
			goto exit;
		}
		rx_desc = IGB_RX_DESC(rx_ring, 0);
		xdp = rx_ring->rx_buffer_info_zc;
		ntu = 0;
		count -= nb_buffs_extra;
	}

	nb_buffs = igb_fill_rx_descs(xsk_pool, xdp, rx_desc, count);
	ntu += nb_buffs;
	if (ntu == rx_ring->count)
		ntu = 0;

	/* clear the length for the next_to_use descriptor */
	rx_desc = IGB_RX_DESC(rx_ring, ntu);
	rx_desc->wb.upper.length = 0;

exit:
	if (rx_ring->next_to_use != ntu) {
		rx_ring->next_to_use = ntu;

		/* Force memory writes to complete before letting h/w
		 * know there are new descriptors to fetch. (Only
		 * applicable for weak-ordered memory model archs,
		 * such as IA-64).
		 */
		wmb();
		writel(ntu, rx_ring->tail);
	}

	return total_count == (nb_buffs + nb_buffs_extra);
}

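/* Return all not-yet-processed XSK buffers between next_to_clean and
 * next_to_use back to the pool when the ring is torn down.
 */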
void igb_clean_rx_ring_zc(struct igb_ring *rx_ring)
{
	u16 ntc = rx_ring->next_to_clean;
	u16 ntu = rx_ring->next_to_use;

	while (ntc != ntu) {
		struct xdp_buff *xdp = rx_ring->rx_buffer_info_zc[ntc];

		xsk_buff_free(xdp);
		ntc++;
		if (ntc >= rx_ring->count)
			ntc = 0;
	}
}

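/* Copy an XDP_PASS frame out of the XSK buffer into a freshly allocated skb
 * so that it can be handed to the regular network stack.
 */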
static struct sk_buff *igb_construct_skb_zc(struct igb_ring *rx_ring,
					    struct xdp_buff *xdp,
					    ktime_t timestamp)
{
	unsigned int totalsize = xdp->data_end - xdp->data_meta;
	unsigned int metasize = xdp->data - xdp->data_meta;
	struct sk_buff *skb;

	net_prefetch(xdp->data_meta);

	/* allocate a skb to store the frags */
	skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize);
	if (unlikely(!skb))
		return NULL;

	if (timestamp)
		skb_hwtstamps(skb)->hwtstamp = timestamp;

	memcpy(__skb_put(skb, totalsize), xdp->data_meta,
	       ALIGN(totalsize, sizeof(long)));

	if (metasize) {
		skb_metadata_set(skb, metasize);
		__skb_pull(skb, metasize);
	}

	return skb;
}

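/* Run the XDP program on a zero-copy buffer and translate its verdict into
 * the driver's IGB_XDP_* result codes; XDP_REDIRECT is handled first as the
 * most likely action in the ZC path.
 */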
static int igb_run_xdp_zc(struct igb_adapter *adapter, struct igb_ring *rx_ring,
			  struct xdp_buff *xdp, struct xsk_buff_pool *xsk_pool,
			  struct bpf_prog *xdp_prog)
{
	int err, result = IGB_XDP_PASS;
	u32 act;

	prefetchw(xdp->data_hard_start); /* xdp_frame write */

	act = bpf_prog_run_xdp(xdp_prog, xdp);

	if (likely(act == XDP_REDIRECT)) {
		err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog);
		if (!err)
			return IGB_XDP_REDIR;

		if (xsk_uses_need_wakeup(xsk_pool) &&
		    err == -ENOBUFS)
			result = IGB_XDP_EXIT;
		else
			result = IGB_XDP_CONSUMED;
		goto out_failure;
	}

	switch (act) {
	case XDP_PASS:
		break;
	case XDP_TX:
		result = igb_xdp_xmit_back(adapter, xdp);
		if (result == IGB_XDP_CONSUMED)
			goto out_failure;
		break;
	default:
		bpf_warn_invalid_xdp_action(adapter->netdev, xdp_prog, act);
		fallthrough;
	case XDP_ABORTED:
out_failure:
		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
		fallthrough;
	case XDP_DROP:
		result = IGB_XDP_CONSUMED;
		break;
	}

	return result;
}

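/* Zero-copy Rx poll loop: process up to @budget descriptors, run XDP on
 * each buffer, build skbs for XDP_PASS frames and refill the ring
 * afterwards. On skb allocation failure the IGB_RING_FLAG_RX_ALLOC_FAILED
 * flag is set so that the task watchdog rearms Rx interrupts only when a
 * real allocation failure occurred.
 */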
int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector,
			struct xsk_buff_pool *xsk_pool, const int budget)
{
	struct igb_adapter *adapter = q_vector->adapter;
	unsigned int total_bytes = 0, total_packets = 0;
	struct igb_ring *rx_ring = q_vector->rx.ring;
	u32 ntc = rx_ring->next_to_clean;
	struct bpf_prog *xdp_prog;
	unsigned int xdp_xmit = 0;
	bool failure = false;
	u16 entries_to_alloc;
	struct sk_buff *skb;

	/* xdp_prog cannot be NULL in the ZC path */
	xdp_prog = READ_ONCE(rx_ring->xdp_prog);

	while (likely(total_packets < budget)) {
		union e1000_adv_rx_desc *rx_desc;
		ktime_t timestamp = 0;
		struct xdp_buff *xdp;
		unsigned int size;
		int xdp_res = 0;

		rx_desc = IGB_RX_DESC(rx_ring, ntc);
		size = le16_to_cpu(rx_desc->wb.upper.length);
		if (!size)
			break;

		/* This memory barrier is needed to keep us from reading
		 * any other fields out of the rx_desc until we know the
		 * descriptor has been written back
		 */
		dma_rmb();

		xdp = rx_ring->rx_buffer_info_zc[ntc];
		xsk_buff_set_size(xdp, size);
		xsk_buff_dma_sync_for_cpu(xdp);

		/* pull rx packet timestamp if available and valid */
		if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
			int ts_hdr_len;

			ts_hdr_len = igb_ptp_rx_pktstamp(rx_ring->q_vector,
							 xdp->data,
							 &timestamp);

			xdp->data += ts_hdr_len;
			xdp->data_meta += ts_hdr_len;
			size -= ts_hdr_len;
		}

		xdp_res = igb_run_xdp_zc(adapter, rx_ring, xdp, xsk_pool,
					 xdp_prog);

		if (xdp_res) {
			if (likely(xdp_res & (IGB_XDP_TX | IGB_XDP_REDIR))) {
				xdp_xmit |= xdp_res;
			} else if (xdp_res == IGB_XDP_EXIT) {
				failure = true;
				break;
			} else if (xdp_res == IGB_XDP_CONSUMED) {
				xsk_buff_free(xdp);
			}

			total_packets++;
			total_bytes += size;
			ntc++;
			if (ntc == rx_ring->count)
				ntc = 0;
			continue;
		}

		skb = igb_construct_skb_zc(rx_ring, xdp, timestamp);

		/* exit if we failed to retrieve a buffer */
		if (!skb) {
			rx_ring->rx_stats.alloc_failed++;
			set_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags);
			break;
		}

		xsk_buff_free(xdp);
		ntc++;
		if (ntc == rx_ring->count)
			ntc = 0;

		if (eth_skb_pad(skb))
			continue;

		/* probably a little skewed due to removing CRC */
		total_bytes += skb->len;

		/* populate checksum, timestamp, VLAN, and protocol */
		igb_process_skb_fields(rx_ring, rx_desc, skb);

		napi_gro_receive(&q_vector->napi, skb);

		/* update budget accounting */
		total_packets++;
	}

	rx_ring->next_to_clean = ntc;

	if (xdp_xmit)
		igb_finalize_xdp(adapter, xdp_xmit);

	igb_update_rx_stats(q_vector, total_packets, total_bytes);

	entries_to_alloc = igb_desc_unused(rx_ring);
	if (entries_to_alloc >= IGB_RX_BUFFER_WRITE)
		failure |= !igb_alloc_rx_buffers_zc(rx_ring, xsk_pool,
						    entries_to_alloc);

	if (xsk_uses_need_wakeup(xsk_pool)) {
		if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
			xsk_set_rx_need_wakeup(xsk_pool);
		else
			xsk_clear_rx_need_wakeup(xsk_pool);

		return (int)total_packets;
	}

	return failure ? budget : (int)total_packets;
}

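/* Zero-copy Tx: pull a batch of descriptors from the XSK Tx ring and post
 * them to the hardware ring. Returns true when no more Tx work is pending,
 * i.e. fewer packets than the descriptor budget were queued.
 */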
bool igb_xmit_zc(struct igb_ring *tx_ring, struct xsk_buff_pool *xsk_pool)
{
	unsigned int budget = igb_desc_unused(tx_ring);
	u32 cmd_type, olinfo_status, nb_pkts, i = 0;
	struct xdp_desc *descs = xsk_pool->tx_descs;
	union e1000_adv_tx_desc *tx_desc = NULL;
	struct igb_tx_buffer *tx_buffer_info;
	unsigned int total_bytes = 0;
	dma_addr_t dma;

	if (!netif_carrier_ok(tx_ring->netdev))
		return true;

	if (test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags))
		return true;

	nb_pkts = xsk_tx_peek_release_desc_batch(xsk_pool, budget);
	if (!nb_pkts)
		return true;

	while (nb_pkts-- > 0) {
		dma = xsk_buff_raw_get_dma(xsk_pool, descs[i].addr);
		xsk_buff_raw_dma_sync_for_device(xsk_pool, dma, descs[i].len);

		tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
		tx_buffer_info->bytecount = descs[i].len;
		tx_buffer_info->type = IGB_TYPE_XSK;
		tx_buffer_info->xdpf = NULL;
		tx_buffer_info->gso_segs = 1;
		tx_buffer_info->time_stamp = jiffies;

		tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use);
		tx_desc->read.buffer_addr = cpu_to_le64(dma);

		/* put descriptor type bits */
		cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT |
			   E1000_ADVTXD_DCMD_IFCS;
		olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT;

		/* FIXME: This sets the Report Status (RS) bit for every
		 * descriptor. One nice to have optimization would be to set it
		 * only for the last descriptor in the whole batch. See Intel
		 * ice driver for an example on how to do it.
		 */
		cmd_type |= descs[i].len | IGB_TXD_DCMD;
		tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
		tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status);

		total_bytes += descs[i].len;

		i++;
		tx_ring->next_to_use++;
		tx_buffer_info->next_to_watch = tx_desc;
		if (tx_ring->next_to_use == tx_ring->count)
			tx_ring->next_to_use = 0;
	}

	netdev_tx_sent_queue(txring_txq(tx_ring), total_bytes);
	igb_xdp_ring_update_tail(tx_ring);

	return nb_pkts < budget;
}

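/* AF_XDP wakeup entry point (ndo_xsk_wakeup): if NAPI for the queue is not
 * already scheduled, raise a software interrupt so the zero-copy Tx/Rx work
 * gets processed.
 */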
int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
{
	struct igb_adapter *adapter = netdev_priv(dev);
	struct e1000_hw *hw = &adapter->hw;
	struct igb_ring *ring;
	u32 eics = 0;

	if (test_bit(__IGB_DOWN, &adapter->state))
		return -ENETDOWN;

	if (!igb_xdp_is_enabled(adapter))
		return -EINVAL;

	if (qid >= adapter->num_tx_queues)
		return -EINVAL;

	ring = adapter->tx_ring[qid];

	if (test_bit(IGB_RING_FLAG_TX_DISABLED, &ring->flags))
		return -ENETDOWN;

	if (!READ_ONCE(ring->xsk_pool))
		return -EINVAL;

	if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) {
		/* Cause software interrupt */
		if (adapter->flags & IGB_FLAG_HAS_MSIX) {
			eics |= ring->q_vector->eims_value;
			wr32(E1000_EICS, eics);
		} else {
			wr32(E1000_ICS, E1000_ICS_RXDMT0);
		}
	}

	return 0;
}