diff mbox series

[4/6] wifi: mt76: mt7915: Mitigate mcu communication loss.

Message ID 20240307192951.3271156-4-greearb@candelatech.com
State New
Headers show
Series [1/6] wifi: mt76: mt7996: add debugging for MCU command timeouts. | expand

Commit Message

Ben Greear March 7, 2024, 7:29 p.m. UTC
From: Ben Greear <greearb@candelatech.com>

Many calls that end up sending MCU messages to the firmware hold
RTNL or other important locks.  So when the radio stops answering,
the entire system becomes very sluggish.

Add a timeout counter, and if the radio times out more than 3 times in
a row, consider it dead and no longer attempt to talk to it.

Signed-off-by: Ben Greear <greearb@candelatech.com>
---
 drivers/net/wireless/mediatek/mt76/dma.c        | 14 ++++++++++++++
 drivers/net/wireless/mediatek/mt76/mt76.h       |  3 +++
 drivers/net/wireless/mediatek/mt76/mt7915/mcu.c |  2 ++
 3 files changed, 19 insertions(+)
diff mbox series

Patch

diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
index 00230f106294..4d1426093e1e 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -510,6 +510,20 @@  mt76_dma_tx_queue_skb_raw(struct mt76_dev *dev, struct mt76_queue *q,
 	if (test_bit(MT76_MCU_RESET, &dev->phy.state))
 		goto error;
 
+	/* Check for non responsive radios.  Better to just stop sending it messages
+	 * than continuously block the OS (since rtnl and similar are often held while
+	 * the timeout is happening).
+	 */
+	if (dev->mcu_timeouts > MAX_MCU_TIMEOUTS) {
+		static unsigned long last_log;
+
+		if (time_after(jiffies, last_log + 5 * HZ)) {
+			last_log = jiffies;
+			mtk_dbg(dev, WRN, "mt76-dma-tx-queue-skb-raw, too many timeouts, msg is dropped.\n");
+		}
+		goto error;
+	}
+
 	if (q->queued + 1 >= q->ndesc - 1)
 		goto error;
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index dd8a24cda48a..b052a9c24c73 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -832,6 +832,9 @@  struct mt76_dev {
 	struct mt76_mcu mcu;
 	u32 first_failed_mcu_cmd; /* for debugging */
 	u32 last_successful_mcu_cmd; /* for debugging */
+	u32 mcu_timeouts; /* sequential timeout counter */
+	#define MAX_MCU_TIMEOUTS 3
+
 
 	struct net_device napi_dev;
 	struct net_device tx_napi_dev;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
index c67c4f6ca2aa..f3e60fba48b2 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
@@ -161,11 +161,13 @@  mt7915_mcu_parse_response(struct mt76_dev *mdev, int cmd,
 	int ret = 0;
 
 	if (!skb) {
+		mdev->mcu_timeouts++;
 		dev_err(mdev->dev, "Message %08x (seq %d) timeout\n",
 			cmd, seq);
 		return -ETIMEDOUT;
 	}
 
+	mdev->mcu_timeouts = 0;
 	rxd = (struct mt76_connac2_mcu_rxd *)skb->data;
 	if (seq != rxd->seq &&
 	    !(rxd->eid == MCU_CMD_EXT_CID &&