diff options
author | Brian Norris <briannorris@chromium.org> | 2019-03-26 23:57:28 +0300 |
---|---|---|
committer | Kalle Valo <kvalo@codeaurora.org> | 2019-04-29 17:24:37 +0300 |
commit | 38faed150438be8d6e419137209d25439e6f4c33 (patch) | |
tree | 0037eed428f2e7d908d8aedbcc1c5d578ea7cf06 /drivers/net/wireless/ath/ath10k/pci.c | |
parent | b82d6c1f8f8288f744a9dcc16cd3085d535decca (diff) | |
download | linux-38faed150438be8d6e419137209d25439e6f4c33.tar.xz |
ath10k: perform crash dump collection in workqueue
Commit 25733c4e67df ("ath10k: pci: use mutex for diagnostic window CE
polling") introduced a regression where we try to sleep (grab a mutex)
in an atomic context:
[ 233.602619] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:254
[ 233.602626] in_atomic(): 1, irqs_disabled(): 0, pid: 0, name: swapper/0
[ 233.602636] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G W 5.1.0-rc2 #4
[ 233.602642] Hardware name: Google Scarlet (DT)
[ 233.602647] Call trace:
[ 233.602663] dump_backtrace+0x0/0x11c
[ 233.602672] show_stack+0x20/0x28
[ 233.602681] dump_stack+0x98/0xbc
[ 233.602690] ___might_sleep+0x154/0x16c
[ 233.602696] __might_sleep+0x78/0x88
[ 233.602704] mutex_lock+0x2c/0x5c
[ 233.602717] ath10k_pci_diag_read_mem+0x68/0x21c [ath10k_pci]
[ 233.602725] ath10k_pci_diag_read32+0x48/0x74 [ath10k_pci]
[ 233.602733] ath10k_pci_dump_registers+0x5c/0x16c [ath10k_pci]
[ 233.602741] ath10k_pci_fw_crashed_dump+0xb8/0x548 [ath10k_pci]
[ 233.602749] ath10k_pci_napi_poll+0x60/0x128 [ath10k_pci]
[ 233.602757] net_rx_action+0x140/0x388
[ 233.602766] __do_softirq+0x1b0/0x35c
[...]
ath10k_pci_fw_crashed_dump() is called from NAPI contexts, and firmware
memory dumps are retrieved using the diag memory interface.
A simple reproduction case is to run this on QCA6174A /
WLAN.RM.4.4.1-00132-QCARMSWP-1, which happens to be a way to b0rk the
firmware:
dd if=/sys/kernel/debug/ieee80211/phy0/ath10k/mem_value bs=4K count=1
of=/dev/null
(NB: simulated firmware crashes, via debugfs, don't trigger firmware
dumps.)
The fix is to move the crash-dump into a workqueue context, and avoid
relying on 'data_lock' for most mutual exclusion. We only keep using it
here for protecting 'fw_crash_counter', while the rest of the coredump
buffers are protected by a new 'dump_mutex'.
I've tested the above with simulated firmware crashes (debugfs 'reset'
file), real firmware crashes (the 'dd' command above), and a variety of
reboot and suspend/resume configurations on QCA6174A.
Reported here:
http://lkml.kernel.org/linux-wireless/20190325202706.GA68720@google.com
Fixes: 25733c4e67df ("ath10k: pci: use mutex for diagnostic window CE polling")
Signed-off-by: Brian Norris <briannorris@chromium.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Diffstat (limited to 'drivers/net/wireless/ath/ath10k/pci.c')
-rw-r--r-- | drivers/net/wireless/ath/ath10k/pci.c | 24 |
1 files changed, 19 insertions, 5 deletions
diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c index 271f92c24d44..2c27f407a851 100644 --- a/drivers/net/wireless/ath/ath10k/pci.c +++ b/drivers/net/wireless/ath/ath10k/pci.c @@ -1441,7 +1441,7 @@ static void ath10k_pci_dump_registers(struct ath10k *ar, __le32 reg_dump_values[REG_DUMP_COUNT_QCA988X] = {}; int i, ret; - lockdep_assert_held(&ar->data_lock); + lockdep_assert_held(&ar->dump_mutex); ret = ath10k_pci_diag_read_hi(ar, ®_dump_values[0], hi_failure_state, @@ -1656,7 +1656,7 @@ static void ath10k_pci_dump_memory(struct ath10k *ar, int ret, i; u8 *buf; - lockdep_assert_held(&ar->data_lock); + lockdep_assert_held(&ar->dump_mutex); if (!crash_data) return; @@ -1734,14 +1734,19 @@ static void ath10k_pci_dump_memory(struct ath10k *ar, } } -static void ath10k_pci_fw_crashed_dump(struct ath10k *ar) +static void ath10k_pci_fw_dump_work(struct work_struct *work) { + struct ath10k_pci *ar_pci = container_of(work, struct ath10k_pci, + dump_work); struct ath10k_fw_crash_data *crash_data; + struct ath10k *ar = ar_pci->ar; char guid[UUID_STRING_LEN + 1]; - spin_lock_bh(&ar->data_lock); + mutex_lock(&ar->dump_mutex); + spin_lock_bh(&ar->data_lock); ar->stats.fw_crash_counter++; + spin_unlock_bh(&ar->data_lock); crash_data = ath10k_coredump_new(ar); @@ -1756,11 +1761,18 @@ static void ath10k_pci_fw_crashed_dump(struct ath10k *ar) ath10k_ce_dump_registers(ar, crash_data); ath10k_pci_dump_memory(ar, crash_data); - spin_unlock_bh(&ar->data_lock); + mutex_unlock(&ar->dump_mutex); queue_work(ar->workqueue, &ar->restart_work); } +static void ath10k_pci_fw_crashed_dump(struct ath10k *ar) +{ + struct ath10k_pci *ar_pci = ath10k_pci_priv(ar); + + queue_work(ar->workqueue, &ar_pci->dump_work); +} + void ath10k_pci_hif_send_complete_check(struct ath10k *ar, u8 pipe, int force) { @@ -3442,6 +3454,8 @@ int ath10k_pci_setup_resource(struct ath10k *ar) spin_lock_init(&ar_pci->ps_lock); mutex_init(&ar_pci->ce_diag_mutex); + INIT_WORK(&ar_pci->dump_work, ath10k_pci_fw_dump_work); + timer_setup(&ar_pci->rx_post_retry, ath10k_pci_rx_replenish_retry, 0); if (QCA_REV_6174(ar) || QCA_REV_9377(ar)) |