From 3e1d0452edceebb903d23db53201013c940bf000 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 18 May 2019 20:39:55 +0800 Subject: ceph: avoid iput_final() while holding mutex or in dispatch thread iput_final() may wait for reahahead pages. The wait can cause deadlock. For example: Workqueue: ceph-msgr ceph_con_workfn [libceph] Call Trace: schedule+0x36/0x80 io_schedule+0x16/0x40 __lock_page+0x101/0x140 truncate_inode_pages_range+0x556/0x9f0 truncate_inode_pages_final+0x4d/0x60 evict+0x182/0x1a0 iput+0x1d2/0x220 iterate_session_caps+0x82/0x230 [ceph] dispatch+0x678/0xa80 [ceph] ceph_con_workfn+0x95b/0x1560 [libceph] process_one_work+0x14d/0x410 worker_thread+0x4b/0x460 kthread+0x105/0x140 ret_from_fork+0x22/0x40 Workqueue: ceph-msgr ceph_con_workfn [libceph] Call Trace: __schedule+0x3d6/0x8b0 schedule+0x36/0x80 schedule_preempt_disabled+0xe/0x10 mutex_lock+0x2f/0x40 ceph_check_caps+0x505/0xa80 [ceph] ceph_put_wrbuffer_cap_refs+0x1e5/0x2c0 [ceph] writepages_finish+0x2d3/0x410 [ceph] __complete_request+0x26/0x60 [libceph] handle_reply+0x6c8/0xa10 [libceph] dispatch+0x29a/0xbb0 [libceph] ceph_con_workfn+0x95b/0x1560 [libceph] process_one_work+0x14d/0x410 worker_thread+0x4b/0x460 kthread+0x105/0x140 ret_from_fork+0x22/0x40 In above example, truncate_inode_pages_range() waits for readahead pages while holding s_mutex. ceph_check_caps() waits for s_mutex and blocks OSD dispatch thread. Later OSD replies (for readahead) can't be handled. ceph_check_caps() also may lock snap_rwsem for read. So similar deadlock can happen if iput_final() is called while holding snap_rwsem. In general, it's not good to call iput_final() inside MDS/OSD dispatch threads or while holding any mutex. The fix is introducing ceph_async_iput(), which calls iput_final() in workqueue. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/snap.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs/ceph/snap.c') diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index b26e12cd8ec3..72c6c022f02b 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -648,13 +648,15 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) if (!inode) continue; spin_unlock(&realm->inodes_with_caps_lock); - iput(lastinode); + /* avoid calling iput_final() while holding + * mdsc->snap_rwsem or in mds dispatch threads */ + ceph_async_iput(lastinode); lastinode = inode; ceph_queue_cap_snap(ci); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); - iput(lastinode); + ceph_async_iput(lastinode); dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); } @@ -806,7 +808,9 @@ static void flush_snaps(struct ceph_mds_client *mdsc) ihold(inode); spin_unlock(&mdsc->snap_flush_lock); ceph_flush_snaps(ci, &session); - iput(inode); + /* avoid calling iput_final() while holding + * session->s_mutex or in mds dispatch threads */ + ceph_async_iput(inode); spin_lock(&mdsc->snap_flush_lock); } spin_unlock(&mdsc->snap_flush_lock); @@ -950,12 +954,14 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, ceph_get_snap_realm(mdsc, realm); ceph_put_snap_realm(mdsc, oldrealm); - iput(inode); + /* avoid calling iput_final() while holding + * mdsc->snap_rwsem or mds in dispatch threads */ + ceph_async_iput(inode); continue; skip_inode: spin_unlock(&ci->i_ceph_lock); - iput(inode); + ceph_async_iput(inode); } /* we may have taken some of the old realm's children. */ -- cgit v1.2.3