[v2,2/2] ceph: add ceph_cap_unlink_work to fire check caps immediately

Message ID 20231025061952.288730-3-xiubli@redhat.com
State New
Series ceph: fix caps revocation stuck

Commit Message

Xiubo Li Oct. 25, 2023, 6:19 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

When unlinking a file the cap check could be delayed for more than
5 seconds, while on the MDS side the unlink request may be waiting
for the client to release its caps.

Add a dedicated work item and list to trigger check_caps and the
dirty buffer flush immediately.

URL: https://tracker.ceph.com/issues/50223
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/caps.c       | 17 ++++++++++++++++-
 fs/ceph/mds_client.c | 34 ++++++++++++++++++++++++++++++++++
 fs/ceph/mds_client.h |  4 ++++
 3 files changed, 54 insertions(+), 1 deletion(-)

Comments

Ilya Dryomov Jan. 15, 2024, 10:44 p.m. UTC | #1
On Wed, Oct 25, 2023 at 8:22 AM <xiubli@redhat.com> wrote:
>
> From: Xiubo Li <xiubli@redhat.com>
>
> When unlinking a file the cap check could be delayed for more than
> 5 seconds, while on the MDS side the unlink request may be waiting
> for the client to release its caps.
>
> Add a dedicated work item and list to trigger check_caps and the
> dirty buffer flush immediately.
>
> URL: https://tracker.ceph.com/issues/50223
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/caps.c       | 17 ++++++++++++++++-
>  fs/ceph/mds_client.c | 34 ++++++++++++++++++++++++++++++++++
>  fs/ceph/mds_client.h |  4 ++++
>  3 files changed, 54 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 9b9ec1adc19d..be4f986e082d 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -4790,7 +4790,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
>                 if (__ceph_caps_dirty(ci)) {
>                         struct ceph_mds_client *mdsc =
>                                 ceph_inode_to_fs_client(inode)->mdsc;
> -                       __cap_delay_requeue_front(mdsc, ci);
> +
> +                       doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
> +                             ceph_vinop(inode));
> +                       spin_lock(&mdsc->cap_unlink_delay_lock);
> +                       ci->i_ceph_flags |= CEPH_I_FLUSH;
> +                       if (!list_empty(&ci->i_cap_delay_list))
> +                               list_del_init(&ci->i_cap_delay_list);
> +                       list_add_tail(&ci->i_cap_delay_list,
> +                                     &mdsc->cap_unlink_delay_list);
> +                       spin_unlock(&mdsc->cap_unlink_delay_lock);
> +
> +                       /*
> +                        * Fire the work immediately, because the MDS may be
> +                        * waiting for caps release.
> +                        */
> +                       schedule_work(&mdsc->cap_unlink_work);

Hi Xiubo,

This schedules a work item on a system-wide workqueue, not one
specific to CephFS.  Is there something that ensures that it gets
flushed as part of unmount, and possibly on other occasions that
have to do with individual inodes?

Thanks,

                Ilya
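
For reference, the usual way to provide that guarantee when a work item
lives on the system-wide workqueue is for the filesystem's teardown path
to cancel or flush it explicitly.  The sketch below only illustrates that
pattern; the teardown helper name is hypothetical and is not part of this
patch.

/*
 * Illustrative only: ensure cap_unlink_work cannot still be pending or
 * running once the mdsc is being torn down.  example_mdsc_teardown() is
 * a placeholder name, not an existing ceph function.
 */
static void example_mdsc_teardown(struct ceph_mds_client *mdsc)
{
	/* Cancel a pending instance and wait for a running one to finish. */
	cancel_work_sync(&mdsc->cap_unlink_work);
}
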
Xiubo Li Jan. 16, 2024, 12:24 a.m. UTC | #2
On 1/16/24 06:44, Ilya Dryomov wrote:
> On Wed, Oct 25, 2023 at 8:22 AM <xiubli@redhat.com> wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> When unlinking a file the cap check could be delayed for more than
>> 5 seconds, while on the MDS side the unlink request may be waiting
>> for the client to release its caps.
>>
>> Add a dedicated work item and list to trigger check_caps and the
>> dirty buffer flush immediately.
>>
>> URL: https://tracker.ceph.com/issues/50223
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/caps.c       | 17 ++++++++++++++++-
>>   fs/ceph/mds_client.c | 34 ++++++++++++++++++++++++++++++++++
>>   fs/ceph/mds_client.h |  4 ++++
>>   3 files changed, 54 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>> index 9b9ec1adc19d..be4f986e082d 100644
>> --- a/fs/ceph/caps.c
>> +++ b/fs/ceph/caps.c
>> @@ -4790,7 +4790,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
>>                  if (__ceph_caps_dirty(ci)) {
>>                          struct ceph_mds_client *mdsc =
>>                                  ceph_inode_to_fs_client(inode)->mdsc;
>> -                       __cap_delay_requeue_front(mdsc, ci);
>> +
>> +                       doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
>> +                             ceph_vinop(inode));
>> +                       spin_lock(&mdsc->cap_unlink_delay_lock);
>> +                       ci->i_ceph_flags |= CEPH_I_FLUSH;
>> +                       if (!list_empty(&ci->i_cap_delay_list))
>> +                               list_del_init(&ci->i_cap_delay_list);
>> +                       list_add_tail(&ci->i_cap_delay_list,
>> +                                     &mdsc->cap_unlink_delay_list);
>> +                       spin_unlock(&mdsc->cap_unlink_delay_lock);
>> +
>> +                       /*
>> +                        * Fire the work immediately, because the MDS may be
>> +                        * waiting for caps release.
>> +                        */
>> +                       schedule_work(&mdsc->cap_unlink_work);
> Hi Xiubo,
>
> This schedules a work item on a system-wide workqueue, not one
> specific to CephFS.  Is there something that ensures that it gets
> flushed as part of unmount, and possibly on other occasions that
> have to do with individual inodes?

Hi Ilya,

It seems there isn't; I didn't find anything that does.

Let me add a dedicated workqueue for this.

Thanks

- Xiubo


> Thanks,
>
>                  Ilya
>
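
For illustration, a dedicated workqueue along the lines proposed above
could look roughly like the fragments below.  The cap_unlink_wq field,
the workqueue name, and the exact placement in the init and teardown
paths are assumptions for this sketch, not the actual follow-up change.

/* In ceph_mdsc_init(): allocate a CephFS-specific ordered workqueue. */
mdsc->cap_unlink_wq = alloc_ordered_workqueue("ceph-cap-unlink", 0);
if (!mdsc->cap_unlink_wq)
	return -ENOMEM;

/* In ceph_drop_caps_for_unlink(): queue on the dedicated workqueue. */
queue_work(mdsc->cap_unlink_wq, &mdsc->cap_unlink_work);

/* On unmount: destroy_workqueue() drains any work still queued. */
destroy_workqueue(mdsc->cap_unlink_wq);
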

Patch

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 9b9ec1adc19d..be4f986e082d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4790,7 +4790,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
 		if (__ceph_caps_dirty(ci)) {
 			struct ceph_mds_client *mdsc =
 				ceph_inode_to_fs_client(inode)->mdsc;
-			__cap_delay_requeue_front(mdsc, ci);
+
+			doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
+			      ceph_vinop(inode));
+			spin_lock(&mdsc->cap_unlink_delay_lock);
+			ci->i_ceph_flags |= CEPH_I_FLUSH;
+			if (!list_empty(&ci->i_cap_delay_list))
+				list_del_init(&ci->i_cap_delay_list);
+			list_add_tail(&ci->i_cap_delay_list,
+				      &mdsc->cap_unlink_delay_list);
+			spin_unlock(&mdsc->cap_unlink_delay_lock);
+
+			/*
+			 * Fire the work immediately, because the MDS may be
+			 * waiting for caps release.
+			 */
+			schedule_work(&mdsc->cap_unlink_work);
 		}
 	}
 	spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6fa7134beec8..a7bffb030036 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2500,6 +2500,37 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
 	}
 }
 
+static void ceph_cap_unlink_work(struct work_struct *work)
+{
+	struct ceph_mds_client *mdsc =
+		container_of(work, struct ceph_mds_client, cap_unlink_work);
+	struct ceph_client *cl = mdsc->fsc->client;
+
+	doutc(cl, "begin\n");
+	spin_lock(&mdsc->cap_unlink_delay_lock);
+	while (!list_empty(&mdsc->cap_unlink_delay_list)) {
+		struct ceph_inode_info *ci;
+		struct inode *inode;
+
+		ci = list_first_entry(&mdsc->cap_unlink_delay_list,
+				      struct ceph_inode_info,
+				      i_cap_delay_list);
+		list_del_init(&ci->i_cap_delay_list);
+
+		inode = igrab(&ci->netfs.inode);
+		if (inode) {
+			spin_unlock(&mdsc->cap_unlink_delay_lock);
+			doutc(cl, "on %p %llx.%llx\n", inode,
+			      ceph_vinop(inode));
+			ceph_check_caps(ci, CHECK_CAPS_FLUSH);
+			iput(inode);
+			spin_lock(&mdsc->cap_unlink_delay_lock);
+		}
+	}
+	spin_unlock(&mdsc->cap_unlink_delay_lock);
+	doutc(cl, "done\n");
+}
+
 /*
  * requests
  */
@@ -5372,6 +5403,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	INIT_LIST_HEAD(&mdsc->cap_delay_list);
 	INIT_LIST_HEAD(&mdsc->cap_wait_list);
 	spin_lock_init(&mdsc->cap_delay_lock);
+	INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list);
+	spin_lock_init(&mdsc->cap_unlink_delay_lock);
 	INIT_LIST_HEAD(&mdsc->snap_flush_list);
 	spin_lock_init(&mdsc->snap_flush_lock);
 	mdsc->last_cap_flush_tid = 1;
@@ -5380,6 +5413,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	spin_lock_init(&mdsc->cap_dirty_lock);
 	init_waitqueue_head(&mdsc->cap_flushing_wq);
 	INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
+	INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work);
 	err = ceph_metric_init(&mdsc->metric);
 	if (err)
 		goto err_mdsmap;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 2e6ddaa13d72..f25117fa910f 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -462,6 +462,8 @@ struct ceph_mds_client {
 	unsigned long    last_renew_caps;  /* last time we renewed our caps */
 	struct list_head cap_delay_list;   /* caps with delayed release */
 	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
+	struct list_head cap_unlink_delay_list;  /* caps with delayed release for unlink */
+	spinlock_t       cap_unlink_delay_lock;  /* protects cap_unlink_delay_list */
 	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
 	spinlock_t       snap_flush_lock;
 
@@ -475,6 +477,8 @@ struct ceph_mds_client {
 	struct work_struct cap_reclaim_work;
 	atomic_t	   cap_reclaim_pending;
 
+	struct work_struct cap_unlink_work;
+
 	/*
 	 * Cap reservations
 	 *