diff mbox series

[v2,01/16] block: Add atomic write operations to request_queue limits

Message ID 20231212110844.19698-2-john.g.garry@oracle.com
State New
Headers show
Series block atomic writes | expand

Commit Message

John Garry Dec. 12, 2023, 11:08 a.m. UTC
From: Himanshu Madhani <himanshu.madhani@oracle.com>

Add the following limits:
- atomic_write_boundary_bytes
- atomic_write_max_bytes
- atomic_write_unit_max_bytes
- atomic_write_unit_min_bytes

All atomic writes limits are initialised to 0 to indicate no atomic write
support. Stacked devices are just not supported either for now.

Signed-off-by: Himanshu Madhani <himanshu.madhani@oracle.com>
#jpg: Heavy rewrite
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 Documentation/ABI/stable/sysfs-block | 47 ++++++++++++++++++++++
 block/blk-settings.c                 | 60 ++++++++++++++++++++++++++++
 block/blk-sysfs.c                    | 33 +++++++++++++++
 include/linux/blkdev.h               | 37 +++++++++++++++++
 4 files changed, 177 insertions(+)

Comments

Ming Lei Dec. 13, 2023, 1:25 a.m. UTC | #1
On Tue, Dec 12, 2023 at 11:08:29AM +0000, John Garry wrote:
> From: Himanshu Madhani <himanshu.madhani@oracle.com>
> 
> Add the following limits:
> - atomic_write_boundary_bytes
> - atomic_write_max_bytes
> - atomic_write_unit_max_bytes
> - atomic_write_unit_min_bytes
> 
> All atomic writes limits are initialised to 0 to indicate no atomic write
> support. Stacked devices are just not supported either for now.
> 
> Signed-off-by: Himanshu Madhani <himanshu.madhani@oracle.com>
> #jpg: Heavy rewrite
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  Documentation/ABI/stable/sysfs-block | 47 ++++++++++++++++++++++
>  block/blk-settings.c                 | 60 ++++++++++++++++++++++++++++
>  block/blk-sysfs.c                    | 33 +++++++++++++++
>  include/linux/blkdev.h               | 37 +++++++++++++++++
>  4 files changed, 177 insertions(+)
> 
> diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
> index 1fe9a553c37b..ba81a081522f 100644
> --- a/Documentation/ABI/stable/sysfs-block
> +++ b/Documentation/ABI/stable/sysfs-block
> @@ -21,6 +21,53 @@ Description:
>  		device is offset from the internal allocation unit's
>  		natural alignment.
>  
> +What:		/sys/block/<disk>/atomic_write_max_bytes
> +Date:		May 2023
> +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
> +Description:
> +		[RO] This parameter specifies the maximum atomic write
> +		size reported by the device. This parameter is relevant
> +		for merging of writes, where a merged atomic write
> +		operation must not exceed this number of bytes.
> +		The atomic_write_max_bytes may exceed the value in
> +		atomic_write_unit_max_bytes if atomic_write_max_bytes
> +		is not a power-of-two or atomic_write_unit_max_bytes is
> +		limited by some queue limits, such as max_segments.
> +
> +
> +What:		/sys/block/<disk>/atomic_write_unit_min_bytes
> +Date:		May 2023
> +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
> +Description:
> +		[RO] This parameter specifies the smallest block which can
> +		be written atomically with an atomic write operation. All
> +		atomic write operations must begin at a
> +		atomic_write_unit_min boundary and must be multiples of
> +		atomic_write_unit_min. This value must be a power-of-two.
> +
> +
> +What:		/sys/block/<disk>/atomic_write_unit_max_bytes
> +Date:		January 2023
> +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
> +Description:
> +		[RO] This parameter defines the largest block which can be
> +		written atomically with an atomic write operation. This
> +		value must be a multiple of atomic_write_unit_min and must
> +		be a power-of-two.
> +
> +
> +What:		/sys/block/<disk>/atomic_write_boundary_bytes
> +Date:		May 2023
> +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
> +Description:
> +		[RO] A device may need to internally split I/Os which
> +		straddle a given logical block address boundary. In that
> +		case a single atomic write operation will be processed as
> +		one of more sub-operations which each complete atomically.
> +		This parameter specifies the size in bytes of the atomic
> +		boundary if one is reported by the device. This value must
> +		be a power-of-two.
> +
>  
>  What:		/sys/block/<disk>/diskseq
>  Date:		February 2021
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index 0046b447268f..d151be394c98 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -59,6 +59,10 @@ void blk_set_default_limits(struct queue_limits *lim)
>  	lim->zoned = BLK_ZONED_NONE;
>  	lim->zone_write_granularity = 0;
>  	lim->dma_alignment = 511;
> +	lim->atomic_write_unit_min_sectors = 0;
> +	lim->atomic_write_unit_max_sectors = 0;
> +	lim->atomic_write_max_sectors = 0;
> +	lim->atomic_write_boundary_sectors = 0;

Can we move the four into single structure and setup them in single
API? Then cross-validation can be done in this API.

>  }
>  
>  /**
> @@ -183,6 +187,62 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
>  }
>  EXPORT_SYMBOL(blk_queue_max_discard_sectors);
>  
> +/**
> + * blk_queue_atomic_write_max_bytes - set max bytes supported by
> + * the device for atomic write operations.
> + * @q:  the request queue for the device
> + * @size: maximum bytes supported
> + */
> +void blk_queue_atomic_write_max_bytes(struct request_queue *q,
> +				      unsigned int bytes)
> +{
> +	q->limits.atomic_write_max_sectors = bytes >> SECTOR_SHIFT;
> +}
> +EXPORT_SYMBOL(blk_queue_atomic_write_max_bytes);

What if driver doesn't call it but driver supports atomic write?

I guess the default max sectors should be atomic_write_unit_max_sectors
if the feature is enabled.

> +
> +/**
> + * blk_queue_atomic_write_boundary_bytes - Device's logical block address space
> + * which an atomic write should not cross.
> + * @q:  the request queue for the device
> + * @bytes: must be a power-of-two.
> + */
> +void blk_queue_atomic_write_boundary_bytes(struct request_queue *q,
> +					   unsigned int bytes)
> +{
> +	q->limits.atomic_write_boundary_sectors = bytes >> SECTOR_SHIFT;
> +}
> +EXPORT_SYMBOL(blk_queue_atomic_write_boundary_bytes);

Default atomic_write_boundary_sectors should be
atomic_write_unit_max_sectors in case of atomic write?

> +
> +/**
> + * blk_queue_atomic_write_unit_min_sectors - smallest unit that can be written
> + * atomically to the device.
> + * @q:  the request queue for the device
> + * @sectors: must be a power-of-two.
> + */
> +void blk_queue_atomic_write_unit_min_sectors(struct request_queue *q,
> +					     unsigned int sectors)
> +{
> +	struct queue_limits *limits = &q->limits;
> +
> +	limits->atomic_write_unit_min_sectors = sectors;
> +}
> +EXPORT_SYMBOL(blk_queue_atomic_write_unit_min_sectors);

atomic_write_unit_min_sectors should be >= (physical block size >> 9)
given the minimized atomic write unit is physical sector for all disk.

> +
> +/*
> + * blk_queue_atomic_write_unit_max_sectors - largest unit that can be written
> + * atomically to the device.
> + * @q: the request queue for the device
> + * @sectors: must be a power-of-two.
> + */
> +void blk_queue_atomic_write_unit_max_sectors(struct request_queue *q,
> +					     unsigned int sectors)
> +{
> +	struct queue_limits *limits = &q->limits;
> +
> +	limits->atomic_write_unit_max_sectors = sectors;
> +}
> +EXPORT_SYMBOL(blk_queue_atomic_write_unit_max_sectors);

atomic_write_unit_max_sectors should be >= atomic_write_unit_min_sectors.


Thanks, 
Ming
John Garry Dec. 13, 2023, 9:13 a.m. UTC | #2
>> +
>>   
>>   What:		/sys/block/<disk>/diskseq
>>   Date:		February 2021
>> diff --git a/block/blk-settings.c b/block/blk-settings.c
>> index 0046b447268f..d151be394c98 100644
>> --- a/block/blk-settings.c
>> +++ b/block/blk-settings.c
>> @@ -59,6 +59,10 @@ void blk_set_default_limits(struct queue_limits *lim)
>>   	lim->zoned = BLK_ZONED_NONE;
>>   	lim->zone_write_granularity = 0;
>>   	lim->dma_alignment = 511;
>> +	lim->atomic_write_unit_min_sectors = 0;
>> +	lim->atomic_write_unit_max_sectors = 0;
>> +	lim->atomic_write_max_sectors = 0;
>> +	lim->atomic_write_boundary_sectors = 0;
> 
> Can we move the four into single structure

There is no precedent for a similar structure in struct queue_limits. So 
would only passing a structure to the blk-settings.c API be ok?

> and setup them in single
> API? Then cross-validation can be done in this API.

I suppose so, if you think that it is better.

We rely on the driver to provide sound values. I suppose that we can 
sanitize them also (in a single API).

> 
>>   }
>>   
>>   /**
>> @@ -183,6 +187,62 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
>>   }
>>   EXPORT_SYMBOL(blk_queue_max_discard_sectors);
>>   
>> +/**
>> + * blk_queue_atomic_write_max_bytes - set max bytes supported by
>> + * the device for atomic write operations.
>> + * @q:  the request queue for the device
>> + * @size: maximum bytes supported
>> + */
>> +void blk_queue_atomic_write_max_bytes(struct request_queue *q,
>> +				      unsigned int bytes)
>> +{
>> +	q->limits.atomic_write_max_sectors = bytes >> SECTOR_SHIFT;
>> +}
>> +EXPORT_SYMBOL(blk_queue_atomic_write_max_bytes);
> 
> What if driver doesn't call it but driver supports atomic write?

We rely on the driver to do this. Any basic level of testing will show 
an issue if they don't.

> 
> I guess the default max sectors should be atomic_write_unit_max_sectors
> if the feature is enabled.

Sure. If we have a single API to set all values, then we don't need to 
worry about this (assuming the values are filled in properly).

> 
>> +
>> +/**
>> + * blk_queue_atomic_write_boundary_bytes - Device's logical block address space
>> + * which an atomic write should not cross.
>> + * @q:  the request queue for the device
>> + * @bytes: must be a power-of-two.
>> + */
>> +void blk_queue_atomic_write_boundary_bytes(struct request_queue *q,
>> +					   unsigned int bytes)
>> +{
>> +	q->limits.atomic_write_boundary_sectors = bytes >> SECTOR_SHIFT;
>> +}
>> +EXPORT_SYMBOL(blk_queue_atomic_write_boundary_bytes);
> 
> Default atomic_write_boundary_sectors should be
> atomic_write_unit_max_sectors in case of atomic write?

Having atomic_write_boundary_sectors default to 
atomic_write_unit_max_sectors is effectively same as a default of 0.

> 
>> +
>> +/**
>> + * blk_queue_atomic_write_unit_min_sectors - smallest unit that can be written
>> + * atomically to the device.
>> + * @q:  the request queue for the device
>> + * @sectors: must be a power-of-two.
>> + */
>> +void blk_queue_atomic_write_unit_min_sectors(struct request_queue *q,
>> +					     unsigned int sectors)
>> +{
>> +	struct queue_limits *limits = &q->limits;
>> +
>> +	limits->atomic_write_unit_min_sectors = sectors;
>> +}
>> +EXPORT_SYMBOL(blk_queue_atomic_write_unit_min_sectors);
> 
> atomic_write_unit_min_sectors should be >= (physical block size >> 9)
> given the minimized atomic write unit is physical sector for all disk.

For SCSI, we have a granularity VPD value, and when set we pay attention 
to that. If not, we use the phys block size.

For NVMe, we use the logical block size. For physical block size, that 
can be greater than the logical block size for npwg set, and I don't 
think it's suitable use that as minimum atomic write unit.

Anyway, I am not too keen on sanitizing this value in this way.

> 
>> +
>> +/*
>> + * blk_queue_atomic_write_unit_max_sectors - largest unit that can be written
>> + * atomically to the device.
>> + * @q: the request queue for the device
>> + * @sectors: must be a power-of-two.
>> + */
>> +void blk_queue_atomic_write_unit_max_sectors(struct request_queue *q,
>> +					     unsigned int sectors)
>> +{
>> +	struct queue_limits *limits = &q->limits;
>> +
>> +	limits->atomic_write_unit_max_sectors = sectors;
>> +}
>> +EXPORT_SYMBOL(blk_queue_atomic_write_unit_max_sectors);
> 
> atomic_write_unit_max_sectors should be >= atomic_write_unit_min_sectors.
> 

Again, we rely on the driver to provide sound values. However, as 
mentioned, we can sanitize.

Thanks,
John
Ming Lei Dec. 13, 2023, 12:28 p.m. UTC | #3
On Wed, Dec 13, 2023 at 09:13:48AM +0000, John Garry wrote:
> > > +
> > >   What:		/sys/block/<disk>/diskseq
> > >   Date:		February 2021
> > > diff --git a/block/blk-settings.c b/block/blk-settings.c
> > > index 0046b447268f..d151be394c98 100644
> > > --- a/block/blk-settings.c
> > > +++ b/block/blk-settings.c
> > > @@ -59,6 +59,10 @@ void blk_set_default_limits(struct queue_limits *lim)
> > >   	lim->zoned = BLK_ZONED_NONE;
> > >   	lim->zone_write_granularity = 0;
> > >   	lim->dma_alignment = 511;
> > > +	lim->atomic_write_unit_min_sectors = 0;
> > > +	lim->atomic_write_unit_max_sectors = 0;
> > > +	lim->atomic_write_max_sectors = 0;
> > > +	lim->atomic_write_boundary_sectors = 0;
> > 
> > Can we move the four into single structure
> 
> There is no precedent for a similar structure in struct queue_limits. So
> would only passing a structure to the blk-settings.c API be ok?

Yes, this structure is part of the new API.

> 
> > and setup them in single
> > API? Then cross-validation can be done in this API.
> 
> I suppose so, if you think that it is better.
> 
> We rely on the driver to provide sound values. I suppose that we can
> sanitize them also (in a single API).

Please make the interface correct from beginning, and one good API is
helpful for both sides, such as isolating problems, easy to locate
bug, abstracting common logic, ...

And relying on API users is absolutely not good design.

> 
> > 
> > >   }
> > >   /**
> > > @@ -183,6 +187,62 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
> > >   }
> > >   EXPORT_SYMBOL(blk_queue_max_discard_sectors);
> > > +/**
> > > + * blk_queue_atomic_write_max_bytes - set max bytes supported by
> > > + * the device for atomic write operations.
> > > + * @q:  the request queue for the device
> > > + * @size: maximum bytes supported
> > > + */
> > > +void blk_queue_atomic_write_max_bytes(struct request_queue *q,
> > > +				      unsigned int bytes)
> > > +{
> > > +	q->limits.atomic_write_max_sectors = bytes >> SECTOR_SHIFT;
> > > +}
> > > +EXPORT_SYMBOL(blk_queue_atomic_write_max_bytes);
> > 
> > What if driver doesn't call it but driver supports atomic write?
> 
> We rely on the driver to do this. Any basic level of testing will show an
> issue if they don't.

Software quality depends on good requirement analysis, design and
implementation, instead of test.

Simply you can not cover all possibilities in test.

> 
> > 
> > I guess the default max sectors should be atomic_write_unit_max_sectors
> > if the feature is enabled.
> 
> Sure. If we have a single API to set all values, then we don't need to worry
> about this (assuming the values are filled in properly).
> 
> > 
> > > +
> > > +/**
> > > + * blk_queue_atomic_write_boundary_bytes - Device's logical block address space
> > > + * which an atomic write should not cross.
> > > + * @q:  the request queue for the device
> > > + * @bytes: must be a power-of-two.
> > > + */
> > > +void blk_queue_atomic_write_boundary_bytes(struct request_queue *q,
> > > +					   unsigned int bytes)
> > > +{
> > > +	q->limits.atomic_write_boundary_sectors = bytes >> SECTOR_SHIFT;
> > > +}
> > > +EXPORT_SYMBOL(blk_queue_atomic_write_boundary_bytes);
> > 
> > Default atomic_write_boundary_sectors should be
> > atomic_write_unit_max_sectors in case of atomic write?
> 
> Having atomic_write_boundary_sectors default to
> atomic_write_unit_max_sectors is effectively same as a default of 0.
> 
> > 
> > > +
> > > +/**
> > > + * blk_queue_atomic_write_unit_min_sectors - smallest unit that can be written
> > > + * atomically to the device.
> > > + * @q:  the request queue for the device
> > > + * @sectors: must be a power-of-two.
> > > + */
> > > +void blk_queue_atomic_write_unit_min_sectors(struct request_queue *q,
> > > +					     unsigned int sectors)
> > > +{
> > > +	struct queue_limits *limits = &q->limits;
> > > +
> > > +	limits->atomic_write_unit_min_sectors = sectors;
> > > +}
> > > +EXPORT_SYMBOL(blk_queue_atomic_write_unit_min_sectors);
> > 
> > atomic_write_unit_min_sectors should be >= (physical block size >> 9)
> > given the minimized atomic write unit is physical sector for all disk.
> 
> For SCSI, we have a granularity VPD value, and when set we pay attention to
> that. If not, we use the phys block size.
> 
> For NVMe, we use the logical block size. For physical block size, that can
> be greater than the logical block size for npwg set, and I don't think it's
> suitable use that as minimum atomic write unit.

I highly suspect it is wrong to use logical block size as minimum
support atomic write unit, given physical block size is supposed to
be the minimum atomic write unit.

> 
> Anyway, I am not too keen on sanitizing this value in this way.
> 
> > 
> > > +
> > > +/*
> > > + * blk_queue_atomic_write_unit_max_sectors - largest unit that can be written
> > > + * atomically to the device.
> > > + * @q: the request queue for the device
> > > + * @sectors: must be a power-of-two.
> > > + */
> > > +void blk_queue_atomic_write_unit_max_sectors(struct request_queue *q,
> > > +					     unsigned int sectors)
> > > +{
> > > +	struct queue_limits *limits = &q->limits;
> > > +
> > > +	limits->atomic_write_unit_max_sectors = sectors;
> > > +}
> > > +EXPORT_SYMBOL(blk_queue_atomic_write_unit_max_sectors);
> > 
> > atomic_write_unit_max_sectors should be >= atomic_write_unit_min_sectors.
> > 
> 
> Again, we rely on the driver to provide sound values. However, as mentioned,
> we can sanitize.

Relying on driver to provide sound value is absolutely bad design from API
viewpoint.

Thanks,
Ming
John Garry Dec. 13, 2023, 7:01 p.m. UTC | #4
On 13/12/2023 12:28, Ming Lei wrote:
>> For NVMe, we use the logical block size. For physical block size, that can
>> be greater than the logical block size for npwg set, and I don't think it's
>> suitable use that as minimum atomic write unit.
> I highly suspect it is wrong to use logical block size as minimum
> support atomic write unit, given physical block size is supposed to
> be the minimum atomic write unit.

I would tend to agree, but I am still a bit curious on how npwg is used 
to calculate atomic_bs/phys_bs as it seems to be a recommended 
performance-related value. It would hint to me that it is the phys_bs, 
but is that same as atomic min granularity?

> 
>> Anyway, I am not too keen on sanitizing this value in this way.
>>
>>>> +
>>>> +/*
>>>> + * blk_queue_atomic_write_unit_max_sectors - largest unit that can be written
>>>> + * atomically to the device.
>>>> + * @q: the request queue for the device
>>>> + * @sectors: must be a power-of-two.
>>>> + */
>>>> +void blk_queue_atomic_write_unit_max_sectors(struct request_queue *q,
>>>> +					     unsigned int sectors)
>>>> +{
>>>> +	struct queue_limits *limits = &q->limits;
>>>> +
>>>> +	limits->atomic_write_unit_max_sectors = sectors;
>>>> +}
>>>> +EXPORT_SYMBOL(blk_queue_atomic_write_unit_max_sectors);
>>> atomic_write_unit_max_sectors should be >= atomic_write_unit_min_sectors.
>>>
>> Again, we rely on the driver to provide sound values. However, as mentioned,
>> we can sanitize.
> Relying on driver to provide sound value is absolutely bad design from API
> viewpoint.

OK, fine, I'll look to revise the API to make it more robust.

Thanks,
John
Martin K. Petersen Dec. 14, 2023, 4:34 a.m. UTC | #5
Hi Ming!

>> +	lim->atomic_write_unit_min_sectors = 0;
>> +	lim->atomic_write_unit_max_sectors = 0;
>> +	lim->atomic_write_max_sectors = 0;
>> +	lim->atomic_write_boundary_sectors = 0;
>
> Can we move the four into single structure and setup them in single
> API? Then cross-validation can be done in this API.

Why would we put them in a separate struct? We don't do that for any of
the other queue_limits.
Martin K. Petersen Dec. 14, 2023, 4:38 a.m. UTC | #6
Ming,

> Relying on driver to provide sound value is absolutely bad design from
> API viewpoint.

All the other queue_limits are validated by the LLDs. It's challenging
to lift that validation to the block layer since the values reported are
heavily protocol-dependent and thus information is lost if we do it
somewhere else.
Ming Lei Dec. 14, 2023, 1:46 p.m. UTC | #7
Hi Martin,

On Wed, Dec 13, 2023 at 11:38:10PM -0500, Martin K. Petersen wrote:
> 
> Ming,

On Thu, Dec 14, 2023 at 12:35 PM Martin K. Petersen <martin.petersen@oracle.com> wrote:
>
>
> Hi Ming!
>
> >> +    lim->atomic_write_unit_min_sectors = 0;
> >> +    lim->atomic_write_unit_max_sectors = 0;
> >> +    lim->atomic_write_max_sectors = 0;
> >> +    lim->atomic_write_boundary_sectors = 0;
> >
> > Can we move the four into single structure and setup them in single
> > API? Then cross-validation can be done in this API.
>
> Why would we put them in a separate struct? We don't do that for any of
> the other queue_limits.

All the four limits are for same purpose of supporting atomic-write, and
there can many benefits to define single API to setup atomic parameters:

1) common logic can be put into single place, such as running
cross-validation among them and setting up default value, and it is impossible
to do that by the way in this patch

2) all limits are supposed to setup once by driver in same place, so
doing them in single API actually simplifies driver and block layer, and
API itself becomes less fragile

3) it is easier for trace or troubleshoot

> 
> > Relying on driver to provide sound value is absolutely bad design from
> > API viewpoint.
> 
> All the other queue_limits are validated by the LLDs. It's challenging
> to lift that validation to the block layer since the values reported are
> heavily protocol-dependent and

After atomic limits are put into block layer, it becomes not driver
specific any more, scsi and nvme are going to support it first, sooner
or later, other drivers(dm & md, loop or ublk, ...) may try to support it.

Also in theory, they are not protocol-dependent, usually physical block size is
the minimized atomic-write unit, now John's patches are trying to support
bigger atomic-write unit as scsi/nvme's protocol supports it, and the concept
is actually common in disk. Similar in implementation level too, such as,
for NAND flash based storage, I guess atomic-write should be supported by atomic
update of FTL's LBA/PBA mapping.

> thus information is lost if we do it somewhere else.

Block layer is only focusing on common logic, such as the four limits
added in request queue, which are consumed by block layer and related with
other generic limits(physical block size, max IO size), and it won't be
same with driver's internal validation.


Thanks,
Ming
Christoph Hellwig Dec. 14, 2023, 4:12 p.m. UTC | #8
On Wed, Dec 13, 2023 at 09:25:38AM +0800, Ming Lei wrote:

<full quote deleted, please only quote what is relevant.


> >  	lim->dma_alignment = 511;
> > +	lim->atomic_write_unit_min_sectors = 0;
> > +	lim->atomic_write_unit_max_sectors = 0;
> > +	lim->atomic_write_max_sectors = 0;
> > +	lim->atomic_write_boundary_sectors = 0;
> 
> Can we move the four into single structure and setup them in single
> API? Then cross-validation can be done in this API.

Please don't be arbitrarily different from all the other limits.  What
we really should be doing is an API that updates all limits at the
same time, and I actually have code for that, I'll just need to finish
it.  I do not want to block this series for it, though.
diff mbox series

Patch

diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 1fe9a553c37b..ba81a081522f 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -21,6 +21,53 @@  Description:
 		device is offset from the internal allocation unit's
 		natural alignment.
 
+What:		/sys/block/<disk>/atomic_write_max_bytes
+Date:		May 2023
+Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
+Description:
+		[RO] This parameter specifies the maximum atomic write
+		size reported by the device. This parameter is relevant
+		for merging of writes, where a merged atomic write
+		operation must not exceed this number of bytes.
+		The atomic_write_max_bytes may exceed the value in
+		atomic_write_unit_max_bytes if atomic_write_max_bytes
+		is not a power-of-two or atomic_write_unit_max_bytes is
+		limited by some queue limits, such as max_segments.
+
+
+What:		/sys/block/<disk>/atomic_write_unit_min_bytes
+Date:		May 2023
+Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
+Description:
+		[RO] This parameter specifies the smallest block which can
+		be written atomically with an atomic write operation. All
+		atomic write operations must begin at a
+		atomic_write_unit_min boundary and must be multiples of
+		atomic_write_unit_min. This value must be a power-of-two.
+
+
+What:		/sys/block/<disk>/atomic_write_unit_max_bytes
+Date:		January 2023
+Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
+Description:
+		[RO] This parameter defines the largest block which can be
+		written atomically with an atomic write operation. This
+		value must be a multiple of atomic_write_unit_min and must
+		be a power-of-two.
+
+
+What:		/sys/block/<disk>/atomic_write_boundary_bytes
+Date:		May 2023
+Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
+Description:
+		[RO] A device may need to internally split I/Os which
+		straddle a given logical block address boundary. In that
+		case a single atomic write operation will be processed as
+		one of more sub-operations which each complete atomically.
+		This parameter specifies the size in bytes of the atomic
+		boundary if one is reported by the device. This value must
+		be a power-of-two.
+
 
 What:		/sys/block/<disk>/diskseq
 Date:		February 2021
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 0046b447268f..d151be394c98 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -59,6 +59,10 @@  void blk_set_default_limits(struct queue_limits *lim)
 	lim->zoned = BLK_ZONED_NONE;
 	lim->zone_write_granularity = 0;
 	lim->dma_alignment = 511;
+	lim->atomic_write_unit_min_sectors = 0;
+	lim->atomic_write_unit_max_sectors = 0;
+	lim->atomic_write_max_sectors = 0;
+	lim->atomic_write_boundary_sectors = 0;
 }
 
 /**
@@ -183,6 +187,62 @@  void blk_queue_max_discard_sectors(struct request_queue *q,
 }
 EXPORT_SYMBOL(blk_queue_max_discard_sectors);
 
+/**
+ * blk_queue_atomic_write_max_bytes - set max bytes supported by
+ * the device for atomic write operations.
+ * @q:  the request queue for the device
+ * @size: maximum bytes supported
+ */
+void blk_queue_atomic_write_max_bytes(struct request_queue *q,
+				      unsigned int bytes)
+{
+	q->limits.atomic_write_max_sectors = bytes >> SECTOR_SHIFT;
+}
+EXPORT_SYMBOL(blk_queue_atomic_write_max_bytes);
+
+/**
+ * blk_queue_atomic_write_boundary_bytes - Device's logical block address space
+ * which an atomic write should not cross.
+ * @q:  the request queue for the device
+ * @bytes: must be a power-of-two.
+ */
+void blk_queue_atomic_write_boundary_bytes(struct request_queue *q,
+					   unsigned int bytes)
+{
+	q->limits.atomic_write_boundary_sectors = bytes >> SECTOR_SHIFT;
+}
+EXPORT_SYMBOL(blk_queue_atomic_write_boundary_bytes);
+
+/**
+ * blk_queue_atomic_write_unit_min_sectors - smallest unit that can be written
+ * atomically to the device.
+ * @q:  the request queue for the device
+ * @sectors: must be a power-of-two.
+ */
+void blk_queue_atomic_write_unit_min_sectors(struct request_queue *q,
+					     unsigned int sectors)
+{
+	struct queue_limits *limits = &q->limits;
+
+	limits->atomic_write_unit_min_sectors = sectors;
+}
+EXPORT_SYMBOL(blk_queue_atomic_write_unit_min_sectors);
+
+/*
+ * blk_queue_atomic_write_unit_max_sectors - largest unit that can be written
+ * atomically to the device.
+ * @q: the request queue for the device
+ * @sectors: must be a power-of-two.
+ */
+void blk_queue_atomic_write_unit_max_sectors(struct request_queue *q,
+					     unsigned int sectors)
+{
+	struct queue_limits *limits = &q->limits;
+
+	limits->atomic_write_unit_max_sectors = sectors;
+}
+EXPORT_SYMBOL(blk_queue_atomic_write_unit_max_sectors);
+
 /**
  * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase
  * @q:  the request queue for the device
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 0b2d04766324..4ebf148cf356 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -118,6 +118,30 @@  static ssize_t queue_max_discard_segments_show(struct request_queue *q,
 	return queue_var_show(queue_max_discard_segments(q), page);
 }
 
+static ssize_t queue_atomic_write_max_bytes_show(struct request_queue *q,
+						char *page)
+{
+	return queue_var_show(queue_atomic_write_max_bytes(q), page);
+}
+
+static ssize_t queue_atomic_write_boundary_show(struct request_queue *q,
+						char *page)
+{
+	return queue_var_show(queue_atomic_write_boundary_bytes(q), page);
+}
+
+static ssize_t queue_atomic_write_unit_min_show(struct request_queue *q,
+						char *page)
+{
+	return queue_var_show(queue_atomic_write_unit_min_bytes(q), page);
+}
+
+static ssize_t queue_atomic_write_unit_max_show(struct request_queue *q,
+						char *page)
+{
+	return queue_var_show(queue_atomic_write_unit_max_bytes(q), page);
+}
+
 static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(q->limits.max_integrity_segments, page);
@@ -507,6 +531,11 @@  QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes");
 QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes");
 QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
 
+QUEUE_RO_ENTRY(queue_atomic_write_max_bytes, "atomic_write_max_bytes");
+QUEUE_RO_ENTRY(queue_atomic_write_boundary, "atomic_write_boundary_bytes");
+QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes");
+QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
+
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
@@ -634,6 +663,10 @@  static struct attribute *queue_attrs[] = {
 	&queue_discard_max_entry.attr,
 	&queue_discard_max_hw_entry.attr,
 	&queue_discard_zeroes_data_entry.attr,
+	&queue_atomic_write_max_bytes_entry.attr,
+	&queue_atomic_write_boundary_entry.attr,
+	&queue_atomic_write_unit_min_entry.attr,
+	&queue_atomic_write_unit_max_entry.attr,
 	&queue_write_same_max_entry.attr,
 	&queue_write_zeroes_max_entry.attr,
 	&queue_zone_append_max_entry.attr,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 51fa7ffdee83..ab53163dd187 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -309,6 +309,11 @@  struct queue_limits {
 	unsigned int		discard_alignment;
 	unsigned int		zone_write_granularity;
 
+	unsigned int		atomic_write_boundary_sectors;
+	unsigned int		atomic_write_max_sectors;
+	unsigned int		atomic_write_unit_min_sectors;
+	unsigned int		atomic_write_unit_max_sectors;
+
 	unsigned short		max_segments;
 	unsigned short		max_integrity_segments;
 	unsigned short		max_discard_segments;
@@ -908,6 +913,14 @@  void blk_queue_zone_write_granularity(struct request_queue *q,
 				      unsigned int size);
 extern void blk_queue_alignment_offset(struct request_queue *q,
 				       unsigned int alignment);
+void blk_queue_atomic_write_max_bytes(struct request_queue *q,
+				unsigned int bytes);
+void blk_queue_atomic_write_unit_max_sectors(struct request_queue *q,
+				unsigned int sectors);
+void blk_queue_atomic_write_unit_min_sectors(struct request_queue *q,
+				unsigned int sectors);
+void blk_queue_atomic_write_boundary_bytes(struct request_queue *q,
+				unsigned int bytes);
 void disk_update_readahead(struct gendisk *disk);
 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
 extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
@@ -1312,6 +1325,30 @@  static inline int queue_dma_alignment(const struct request_queue *q)
 	return q ? q->limits.dma_alignment : 511;
 }
 
+static inline unsigned int
+queue_atomic_write_unit_max_bytes(const struct request_queue *q)
+{
+	return q->limits.atomic_write_unit_max_sectors << SECTOR_SHIFT;
+}
+
+static inline unsigned int
+queue_atomic_write_unit_min_bytes(const struct request_queue *q)
+{
+	return q->limits.atomic_write_unit_min_sectors << SECTOR_SHIFT;
+}
+
+static inline unsigned int
+queue_atomic_write_boundary_bytes(const struct request_queue *q)
+{
+	return q->limits.atomic_write_boundary_sectors << SECTOR_SHIFT;
+}
+
+static inline unsigned int
+queue_atomic_write_max_bytes(const struct request_queue *q)
+{
+	return q->limits.atomic_write_max_sectors << SECTOR_SHIFT;
+}
+
 static inline unsigned int bdev_dma_alignment(struct block_device *bdev)
 {
 	return queue_dma_alignment(bdev_get_queue(bdev));