Lines Matching +full:write +full:- +full:to +full:- +full:write

1 // SPDX-License-Identifier: GPL-2.0
16 #include <linux/blk-mq.h>
25 #include "blk-mq-sched.h"
26 #include "blk-mq-debugfs.h"
42 * Per-zone write plug.
44 * @link: To list the plug in the zone write plug error list of the disk.
45 * @ref: Zone write plug reference counter. A zone write plug reference is
48 * submitted and when a function needs to manipulate a plug. The
51 * reference is dropped whenever the zone of the zone write plug is reset,
52 * finished and when the zone becomes full (last write BIO to the zone
54 * @lock: Spinlock to atomically manipulate the plug.
57 * @wp_offset: The zone write pointer location relative to the start of the zone
60 * @bio_work: Work struct to handle issuing of plugged BIOs
61 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
62 * @disk: The gendisk the plug belongs to.
79 * Zone write plug flags bits:
80 * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
81 * that is, that write BIOs are being throttled due to a write BIO already
82 * being executed or the zone write plug bio list is not empty.
83 * - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be
84 * recovered with a report zone to update the zone write pointer offset.
85 * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
86 * from the disk hash table and that the initial reference to the zone
87 * write plug set when the plug was first added to the hash table has been
89 * to prevent new references to the zone write plug to be taken for
90 * newly incoming BIOs. A zone write plug flagged with this flag will be
100 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
103 * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
119 * blkdev_report_zones - Get zones information
121 * @sector: Sector from which to report zones
122 * @nr_zones: Maximum number of zones to report
129 * To report all zones in a device starting from @sector, the BLK_ALL_ZONES
130 * constant can be passed to @nr_zones.
134 * Note: The caller must use memalloc_noXX_save/restore() calls to control
140 struct gendisk *disk = bdev->bd_disk; in blkdev_report_zones()
143 if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) in blkdev_report_zones()
144 return -EOPNOTSUPP; in blkdev_report_zones()
149 return disk->fops->report_zones(disk, sector, nr_zones, cb, data); in blkdev_report_zones()
162 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
164 * @op: Operation to be performed on the zones
165 * @sector: Start sector of the first zone to operate on
173 * The operation to execute on each zone can be a zone reset, open, close
186 return -EOPNOTSUPP; in blkdev_zone_mgmt()
189 return -EPERM; in blkdev_zone_mgmt()
192 return -EOPNOTSUPP; in blkdev_zone_mgmt()
196 return -EINVAL; in blkdev_zone_mgmt()
200 return -EINVAL; in blkdev_zone_mgmt()
203 return -EINVAL; in blkdev_zone_mgmt()
214 bio->bi_iter.bi_sector = sector; in blkdev_zone_mgmt()
217 /* This may take a while, so be nice to others */ in blkdev_zone_mgmt()
237 if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone))) in blkdev_copy_zone_to_user()
238 return -EFAULT; in blkdev_copy_zone_to_user()
255 return -EINVAL; in blkdev_report_zones_ioctl()
258 return -ENOTTY; in blkdev_report_zones_ioctl()
261 return -EFAULT; in blkdev_report_zones_ioctl()
264 return -EINVAL; in blkdev_report_zones_ioctl()
275 return -EFAULT; in blkdev_report_zones_ioctl()
284 if (zrange->sector + zrange->nr_sectors <= zrange->sector || in blkdev_truncate_zone_range()
285 zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) in blkdev_truncate_zone_range()
287 return -EINVAL; in blkdev_truncate_zone_range()
289 start = zrange->sector << SECTOR_SHIFT; in blkdev_truncate_zone_range()
290 end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; in blkdev_truncate_zone_range()
308 return -EINVAL; in blkdev_zone_mgmt_ioctl()
311 return -ENOTTY; in blkdev_zone_mgmt_ioctl()
314 return -EBADF; in blkdev_zone_mgmt_ioctl()
317 return -EFAULT; in blkdev_zone_mgmt_ioctl()
324 filemap_invalidate_lock(bdev->bd_mapping); in blkdev_zone_mgmt_ioctl()
339 return -ENOTTY; in blkdev_zone_mgmt_ioctl()
346 filemap_invalidate_unlock(bdev->bd_mapping); in blkdev_zone_mgmt_ioctl()
353 if (!disk->conv_zones_bitmap) in disk_zone_is_conv()
355 return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); in disk_zone_is_conv()
360 return zone->start + zone->len >= get_capacity(disk); in disk_zone_is_last()
366 if (zno < disk->nr_zones - 1) in disk_zone_is_full()
367 return offset_in_zone >= disk->zone_capacity; in disk_zone_is_full()
368 return offset_in_zone >= disk->last_zone_capacity; in disk_zone_is_full()
374 return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset); in disk_zone_wplug_is_full()
383 hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits); in disk_insert_zone_wplug()
386 * Add the new zone write plug to the hash table, but carefully as we in disk_insert_zone_wplug()
388 * zone write plug for the same zone. in disk_insert_zone_wplug()
390 spin_lock_irqsave(&disk->zone_wplugs_lock, flags); in disk_insert_zone_wplug()
391 hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { in disk_insert_zone_wplug()
392 if (zwplg->zone_no == zwplug->zone_no) { in disk_insert_zone_wplug()
393 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_insert_zone_wplug()
397 hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); in disk_insert_zone_wplug()
398 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_insert_zone_wplug()
407 unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); in disk_get_zone_wplug()
412 hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) { in disk_get_zone_wplug()
413 if (zwplug->zone_no == zno && in disk_get_zone_wplug()
414 atomic_inc_not_zero(&zwplug->ref)) { in disk_get_zone_wplug()
430 mempool_free(zwplug, zwplug->disk->zone_wplugs_pool); in disk_free_zone_wplug_rcu()
435 if (atomic_dec_and_test(&zwplug->ref)) { in disk_put_zone_wplug()
436 WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); in disk_put_zone_wplug()
437 WARN_ON_ONCE(!list_empty(&zwplug->link)); in disk_put_zone_wplug()
438 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); in disk_put_zone_wplug()
440 call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); in disk_put_zone_wplug()
447 /* If the zone write plug was already removed, we are done. */ in disk_should_remove_zone_wplug()
448 if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) in disk_should_remove_zone_wplug()
451 /* If the zone write plug is still busy, it cannot be removed. */ in disk_should_remove_zone_wplug()
452 if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) in disk_should_remove_zone_wplug()
460 * should not attempt to remove the zone write plug until all BIO in disk_should_remove_zone_wplug()
461 * completions are seen. Check by looking at the zone write plug in disk_should_remove_zone_wplug()
466 if (atomic_read(&zwplug->ref) > 2) in disk_should_remove_zone_wplug()
469 /* We can remove zone write plugs for zones that are empty or full. */ in disk_should_remove_zone_wplug()
470 return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug); in disk_should_remove_zone_wplug()
478 /* If the zone write plug was already removed, we have nothing to do. */ in disk_remove_zone_wplug()
479 if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) in disk_remove_zone_wplug()
483 * Mark the zone write plug as unhashed and drop the extra reference we in disk_remove_zone_wplug()
486 zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; in disk_remove_zone_wplug()
487 spin_lock_irqsave(&disk->zone_wplugs_lock, flags); in disk_remove_zone_wplug()
488 hlist_del_init_rcu(&zwplug->node); in disk_remove_zone_wplug()
489 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_remove_zone_wplug()
496 * Get a reference on the write plug for the zone containing @sector.
498 * Return a pointer to the zone write plug with the plug spinlock held.
512 * operation has not already removed the zone write plug from in disk_get_and_lock_zone_wplug()
514 * we need to get a new plug so start over from the beginning. in disk_get_and_lock_zone_wplug()
516 spin_lock_irqsave(&zwplug->lock, *flags); in disk_get_and_lock_zone_wplug()
517 if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { in disk_get_and_lock_zone_wplug()
518 spin_unlock_irqrestore(&zwplug->lock, *flags); in disk_get_and_lock_zone_wplug()
526 * Allocate and initialize a zone write plug with an extra reference in disk_get_and_lock_zone_wplug()
527 * so that it is not freed when the zone write plug becomes idle without in disk_get_and_lock_zone_wplug()
530 zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask); in disk_get_and_lock_zone_wplug()
534 INIT_HLIST_NODE(&zwplug->node); in disk_get_and_lock_zone_wplug()
535 INIT_LIST_HEAD(&zwplug->link); in disk_get_and_lock_zone_wplug()
536 atomic_set(&zwplug->ref, 2); in disk_get_and_lock_zone_wplug()
537 spin_lock_init(&zwplug->lock); in disk_get_and_lock_zone_wplug()
538 zwplug->flags = 0; in disk_get_and_lock_zone_wplug()
539 zwplug->zone_no = zno; in disk_get_and_lock_zone_wplug()
540 zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1); in disk_get_and_lock_zone_wplug()
541 bio_list_init(&zwplug->bio_list); in disk_get_and_lock_zone_wplug()
542 INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); in disk_get_and_lock_zone_wplug()
543 zwplug->disk = disk; in disk_get_and_lock_zone_wplug()
545 spin_lock_irqsave(&zwplug->lock, *flags); in disk_get_and_lock_zone_wplug()
548 * Insert the new zone write plug in the hash table. This can fail only in disk_get_and_lock_zone_wplug()
553 spin_unlock_irqrestore(&zwplug->lock, *flags); in disk_get_and_lock_zone_wplug()
554 mempool_free(zwplug, disk->zone_wplugs_pool); in disk_get_and_lock_zone_wplug()
564 struct request_queue *q = zwplug->disk->queue; in blk_zone_wplug_bio_io_error()
573 * Abort (fail) all plugged BIOs of a zone write plug.
579 while ((bio = bio_list_pop(&zwplug->bio_list))) in disk_zone_wplug_abort()
584 * Abort (fail) all plugged BIOs of a zone write plug that are not aligned
585 * with the assumed write pointer location of the zone when the BIO will
591 unsigned int wp_offset = zwplug->wp_offset; in disk_zone_wplug_abort_unaligned()
595 while ((bio = bio_list_pop(&zwplug->bio_list))) { in disk_zone_wplug_abort_unaligned()
596 if (disk_zone_is_full(disk, zwplug->zone_no, wp_offset) || in disk_zone_wplug_abort_unaligned()
607 bio_list_merge(&zwplug->bio_list, &bl); in disk_zone_wplug_abort_unaligned()
615 if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) in disk_zone_wplug_set_error()
619 * At this point, we already have a reference on the zone write plug. in disk_zone_wplug_set_error()
620 * However, since we are going to add the plug to the disk zone write in disk_zone_wplug_set_error()
626 zwplug->flags |= BLK_ZONE_WPLUG_ERROR; in disk_zone_wplug_set_error()
627 atomic_inc(&zwplug->ref); in disk_zone_wplug_set_error()
629 spin_lock_irqsave(&disk->zone_wplugs_lock, flags); in disk_zone_wplug_set_error()
630 list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list); in disk_zone_wplug_set_error()
631 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_zone_wplug_set_error()
639 if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) in disk_zone_wplug_clear_error()
644 * on the zone write plug after handling the error state. So remove the in disk_zone_wplug_clear_error()
646 * error handling has not yet started, that is, if the zone write plug in disk_zone_wplug_clear_error()
649 spin_lock_irqsave(&disk->zone_wplugs_lock, flags); in disk_zone_wplug_clear_error()
650 if (!list_empty(&zwplug->link)) { in disk_zone_wplug_clear_error()
651 list_del_init(&zwplug->link); in disk_zone_wplug_clear_error()
652 zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; in disk_zone_wplug_clear_error()
655 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_zone_wplug_clear_error()
659 * Set a zone write plug write pointer offset to either 0 (zone reset case)
660 * or to the zone size (zone finish case). This aborts all plugged BIOs, which
661 * is fine to do as doing a zone reset or zone finish while writes are in-flight
662 * is a mistake from the user which will most likely cause all plugged BIOs to
671 spin_lock_irqsave(&zwplug->lock, flags); in disk_zone_wplug_set_wp_offset()
677 if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { in disk_zone_wplug_set_wp_offset()
678 spin_unlock_irqrestore(&zwplug->lock, flags); in disk_zone_wplug_set_wp_offset()
682 /* Update the zone write pointer and abort all plugged BIOs. */ in disk_zone_wplug_set_wp_offset()
683 zwplug->wp_offset = wp_offset; in disk_zone_wplug_set_wp_offset()
687 * Updating the write pointer offset puts back the zone in disk_zone_wplug_set_wp_offset()
694 * The zone write plug now has no BIO plugged: remove it from the in disk_zone_wplug_set_wp_offset()
701 spin_unlock_irqrestore(&zwplug->lock, flags); in disk_zone_wplug_set_wp_offset()
707 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_wplug_handle_reset_or_finish()
708 sector_t sector = bio->bi_iter.bi_sector; in blk_zone_wplug_handle_reset_or_finish()
718 * If we have a zone write plug, set its write pointer offset to 0 in blk_zone_wplug_handle_reset_or_finish()
719 * (reset case) or to the zone size (finish case). This will abort all in blk_zone_wplug_handle_reset_or_finish()
721 * finishing zones while writes are still in-flight will result in the in blk_zone_wplug_handle_reset_or_finish()
735 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_wplug_handle_reset_all()
740 * Set the write pointer offset of all zone write plugs to 0. This will in blk_zone_wplug_handle_reset_all()
742 * are still in-flight will result in the writes failing anyway. in blk_zone_wplug_handle_reset_all()
745 sector += disk->queue->limits.chunk_sectors) { in blk_zone_wplug_handle_reset_all()
761 * This reference will be reused to submit a request for the BIO for in blk_zone_wplug_add_bio()
762 * blk-mq devices and dropped when the BIO is failed and after in blk_zone_wplug_add_bio()
763 * it is issued in the case of BIO-based devices. in blk_zone_wplug_add_bio()
765 percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter); in blk_zone_wplug_add_bio()
768 * The BIO is being plugged and thus will have to wait for the on-going in blk_zone_wplug_add_bio()
769 * write and for all other writes already plugged. So polling makes in blk_zone_wplug_add_bio()
775 * Reuse the poll cookie field to store the number of segments when in blk_zone_wplug_add_bio()
776 * split to the hardware limits. in blk_zone_wplug_add_bio()
778 bio->__bi_nr_segments = nr_segs; in blk_zone_wplug_add_bio()
781 * We always receive BIOs after they are split and ready to be issued. in blk_zone_wplug_add_bio()
783 * user must also issue write sequentially. So simply add the new BIO in blk_zone_wplug_add_bio()
784 * at the tail of the list to preserve the sequential write order. in blk_zone_wplug_add_bio()
786 bio_list_add(&zwplug->bio_list, bio); in blk_zone_wplug_add_bio()
799 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge(). in blk_zone_write_plug_bio_merged()
800 * For this case, we already hold a reference on the zone write plug for in blk_zone_write_plug_bio_merged()
802 * zone write pointer offset update. in blk_zone_write_plug_bio_merged()
810 * Get a reference on the zone write plug of the target zone and advance in blk_zone_write_plug_bio_merged()
811 * the zone write pointer offset. Given that this is a merge, we already in blk_zone_write_plug_bio_merged()
812 * have at least one request and one BIO referencing the zone write in blk_zone_write_plug_bio_merged()
815 zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk, in blk_zone_write_plug_bio_merged()
816 bio->bi_iter.bi_sector); in blk_zone_write_plug_bio_merged()
820 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_write_plug_bio_merged()
821 zwplug->wp_offset += bio_sectors(bio); in blk_zone_write_plug_bio_merged()
822 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_write_plug_bio_merged()
826 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
827 * already went through zone write plugging (either a new BIO or one that was
833 struct request_queue *q = req->q; in blk_zone_write_plug_init_request()
834 struct gendisk *disk = q->disk; in blk_zone_write_plug_init_request()
844 * Indicate that completion of this request needs to be handled with in blk_zone_write_plug_init_request()
846 * on the zone write plug we took above on entry to this function. in blk_zone_write_plug_init_request()
848 req->rq_flags |= RQF_ZONE_WRITE_PLUGGING; in blk_zone_write_plug_init_request()
854 * Walk through the list of plugged BIOs to check if they can be merged in blk_zone_write_plug_init_request()
857 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_write_plug_init_request()
859 bio = bio_list_peek(&zwplug->bio_list); in blk_zone_write_plug_init_request()
863 if (bio->bi_iter.bi_sector != req_back_sector || in blk_zone_write_plug_init_request()
868 !bio->__bi_nr_segments); in blk_zone_write_plug_init_request()
870 bio_list_pop(&zwplug->bio_list); in blk_zone_write_plug_init_request()
871 if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) != in blk_zone_write_plug_init_request()
873 bio_list_add_head(&zwplug->bio_list, bio); in blk_zone_write_plug_init_request()
879 * plugging the BIO and advance the write pointer offset. in blk_zone_write_plug_init_request()
882 zwplug->wp_offset += bio_sectors(bio); in blk_zone_write_plug_init_request()
886 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_write_plug_init_request()
890 * Check and prepare a BIO for submission by incrementing the write pointer
891 * offset of its zone write plug and changing zone append operations into
892 * regular write when zone append emulation is needed.
897 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_wplug_prepare_bio()
900 * Check that the user is not attempting to write to a full zone. in blk_zone_wplug_prepare_bio()
902 * write pointer offset beyond the end of the zone. in blk_zone_wplug_prepare_bio()
909 * Use a regular write starting at the current write pointer. in blk_zone_wplug_prepare_bio()
910 * Similarly to native zone append operations, do not allow in blk_zone_wplug_prepare_bio()
913 bio->bi_opf &= ~REQ_OP_MASK; in blk_zone_wplug_prepare_bio()
914 bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE; in blk_zone_wplug_prepare_bio()
915 bio->bi_iter.bi_sector += zwplug->wp_offset; in blk_zone_wplug_prepare_bio()
924 * Check for non-sequential writes early because we avoid a in blk_zone_wplug_prepare_bio()
926 * to the driver. in blk_zone_wplug_prepare_bio()
928 if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) in blk_zone_wplug_prepare_bio()
932 /* Advance the zone write pointer offset. */ in blk_zone_wplug_prepare_bio()
933 zwplug->wp_offset += bio_sectors(bio); in blk_zone_wplug_prepare_bio()
938 /* We detected an invalid write BIO: schedule error recovery. */ in blk_zone_wplug_prepare_bio()
940 kblockd_schedule_work(&disk->zone_wplugs_work); in blk_zone_wplug_prepare_bio()
946 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_wplug_handle_write()
947 sector_t sector = bio->bi_iter.bi_sector; in blk_zone_wplug_handle_write()
954 * zone write plug for the entire BIO. For blk-mq devices, the block in blk_zone_wplug_handle_write()
955 * layer should already have done any splitting required to ensure this in blk_zone_wplug_handle_write()
957 * BIO-based devices, it is the responsibility of the driver to split in blk_zone_wplug_handle_write()
965 /* Conventional zones do not need write plugging. */ in blk_zone_wplug_handle_write()
967 /* Zone append to conventional zones is not allowed. */ in blk_zone_wplug_handle_write()
975 if (bio->bi_opf & REQ_NOWAIT) in blk_zone_wplug_handle_write()
984 /* Indicate that this BIO is being handled using zone write plugging. */ in blk_zone_wplug_handle_write()
989 * to the plug BIO list. Otherwise, plug and let the BIO execute. in blk_zone_wplug_handle_write()
991 if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) in blk_zone_wplug_handle_write()
995 * If an error is detected when preparing the BIO, add it to the BIO in blk_zone_wplug_handle_write()
1001 zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; in blk_zone_wplug_handle_write()
1003 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_handle_write()
1008 zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; in blk_zone_wplug_handle_write()
1011 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_handle_write()
1017 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
1021 * Handle write, write zeroes and zone append operations requiring emulation
1022 * using zone write plugging.
1024 * Return true whenever @bio execution needs to be delayed through the zone
1025 * write plug. Otherwise, return false to let the submission path process
1030 struct block_device *bdev = bio->bi_bdev; in blk_zone_plug_bio()
1032 if (!bdev->bd_disk->zone_wplugs_hash) in blk_zone_plug_bio()
1044 * We do not need to do anything special for empty flush BIOs, e.g in blk_zone_plug_bio()
1046 * the responsibility of the user to first wait for the completion of in blk_zone_plug_bio()
1047 * write operations for flush to have any effect on the persistence of in blk_zone_plug_bio()
1050 if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) in blk_zone_plug_bio()
1054 * Regular writes and write zeroes need to be handled through the target in blk_zone_plug_bio()
1055 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH in blk_zone_plug_bio()
1056 * which may need to go through the flush machinery depending on the in blk_zone_plug_bio()
1060 * completion, which will handle zone write plugging. in blk_zone_plug_bio()
1063 * write BIOs. in blk_zone_plug_bio()
1065 * to correctly track the write pointer offset of zones. These commands in blk_zone_plug_bio()
1066 * are not plugged as we do not need serialization with write in blk_zone_plug_bio()
1067 * operations. It is the responsibility of the user to not issue reset in blk_zone_plug_bio()
1068 * and finish commands when write operations are in flight. in blk_zone_plug_bio()
1097 * Take a reference on the zone write plug and schedule the submission in disk_zone_wplug_schedule_bio_work()
1101 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); in disk_zone_wplug_schedule_bio_work()
1102 atomic_inc(&zwplug->ref); in disk_zone_wplug_schedule_bio_work()
1103 queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); in disk_zone_wplug_schedule_bio_work()
1111 spin_lock_irqsave(&zwplug->lock, flags); in disk_zone_wplug_unplug_bio()
1117 if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) { in disk_zone_wplug_unplug_bio()
1118 spin_unlock_irqrestore(&zwplug->lock, flags); in disk_zone_wplug_unplug_bio()
1119 kblockd_schedule_work(&disk->zone_wplugs_work); in disk_zone_wplug_unplug_bio()
1124 if (!bio_list_empty(&zwplug->bio_list)) { in disk_zone_wplug_unplug_bio()
1126 spin_unlock_irqrestore(&zwplug->lock, flags); in disk_zone_wplug_unplug_bio()
1130 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; in disk_zone_wplug_unplug_bio()
1134 * (it was reset), remove its zone write plug from the hash table. in disk_zone_wplug_unplug_bio()
1139 spin_unlock_irqrestore(&zwplug->lock, flags); in disk_zone_wplug_unplug_bio()
1144 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_write_plug_bio_endio()
1146 disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); in blk_zone_write_plug_bio_endio()
1156 * If this is a regular write emulating a zone append operation, in blk_zone_write_plug_bio_endio()
1160 bio->bi_opf &= ~REQ_OP_MASK; in blk_zone_write_plug_bio_endio()
1161 bio->bi_opf |= REQ_OP_ZONE_APPEND; in blk_zone_write_plug_bio_endio()
1165 * If the BIO failed, mark the plug as having an error to trigger in blk_zone_write_plug_bio_endio()
1168 if (bio->bi_status != BLK_STS_OK) { in blk_zone_write_plug_bio_endio()
1169 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_write_plug_bio_endio()
1171 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_write_plug_bio_endio()
1178 * For BIO-based devices, blk_zone_write_plug_finish_request() in blk_zone_write_plug_bio_endio()
1179 * is not called. So we need to schedule execution of the next in blk_zone_write_plug_bio_endio()
1182 if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) in blk_zone_write_plug_bio_endio()
1191 struct gendisk *disk = req->q->disk; in blk_zone_write_plug_finish_request()
1194 zwplug = disk_get_zone_wplug(disk, req->__sector); in blk_zone_write_plug_finish_request()
1198 req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING; in blk_zone_write_plug_finish_request()
1224 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_wplug_bio_work()
1226 bio = bio_list_pop(&zwplug->bio_list); in blk_zone_wplug_bio_work()
1228 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; in blk_zone_wplug_bio_work()
1229 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_bio_work()
1234 /* Error recovery will decide what to do with the BIO. */ in blk_zone_wplug_bio_work()
1235 bio_list_add_head(&zwplug->bio_list, bio); in blk_zone_wplug_bio_work()
1236 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_bio_work()
1240 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_bio_work()
1242 bdev = bio->bi_bdev; in blk_zone_wplug_bio_work()
1246 * blk-mq devices will reuse the extra reference on the request queue in blk_zone_wplug_bio_work()
1248 * path for BIO-based devices will not do that. So drop this extra in blk_zone_wplug_bio_work()
1252 blk_queue_exit(bdev->bd_disk->queue); in blk_zone_wplug_bio_work()
1261 switch (zone->cond) { in blk_zone_wp_offset()
1265 return zone->wp - zone->start; in blk_zone_wp_offset()
1267 return zone->len; in blk_zone_wp_offset()
1275 * Conventional, offline and read-only zones do not have a valid in blk_zone_wp_offset()
1276 * write pointer. in blk_zone_wp_offset()
1295 bdev_zone_sectors(disk->part0) * zwplug->zone_no; in disk_zone_wplug_handle_error()
1303 ret = disk->fops->report_zones(disk, zone_start_sector, 1, in disk_zone_wplug_handle_error()
1307 spin_lock_irqsave(&zwplug->lock, flags); in disk_zone_wplug_handle_error()
1311 * case, do nothing as the report zones may have seen the "old" write in disk_zone_wplug_handle_error()
1314 if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) in disk_zone_wplug_handle_error()
1317 zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; in disk_zone_wplug_handle_error()
1321 * We failed to get the zone information, meaning that something in disk_zone_wplug_handle_error()
1324 * plugged BIOs to complete if there is a queue freeze on-going. in disk_zone_wplug_handle_error()
1330 /* Update the zone write pointer offset. */ in disk_zone_wplug_handle_error()
1331 zwplug->wp_offset = blk_zone_wp_offset(&zone); in disk_zone_wplug_handle_error()
1335 if (!bio_list_empty(&zwplug->bio_list)) { in disk_zone_wplug_handle_error()
1341 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; in disk_zone_wplug_handle_error()
1346 spin_unlock_irqrestore(&zwplug->lock, flags); in disk_zone_wplug_handle_error()
1356 spin_lock_irqsave(&disk->zone_wplugs_lock, flags); in disk_zone_wplugs_work()
1358 while (!list_empty(&disk->zone_wplugs_err_list)) { in disk_zone_wplugs_work()
1359 zwplug = list_first_entry(&disk->zone_wplugs_err_list, in disk_zone_wplugs_work()
1361 list_del_init(&zwplug->link); in disk_zone_wplugs_work()
1362 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_zone_wplugs_work()
1367 spin_lock_irqsave(&disk->zone_wplugs_lock, flags); in disk_zone_wplugs_work()
1370 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_zone_wplugs_work()
1375 return 1U << disk->zone_wplugs_hash_bits; in disk_zone_wplugs_hash_size()
1380 spin_lock_init(&disk->zone_wplugs_lock); in disk_init_zone_resources()
1381 INIT_LIST_HEAD(&disk->zone_wplugs_err_list); in disk_init_zone_resources()
1382 INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work); in disk_init_zone_resources()
1386 * For the size of a disk zone write plug hash table, use the size of the
1387 * zone write plug mempool, which is the maximum of the disk open zones and
1389 * 9 bits. For a disk that has no limits, mempool size defaults to 128.
1399 disk->zone_wplugs_hash_bits = in disk_alloc_zone_resources()
1402 disk->zone_wplugs_hash = in disk_alloc_zone_resources()
1405 if (!disk->zone_wplugs_hash) in disk_alloc_zone_resources()
1406 return -ENOMEM; in disk_alloc_zone_resources()
1409 INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]); in disk_alloc_zone_resources()
1411 disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size, in disk_alloc_zone_resources()
1413 if (!disk->zone_wplugs_pool) in disk_alloc_zone_resources()
1416 disk->zone_wplugs_wq = in disk_alloc_zone_resources()
1418 pool_size, disk->disk_name); in disk_alloc_zone_resources()
1419 if (!disk->zone_wplugs_wq) in disk_alloc_zone_resources()
1425 mempool_destroy(disk->zone_wplugs_pool); in disk_alloc_zone_resources()
1426 disk->zone_wplugs_pool = NULL; in disk_alloc_zone_resources()
1428 kfree(disk->zone_wplugs_hash); in disk_alloc_zone_resources()
1429 disk->zone_wplugs_hash = NULL; in disk_alloc_zone_resources()
1430 disk->zone_wplugs_hash_bits = 0; in disk_alloc_zone_resources()
1431 return -ENOMEM; in disk_alloc_zone_resources()
1439 if (!disk->zone_wplugs_hash) in disk_destroy_zone_wplugs_hash_table()
1442 /* Free all the zone write plugs we have. */ in disk_destroy_zone_wplugs_hash_table()
1444 while (!hlist_empty(&disk->zone_wplugs_hash[i])) { in disk_destroy_zone_wplugs_hash_table()
1445 zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, in disk_destroy_zone_wplugs_hash_table()
1447 atomic_inc(&zwplug->ref); in disk_destroy_zone_wplugs_hash_table()
1453 kfree(disk->zone_wplugs_hash); in disk_destroy_zone_wplugs_hash_table()
1454 disk->zone_wplugs_hash = NULL; in disk_destroy_zone_wplugs_hash_table()
1455 disk->zone_wplugs_hash_bits = 0; in disk_destroy_zone_wplugs_hash_table()
1460 if (!disk->zone_wplugs_pool) in disk_free_zone_resources()
1463 cancel_work_sync(&disk->zone_wplugs_work); in disk_free_zone_resources()
1465 if (disk->zone_wplugs_wq) { in disk_free_zone_resources()
1466 destroy_workqueue(disk->zone_wplugs_wq); in disk_free_zone_resources()
1467 disk->zone_wplugs_wq = NULL; in disk_free_zone_resources()
1473 * Wait for the zone write plugs to be RCU-freed before in disk_free_zone_resources()
1478 mempool_destroy(disk->zone_wplugs_pool); in disk_free_zone_resources()
1479 disk->zone_wplugs_pool = NULL; in disk_free_zone_resources()
1481 bitmap_free(disk->conv_zones_bitmap); in disk_free_zone_resources()
1482 disk->conv_zones_bitmap = NULL; in disk_free_zone_resources()
1483 disk->zone_capacity = 0; in disk_free_zone_resources()
1484 disk->last_zone_capacity = 0; in disk_free_zone_resources()
1485 disk->nr_zones = 0; in disk_free_zone_resources()
1492 * can automatically handle write BIO plugging. BIO-based device drivers in disk_need_zone_resources()
1493 * (e.g. DM devices) are normally responsible for handling zone write in disk_need_zone_resources()
1497 return queue_is_mq(disk->queue) || in disk_need_zone_resources()
1498 queue_emulates_zone_append(disk->queue); in disk_need_zone_resources()
1504 struct queue_limits *lim = &disk->queue->limits; in disk_revalidate_zone_resources()
1514 pool_size = max(lim->max_open_zones, lim->max_active_zones); in disk_revalidate_zone_resources()
1518 if (!disk->zone_wplugs_hash) in disk_revalidate_zone_resources()
1540 struct request_queue *q = disk->queue; in disk_update_zone_resources()
1545 disk->nr_zones = args->nr_zones; in disk_update_zone_resources()
1546 disk->zone_capacity = args->zone_capacity; in disk_update_zone_resources()
1547 disk->last_zone_capacity = args->last_zone_capacity; in disk_update_zone_resources()
1548 swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); in disk_update_zone_resources()
1549 if (disk->conv_zones_bitmap) in disk_update_zone_resources()
1550 nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap, in disk_update_zone_resources()
1551 disk->nr_zones); in disk_update_zone_resources()
1552 if (nr_conv_zones >= disk->nr_zones) { in disk_update_zone_resources()
1554 disk->disk_name, nr_conv_zones, disk->nr_zones); in disk_update_zone_resources()
1555 return -ENODEV; in disk_update_zone_resources()
1566 nr_seq_zones = disk->nr_zones - nr_conv_zones; in disk_update_zone_resources()
1572 if (!disk->zone_wplugs_pool) in disk_update_zone_resources()
1577 * zones, set its max open zone limit to the mempool size to indicate in disk_update_zone_resources()
1578 * to the user that there is a potential performance impact due to in disk_update_zone_resources()
1579 * dynamic zone write plug allocation when simultaneously writing to in disk_update_zone_resources()
1586 mempool_resize(disk->zone_wplugs_pool, pool_size); in disk_update_zone_resources()
1602 struct gendisk *disk = args->disk; in blk_revalidate_conv_zone()
1604 if (zone->capacity != zone->len) { in blk_revalidate_conv_zone()
1606 disk->disk_name); in blk_revalidate_conv_zone()
1607 return -ENODEV; in blk_revalidate_conv_zone()
1611 args->last_zone_capacity = zone->capacity; in blk_revalidate_conv_zone()
1616 if (!args->conv_zones_bitmap) { in blk_revalidate_conv_zone()
1617 args->conv_zones_bitmap = in blk_revalidate_conv_zone()
1618 bitmap_zalloc(args->nr_zones, GFP_NOIO); in blk_revalidate_conv_zone()
1619 if (!args->conv_zones_bitmap) in blk_revalidate_conv_zone()
1620 return -ENOMEM; in blk_revalidate_conv_zone()
1623 set_bit(idx, args->conv_zones_bitmap); in blk_revalidate_conv_zone()
1631 struct gendisk *disk = args->disk; in blk_revalidate_seq_zone()
1641 if (!args->zone_capacity) in blk_revalidate_seq_zone()
1642 args->zone_capacity = zone->capacity; in blk_revalidate_seq_zone()
1644 args->last_zone_capacity = zone->capacity; in blk_revalidate_seq_zone()
1645 } else if (zone->capacity != args->zone_capacity) { in blk_revalidate_seq_zone()
1647 disk->disk_name); in blk_revalidate_seq_zone()
1648 return -ENODEV; in blk_revalidate_seq_zone()
1652 * We need to track the write pointer of all zones that are not in blk_revalidate_seq_zone()
1653 * empty nor full. So make sure we have a zone write plug for in blk_revalidate_seq_zone()
1654 * such zone if the device has a zone write plug hash table. in blk_revalidate_seq_zone()
1656 if (!disk->zone_wplugs_hash) in blk_revalidate_seq_zone()
1660 if (!wp_offset || wp_offset >= zone->capacity) in blk_revalidate_seq_zone()
1663 zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags); in blk_revalidate_seq_zone()
1665 return -ENOMEM; in blk_revalidate_seq_zone()
1666 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_revalidate_seq_zone()
1673 * Helper function to check the validity of zones of a zoned block device.
1679 struct gendisk *disk = args->disk; in blk_revalidate_zone_cb()
1680 sector_t zone_sectors = disk->queue->limits.chunk_sectors; in blk_revalidate_zone_cb()
1684 if (zone->start != args->sector) { in blk_revalidate_zone_cb()
1686 disk->disk_name, args->sector, zone->start); in blk_revalidate_zone_cb()
1687 return -ENODEV; in blk_revalidate_zone_cb()
1690 if (zone->start >= get_capacity(disk) || !zone->len) { in blk_revalidate_zone_cb()
1692 disk->disk_name, zone->start, zone->len); in blk_revalidate_zone_cb()
1693 return -ENODEV; in blk_revalidate_zone_cb()
1701 if (zone->len != zone_sectors) { in blk_revalidate_zone_cb()
1703 disk->disk_name); in blk_revalidate_zone_cb()
1704 return -ENODEV; in blk_revalidate_zone_cb()
1706 } else if (zone->len > zone_sectors) { in blk_revalidate_zone_cb()
1708 disk->disk_name); in blk_revalidate_zone_cb()
1709 return -ENODEV; in blk_revalidate_zone_cb()
1712 if (!zone->capacity || zone->capacity > zone->len) { in blk_revalidate_zone_cb()
1714 disk->disk_name); in blk_revalidate_zone_cb()
1715 return -ENODEV; in blk_revalidate_zone_cb()
1719 switch (zone->type) { in blk_revalidate_zone_cb()
1729 disk->disk_name, (int)zone->type, zone->start); in blk_revalidate_zone_cb()
1730 ret = -ENODEV; in blk_revalidate_zone_cb()
1734 args->sector += zone->len; in blk_revalidate_zone_cb()
1740 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
1743 * Helper function for low-level device drivers to check, (re) allocate and
1745 * normally be called by blk-mq based drivers when a zoned gendisk is probed
1754 struct request_queue *q = disk->queue; in blk_revalidate_disk_zones()
1755 sector_t zone_sectors = q->limits.chunk_sectors; in blk_revalidate_disk_zones()
1759 int ret = -ENOMEM; in blk_revalidate_disk_zones()
1762 return -EIO; in blk_revalidate_disk_zones()
1765 return -ENODEV; in blk_revalidate_disk_zones()
1773 disk->disk_name, zone_sectors); in blk_revalidate_disk_zones()
1774 return -ENODEV; in blk_revalidate_disk_zones()
1779 disk->disk_name); in blk_revalidate_disk_zones()
1780 return -ENODEV; in blk_revalidate_disk_zones()
1788 args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); in blk_revalidate_disk_zones()
1795 ret = disk->fops->report_zones(disk, 0, UINT_MAX, in blk_revalidate_disk_zones()
1798 pr_warn("%s: No zones reported\n", disk->disk_name); in blk_revalidate_disk_zones()
1799 ret = -ENODEV; in blk_revalidate_disk_zones()
1809 disk->disk_name, args.sector); in blk_revalidate_disk_zones()
1810 ret = -ENODEV; in blk_revalidate_disk_zones()
1821 pr_warn("%s: failed to revalidate zones\n", disk->disk_name); in blk_revalidate_disk_zones()
1837 struct gendisk *disk = q->disk; in queue_zone_wplugs_show()
1844 if (!disk->zone_wplugs_hash) in queue_zone_wplugs_show()
1850 &disk->zone_wplugs_hash[i], node) { in queue_zone_wplugs_show()
1851 spin_lock_irqsave(&zwplug->lock, flags); in queue_zone_wplugs_show()
1852 zwp_zone_no = zwplug->zone_no; in queue_zone_wplugs_show()
1853 zwp_flags = zwplug->flags; in queue_zone_wplugs_show()
1854 zwp_ref = atomic_read(&zwplug->ref); in queue_zone_wplugs_show()
1855 zwp_wp_offset = zwplug->wp_offset; in queue_zone_wplugs_show()
1856 zwp_bio_list_size = bio_list_size(&zwplug->bio_list); in queue_zone_wplugs_show()
1857 spin_unlock_irqrestore(&zwplug->lock, flags); in queue_zone_wplugs_show()