aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2024-05-02 11:56:13 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2024-05-02 11:56:14 +1000
commitce38cb5a2ad4c3b2ba5e7dbc4e792917ade6e9e5 (patch)
treefc61701008a8a2b2bd34222bbae06ad616f49700
parent68ecd84391581e9d2a9627bc34aac7b4df65b899 (diff)
parenta9af3696fa179262c1e5305a94b7afc7d93114b1 (diff)
downloadlinux-next-history-ce38cb5a2ad4c3b2ba5e7dbc4e792917ade6e9e5.tar.gz
Merge branch 'for-next' of git://git.kernel.dk/linux-block.git
Notice: this object is not reachable from any branch.
Notice: this object is not reachable from any branch.
-rw-r--r--block/Kconfig5
-rw-r--r--block/Makefile1
-rw-r--r--block/bio.c2
-rw-r--r--block/blk-cgroup-rwstat.c18
-rw-r--r--block/blk-cgroup.c3
-rw-r--r--block/blk-core.c11
-rw-r--r--block/blk-flush.c2
-rw-r--r--block/blk-merge.c23
-rw-r--r--block/blk-mq-debugfs-zoned.c22
-rw-r--r--block/blk-mq-debugfs.c3
-rw-r--r--block/blk-mq-debugfs.h6
-rw-r--r--block/blk-mq.c180
-rw-r--r--block/blk-mq.h31
-rw-r--r--block/blk-settings.c46
-rw-r--r--block/blk-sysfs.c2
-rw-r--r--block/blk-throttle.c51
-rw-r--r--block/blk-zoned.c1508
-rw-r--r--block/blk.h84
-rw-r--r--block/elevator.c46
-rw-r--r--block/elevator.h1
-rw-r--r--block/fops.c29
-rw-r--r--block/genhd.c3
-rw-r--r--block/mq-deadline.c204
-rw-r--r--drivers/block/brd.c40
-rw-r--r--drivers/block/null_blk/main.c42
-rw-r--r--drivers/block/null_blk/null_blk.h2
-rw-r--r--drivers/block/null_blk/zoned.c358
-rw-r--r--drivers/block/ublk_drv.c5
-rw-r--r--drivers/block/virtio_blk.c2
-rw-r--r--drivers/md/dm-bio-prison-v2.c3
-rw-r--r--drivers/md/dm-cache-target.c12
-rw-r--r--drivers/md/dm-clone-target.c14
-rw-r--r--drivers/md/dm-core.h2
-rw-r--r--drivers/md/dm-era-target.c3
-rw-r--r--drivers/md/dm-mpath.c3
-rw-r--r--drivers/md/dm-table.c3
-rw-r--r--drivers/md/dm-thin.c12
-rw-r--r--drivers/md/dm-vdo/data-vio.c3
-rw-r--r--drivers/md/dm-vdo/flush.c3
-rw-r--r--drivers/md/dm-zone.c501
-rw-r--r--drivers/md/dm.c72
-rw-r--r--drivers/md/dm.h2
-rw-r--r--drivers/md/md.c14
-rw-r--r--drivers/md/md.h5
-rw-r--r--drivers/md/raid5.c15
-rw-r--r--drivers/nvme/host/core.c2
-rw-r--r--drivers/nvme/host/ioctl.c15
-rw-r--r--drivers/nvme/target/zns.c10
-rw-r--r--drivers/scsi/scsi_lib.c1
-rw-r--r--drivers/scsi/sd.c8
-rw-r--r--drivers/scsi/sd.h19
-rw-r--r--drivers/scsi/sd_zbc.c335
-rw-r--r--fs/btrfs/raid56.c3
-rw-r--r--include/linux/bio.h7
-rw-r--r--include/linux/blk-mq.h85
-rw-r--r--include/linux/blk_types.h30
-rw-r--r--include/linux/blkdev.h105
-rw-r--r--include/linux/io_uring.h6
-rw-r--r--include/linux/io_uring/cmd.h24
-rw-r--r--include/linux/io_uring/net.h18
-rw-r--r--include/linux/io_uring_types.h19
-rw-r--r--include/uapi/linux/io_uring.h28
-rw-r--r--io_uring/Makefile15
-rw-r--r--io_uring/alloc_cache.h59
-rw-r--r--io_uring/cancel.c4
-rw-r--r--io_uring/fdinfo.c4
-rw-r--r--io_uring/futex.c30
-rw-r--r--io_uring/futex.h5
-rw-r--r--io_uring/io-wq.c20
-rw-r--r--io_uring/io_uring.c665
-rw-r--r--io_uring/io_uring.h33
-rw-r--r--io_uring/kbuf.c318
-rw-r--r--io_uring/kbuf.h64
-rw-r--r--io_uring/memmap.c336
-rw-r--r--io_uring/memmap.h25
-rw-r--r--io_uring/msg_ring.c12
-rw-r--r--io_uring/net.c830
-rw-r--r--io_uring/net.h29
-rw-r--r--io_uring/notif.c112
-rw-r--r--io_uring/notif.h13
-rw-r--r--io_uring/opdef.c65
-rw-r--r--io_uring/opdef.h9
-rw-r--r--io_uring/poll.c15
-rw-r--r--io_uring/poll.h9
-rw-r--r--io_uring/refs.h7
-rw-r--r--io_uring/register.c3
-rw-r--r--io_uring/rsrc.c47
-rw-r--r--io_uring/rsrc.h13
-rw-r--r--io_uring/rw.c585
-rw-r--r--io_uring/rw.h25
-rw-r--r--io_uring/sqpoll.c8
-rw-r--r--io_uring/timeout.c9
-rw-r--r--io_uring/uring_cmd.c122
-rw-r--r--io_uring/uring_cmd.h8
-rw-r--r--io_uring/waitid.c2
-rw-r--r--lib/sbitmap.c8
-rw-r--r--mm/nommu.c7
-rw-r--r--net/socket.c2
98 files changed, 4111 insertions, 3519 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 1de4682d48ccbd..d47398ae98243d 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -100,7 +100,6 @@ config BLK_DEV_WRITE_MOUNTED
config BLK_DEV_ZONED
bool "Zoned block device support"
- select MQ_IOSCHED_DEADLINE
help
Block layer zoned block device support. This option enables
support for ZAC/ZBC/ZNS host-managed and host-aware zoned block
@@ -198,10 +197,6 @@ config BLK_DEBUG_FS
Unless you are building a kernel for a tiny system, you should
say Y here.
-config BLK_DEBUG_FS_ZONED
- bool
- default BLK_DEBUG_FS && BLK_DEV_ZONED
-
config BLK_SED_OPAL
bool "Logic for interfacing with Opal enabled SEDs"
depends on KEYS
diff --git a/block/Makefile b/block/Makefile
index 46ada9dc8bbfe2..168150b9c51025 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -33,7 +33,6 @@ obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
-obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
diff --git a/block/bio.c b/block/bio.c
index 19ba3aeacdbbeb..3f3977c6999718 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1577,6 +1577,8 @@ again:
if (!bio_integrity_endio(bio))
return;
+ blk_zone_bio_endio(bio);
+
rq_qos_done_bio(bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c
index 3304e841df7ce9..a55fb0c535582b 100644
--- a/block/blk-cgroup-rwstat.c
+++ b/block/blk-cgroup-rwstat.c
@@ -9,25 +9,19 @@ int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
{
int i, ret;
- for (i = 0; i < BLKG_RWSTAT_NR; i++) {
- ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
- if (ret) {
- while (--i >= 0)
- percpu_counter_destroy(&rwstat->cpu_cnt[i]);
- return ret;
- }
+ ret = percpu_counter_init_many(rwstat->cpu_cnt, 0, gfp, BLKG_RWSTAT_NR);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
atomic64_set(&rwstat->aux_cnt[i], 0);
- }
return 0;
}
EXPORT_SYMBOL_GPL(blkg_rwstat_init);
void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
{
- int i;
-
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
- percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+ percpu_counter_destroy_many(rwstat->cpu_cnt, BLKG_RWSTAT_NR);
}
EXPORT_SYMBOL_GPL(blkg_rwstat_exit);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 059467086b1312..26c5bf9bc22a3d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -218,8 +218,7 @@ static void blkg_async_bio_workfn(struct work_struct *work)
/* as long as there are pending bios, @blkg can't go away */
spin_lock(&blkg->async_bio_lock);
- bio_list_merge(&bios, &blkg->async_bios);
- bio_list_init(&blkg->async_bios);
+ bio_list_merge_init(&bios, &blkg->async_bios);
spin_unlock(&blkg->async_bio_lock);
/* start plug only when bio_list contains at least 2 bios */
diff --git a/block/blk-core.c b/block/blk-core.c
index b795ac177281ad..ba4e0cba12e131 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -591,8 +591,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_NOTSUPP;
/* The bio sector must point to the start of a sequential zone */
- if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) ||
- !bio_zone_is_seq(bio))
+ if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector))
return BLK_STS_IOERR;
/*
@@ -604,7 +603,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_IOERR;
/* Make sure the BIO is small enough and will not get split */
- if (nr_sectors > q->limits.max_zone_append_sectors)
+ if (nr_sectors > queue_max_zone_append_sectors(q))
return BLK_STS_IOERR;
bio->bi_opf |= REQ_NOMERGE;
@@ -910,12 +909,6 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0;
- /*
- * As the requests that require a zone lock are not plugged in the
- * first place, directly accessing the plug instead of using
- * blk_mq_plug() should not have any consequences during flushing for
- * zoned devices.
- */
blk_flush_plug(current->plug, false);
/*
diff --git a/block/blk-flush.c b/block/blk-flush.c
index b0f314f4bc1493..c17cf8ed8113db 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,6 +130,8 @@ static void blk_flush_restore_request(struct request *rq)
* original @rq->bio. Restore it.
*/
rq->bio = rq->biotail;
+ if (rq->bio)
+ rq->__sector = rq->bio->bi_iter.bi_sector;
/* make @rq a normal request */
rq->rq_flags &= ~RQF_FLUSH_SEQ;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4e3483a16b7575..f64115d72f3d43 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -377,6 +377,7 @@ struct bio *__bio_split_to_limits(struct bio *bio,
blkcg_bio_issue_init(split);
bio_chain(split, bio);
trace_block_split(split, bio->bi_iter.bi_sector);
+ WARN_ON_ONCE(bio_zone_write_plugging(bio));
submit_bio_noacct(bio);
return split;
}
@@ -972,13 +973,7 @@ static void blk_account_io_merge_bio(struct request *req)
part_stat_unlock();
}
-enum bio_merge_status {
- BIO_MERGE_OK,
- BIO_MERGE_NONE,
- BIO_MERGE_FAILED,
-};
-
-static enum bio_merge_status bio_attempt_back_merge(struct request *req,
+enum bio_merge_status bio_attempt_back_merge(struct request *req,
struct bio *bio, unsigned int nr_segs)
{
const blk_opf_t ff = bio_failfast(bio);
@@ -994,6 +989,9 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req,
blk_update_mixed_merge(req, bio, false);
+ if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+ blk_zone_write_plug_bio_merged(bio);
+
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
@@ -1009,6 +1007,14 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
{
const blk_opf_t ff = bio_failfast(bio);
+ /*
+ * A front merge for writes to sequential zones of a zoned block device
+ * can happen only if the user submitted writes out of order. Do not
+ * merge such write to let it fail.
+ */
+ if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+ return BIO_MERGE_FAILED;
+
if (!ll_front_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
@@ -1107,10 +1113,9 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
- struct blk_plug *plug;
+ struct blk_plug *plug = current->plug;
struct request *rq;
- plug = blk_mq_plug(bio);
if (!plug || rq_list_empty(plug->mq_list))
return false;
diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c
deleted file mode 100644
index a77b099c34b7a6..00000000000000
--- a/block/blk-mq-debugfs-zoned.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2017 Western Digital Corporation or its affiliates.
- */
-
-#include <linux/blkdev.h>
-#include "blk-mq-debugfs.h"
-
-int queue_zone_wlock_show(void *data, struct seq_file *m)
-{
- struct request_queue *q = data;
- unsigned int i;
-
- if (!q->disk->seq_zones_wlock)
- return 0;
-
- for (i = 0; i < q->disk->nr_zones; i++)
- if (test_bit(i, q->disk->seq_zones_wlock))
- seq_printf(m, "%u\n", i);
-
- return 0;
-}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 94668e72ab09bf..770c0c2b72faaa 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -160,7 +160,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
{ "pm_only", 0600, queue_pm_only_show, NULL },
{ "state", 0600, queue_state_show, queue_state_write },
- { "zone_wlock", 0400, queue_zone_wlock_show, NULL },
+ { "zone_wplugs", 0400, queue_zone_wplugs_show, NULL },
{ },
};
@@ -256,7 +256,6 @@ static const char *const rqf_name[] = {
RQF_NAME(HASHED),
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
- RQF_NAME(ZONE_WRITE_LOCKED),
RQF_NAME(TIMED_OUT),
RQF_NAME(RESV),
};
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 9c7d4b6117d41e..c80e453e301453 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -83,10 +83,10 @@ static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
}
#endif
-#ifdef CONFIG_BLK_DEBUG_FS_ZONED
-int queue_zone_wlock_show(void *data, struct seq_file *m);
+#if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS)
+int queue_zone_wplugs_show(void *data, struct seq_file *m);
#else
-static inline int queue_zone_wlock_show(void *data, struct seq_file *m)
+static inline int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
return 0;
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 32afb87efbd0ef..9f677ea85a52df 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -28,6 +28,7 @@
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>
+#include <linux/sched/isolation.h>
#include <trace/events/block.h>
@@ -690,6 +691,8 @@ static void blk_mq_finish_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ blk_zone_finish_request(rq);
+
if (rq->rq_flags & RQF_USE_SCHED) {
q->elevator->type->ops.finish_request(rq);
/*
@@ -761,31 +764,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
}
EXPORT_SYMBOL(blk_dump_rq_flags);
-static void req_bio_endio(struct request *rq, struct bio *bio,
- unsigned int nbytes, blk_status_t error)
-{
- if (unlikely(error)) {
- bio->bi_status = error;
- } else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
- /*
- * Partial zone append completions cannot be supported as the
- * BIO fragments may end up not being written sequentially.
- */
- if (bio->bi_iter.bi_size != nbytes)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_iter.bi_sector = rq->__sector;
- }
-
- bio_advance(bio, nbytes);
-
- if (unlikely(rq->rq_flags & RQF_QUIET))
- bio_set_flag(bio, BIO_QUIET);
- /* don't actually finish bio if it's part of flush sequence */
- if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
- bio_endio(bio);
-}
-
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (req->part && blk_do_io_stat(req)) {
@@ -845,8 +823,7 @@ static void blk_complete_request(struct request *req)
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- if (req_op(req) == REQ_OP_ZONE_APPEND)
- bio->bi_iter.bi_sector = req->__sector;
+ blk_zone_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
@@ -889,6 +866,8 @@ static void blk_complete_request(struct request *req)
bool blk_update_request(struct request *req, blk_status_t error,
unsigned int nr_bytes)
{
+ bool is_flush = req->rq_flags & RQF_FLUSH_SEQ;
+ bool quiet = req->rq_flags & RQF_QUIET;
int total_bytes;
trace_block_rq_complete(req, error, nr_bytes);
@@ -909,9 +888,8 @@ bool blk_update_request(struct request *req, blk_status_t error,
if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
__blk_crypto_rq_put_keyslot(req);
- if (unlikely(error && !blk_rq_is_passthrough(req) &&
- !(req->rq_flags & RQF_QUIET)) &&
- !test_bit(GD_DEAD, &req->q->disk->state)) {
+ if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) &&
+ !test_bit(GD_DEAD, &req->q->disk->state)) {
blk_print_req_error(req, error);
trace_block_rq_error(req, error, nr_bytes);
}
@@ -923,12 +901,33 @@ bool blk_update_request(struct request *req, blk_status_t error,
struct bio *bio = req->bio;
unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
- if (bio_bytes == bio->bi_iter.bi_size)
+ if (unlikely(error))
+ bio->bi_status = error;
+
+ if (bio_bytes == bio->bi_iter.bi_size) {
req->bio = bio->bi_next;
+ } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) {
+ /*
+ * Partial zone append completions cannot be supported
+ * as the BIO fragments may end up not being written
+ * sequentially.
+ */
+ bio->bi_status = BLK_STS_IOERR;
+ }
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- req_bio_endio(req, bio, bio_bytes, error);
+ if (unlikely(quiet))
+ bio_set_flag(bio, BIO_QUIET);
+
+ bio_advance(bio, bio_bytes);
+
+ /* Don't actually finish bio if it's part of flush sequence */
+ if (!bio->bi_iter.bi_size) {
+ blk_zone_update_request_bio(req, bio);
+ if (!is_flush)
+ bio_endio(bio);
+ }
total_bytes += bio_bytes;
nr_bytes -= bio_bytes;
@@ -1330,11 +1329,6 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head)
blk_account_io_start(rq);
- /*
- * As plugging can be enabled for passthrough requests on a zoned
- * device, directly accessing the plug instead of using blk_mq_plug()
- * should not have any consequences.
- */
if (current->plug && !at_head) {
blk_add_rq_to_plug(current->plug, rq);
return;
@@ -1921,19 +1915,6 @@ static void blk_mq_handle_dev_resource(struct request *rq,
__blk_mq_requeue_request(rq);
}
-static void blk_mq_handle_zone_resource(struct request *rq,
- struct list_head *zone_list)
-{
- /*
- * If we end up here it is because we cannot dispatch a request to a
- * specific zone due to LLD level zone-write locking or other zone
- * related resource not being available. In this case, set the request
- * aside in zone_list for retrying it later.
- */
- list_add(&rq->queuelist, zone_list);
- __blk_mq_requeue_request(rq);
-}
-
enum prep_dispatch {
PREP_DISPATCH_OK,
PREP_DISPATCH_NO_TAG,
@@ -2019,7 +2000,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
struct request *rq;
int queued;
blk_status_t ret = BLK_STS_OK;
- LIST_HEAD(zone_list);
bool needs_resource = false;
if (list_empty(list))
@@ -2061,23 +2041,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
case BLK_STS_DEV_RESOURCE:
blk_mq_handle_dev_resource(rq, list);
goto out;
- case BLK_STS_ZONE_RESOURCE:
- /*
- * Move the request to zone_list and keep going through
- * the dispatch list to find more requests the drive can
- * accept.
- */
- blk_mq_handle_zone_resource(rq, &zone_list);
- needs_resource = true;
- break;
default:
blk_mq_end_request(rq, ret);
}
} while (!list_empty(list));
out:
- if (!list_empty(&zone_list))
- list_splice_tail_init(&zone_list, list);
-
/* If we didn't flush the entire list, we could have told the driver
* there was more coming, but that turned out to be a lie.
*/
@@ -2164,6 +2132,15 @@ static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
}
/*
+ * ->next_cpu is always calculated from hctx->cpumask, so simply use
+ * it for speeding up the check
+ */
+static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx)
+{
+ return hctx->next_cpu >= nr_cpu_ids;
+}
+
+/*
* It'd be great if the workqueue API had a way to pass
* in a mask and had some smarts for more clever placement.
* For now we just round-robin here, switching for every
@@ -2174,7 +2151,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
bool tried = false;
int next_cpu = hctx->next_cpu;
- if (hctx->queue->nr_hw_queues == 1)
+ /* Switch to unbound if no allowable CPUs in this hctx */
+ if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx))
return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) {
@@ -2948,22 +2926,37 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
void blk_mq_submit_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- struct blk_plug *plug = blk_mq_plug(bio);
+ struct blk_plug *plug = current->plug;
const int is_sync = op_is_sync(bio->bi_opf);
struct blk_mq_hw_ctx *hctx;
unsigned int nr_segs = 1;
struct request *rq;
blk_status_t ret;
+ /*
+ * If the plug has a cached request for this queue, try to use it.
+ */
+ rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
+
+ /*
+ * A BIO that was released from a zone write plug has already been
+ * through the preparation in this function, already holds a reference
+ * on the queue usage counter, and is the only write BIO in-flight for
+ * the target zone. Go straight to preparing a request for it.
+ */
+ if (bio_zone_write_plugging(bio)) {
+ nr_segs = bio->__bi_nr_segments;
+ if (rq)
+ blk_queue_exit(q);
+ goto new_request;
+ }
+
bio = blk_queue_bounce(bio, q);
/*
- * If the plug has a cached request for this queue, try use it.
- *
* The cached request already holds a q_usage_counter reference and we
* don't have to acquire a new one if we use it.
*/
- rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
if (!rq) {
if (unlikely(bio_queue_enter(bio)))
return;
@@ -2980,6 +2973,10 @@ void blk_mq_submit_bio(struct bio *bio)
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
+ if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
+ goto queue_exit;
+
+new_request:
if (!rq) {
rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
if (unlikely(!rq))
@@ -3002,6 +2999,9 @@ void blk_mq_submit_bio(struct bio *bio)
return;
}
+ if (bio_zone_write_plugging(bio))
+ blk_zone_write_plug_init_request(rq);
+
if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
return;
@@ -3483,14 +3483,30 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
return data.has_rq;
}
-static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
- struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx,
+ unsigned int this_cpu)
{
- if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu)
- return false;
- if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
- return false;
- return true;
+ enum hctx_type type = hctx->type;
+ int cpu;
+
+ /*
+ * hctx->cpumask has to rule out isolated CPUs, but userspace still
+ * might submit IOs on these isolated CPUs, so use the queue map to
+ * check if all CPUs mapped to this hctx are offline
+ */
+ for_each_online_cpu(cpu) {
+ struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue,
+ type, cpu);
+
+ if (h != hctx)
+ continue;
+
+ /* this hctx has at least one online CPU */
+ if (this_cpu != cpu)
+ return true;
+ }
+
+ return false;
}
static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
@@ -3498,8 +3514,7 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
struct blk_mq_hw_ctx, cpuhp_online);
- if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
- !blk_mq_last_cpu_in_hctx(cpu, hctx))
+ if (blk_mq_hctx_has_online_cpu(hctx, cpu))
return 0;
/*
@@ -3907,6 +3922,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
}
queue_for_each_hw_ctx(q, hctx, i) {
+ int cpu;
+
/*
* If no software queues are mapped to this hardware queue,
* disable it and free the request entries.
@@ -3934,6 +3951,15 @@ static void blk_mq_map_swqueue(struct request_queue *q)
sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
/*
+ * Rule out isolated CPUs from hctx->cpumask to avoid
+ * running block kworker on isolated CPUs
+ */
+ for_each_cpu(cpu, hctx->cpumask) {
+ if (cpu_is_isolated(cpu))
+ cpumask_clear_cpu(cpu, hctx->cpumask);
+ }
+
+ /*
* Initialize batch roundrobin counts
*/
hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index f75a9ecfebde1b..260beea8e332ce 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -365,37 +365,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
qmap->mq_map[cpu] = 0;
}
-/*
- * blk_mq_plug() - Get caller context plug
- * @bio : the bio being submitted by the caller context
- *
- * Plugging, by design, may delay the insertion of BIOs into the elevator in
- * order to increase BIO merging opportunities. This however can cause BIO
- * insertion order to change from the order in which submit_bio() is being
- * executed in the case of multiple contexts concurrently issuing BIOs to a
- * device, even if these context are synchronized to tightly control BIO issuing
- * order. While this is not a problem with regular block devices, this ordering
- * change can cause write BIO failures with zoned block devices as these
- * require sequential write patterns to zones. Prevent this from happening by
- * ignoring the plug state of a BIO issuing context if it is for a zoned block
- * device and the BIO to plug is a write operation.
- *
- * Return current->plug if the bio can be plugged and NULL otherwise
- */
-static inline struct blk_plug *blk_mq_plug( struct bio *bio)
-{
- /* Zoned block device write operation case: do not plug the BIO */
- if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio)))
- return NULL;
-
- /*
- * For regular block devices or read operations, use the context plug
- * which may be NULL if blk_start_plug() was not executed.
- */
- return current->plug;
-}
-
/* Free all requests on the list */
static inline void blk_mq_free_requests(struct list_head *list)
{
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d2731843f2fccb..8e1d7ed52fef36 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -408,24 +408,32 @@ EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
* blk_queue_max_zone_append_sectors - set max sectors for a single zone append
* @q: the request queue for the device
* @max_zone_append_sectors: maximum number of sectors to write per command
+ *
+ * Sets the maximum number of sectors allowed for zone append commands. If
+ * Specifying 0 for @max_zone_append_sectors indicates that the queue does
+ * not natively support zone append operations and that the block layer must
+ * emulate these operations using regular writes.
**/
void blk_queue_max_zone_append_sectors(struct request_queue *q,
unsigned int max_zone_append_sectors)
{
- unsigned int max_sectors;
+ unsigned int max_sectors = 0;
if (WARN_ON(!blk_queue_is_zoned(q)))
return;
- max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
- max_sectors = min(q->limits.chunk_sectors, max_sectors);
+ if (max_zone_append_sectors) {
+ max_sectors = min(q->limits.max_hw_sectors,
+ max_zone_append_sectors);
+ max_sectors = min(q->limits.chunk_sectors, max_sectors);
- /*
- * Signal eventual driver bugs resulting in the max_zone_append sectors limit
- * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set,
- * or the max_hw_sectors limit not set.
- */
- WARN_ON(!max_sectors);
+ /*
+ * Signal eventual driver bugs resulting in the max_zone_append
+ * sectors limit being 0 due to the chunk_sectors limit (zone
+ * size) not set or the max_hw_sectors limit not set.
+ */
+ WARN_ON_ONCE(!max_sectors);
+ }
q->limits.max_zone_append_sectors = max_sectors;
}
@@ -752,8 +760,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
- t->max_zone_append_sectors = min(t->max_zone_append_sectors,
- b->max_zone_append_sectors);
+ t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
+ queue_limits_max_zone_append_sectors(b));
t->bounce = max(t->bounce, b->bounce);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
@@ -1041,22 +1049,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
/**
- * blk_queue_required_elevator_features - Set a queue required elevator features
- * @q: the request queue for the target device
- * @features: Required elevator features OR'ed together
- *
- * Tell the block layer that for the device controlled through @q, only the
- * only elevators that can be used are those that implement at least the set of
- * features specified by @features.
- */
-void blk_queue_required_elevator_features(struct request_queue *q,
- unsigned int features)
-{
- q->required_elevator_features = features;
-}
-EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
-
-/**
* blk_queue_can_use_dma_map_merging - configure queue for merging segments.
* @q: the request queue for the device
* @dev: the device pointer for dma
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8c8f69d8ba48ee..e3ed5a921affb2 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -224,7 +224,7 @@ static ssize_t queue_zone_write_granularity_show(struct request_queue *q,
static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
{
- unsigned long long max_sectors = q->limits.max_zone_append_sectors;
+ unsigned long long max_sectors = queue_max_zone_append_sectors(q);
return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT);
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f4850a6f860bba..c515e1a96fad42 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1494,11 +1494,8 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
{
struct throtl_grp *tg = pd_to_tg(pd);
const char *dname = blkg_dev_name(pd->blkg);
- char bufs[4][21] = { "max", "max", "max", "max" };
u64 bps_dft;
unsigned int iops_dft;
- char idle_time[26] = "";
- char latency_time[26] = "";
if (!dname)
return 0;
@@ -1520,35 +1517,39 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
tg->latency_target_conf == DFL_LATENCY_TARGET)))
return 0;
- if (tg->bps_conf[READ][off] != U64_MAX)
- snprintf(bufs[0], sizeof(bufs[0]), "%llu",
- tg->bps_conf[READ][off]);
- if (tg->bps_conf[WRITE][off] != U64_MAX)
- snprintf(bufs[1], sizeof(bufs[1]), "%llu",
- tg->bps_conf[WRITE][off]);
- if (tg->iops_conf[READ][off] != UINT_MAX)
- snprintf(bufs[2], sizeof(bufs[2]), "%u",
- tg->iops_conf[READ][off]);
- if (tg->iops_conf[WRITE][off] != UINT_MAX)
- snprintf(bufs[3], sizeof(bufs[3]), "%u",
- tg->iops_conf[WRITE][off]);
+ seq_printf(sf, "%s", dname);
+ if (tg->bps_conf[READ][off] == U64_MAX)
+ seq_printf(sf, " rbps=max");
+ else
+ seq_printf(sf, " rbps=%llu", tg->bps_conf[READ][off]);
+
+ if (tg->bps_conf[WRITE][off] == U64_MAX)
+ seq_printf(sf, " wbps=max");
+ else
+ seq_printf(sf, " wbps=%llu", tg->bps_conf[WRITE][off]);
+
+ if (tg->iops_conf[READ][off] == UINT_MAX)
+ seq_printf(sf, " riops=max");
+ else
+ seq_printf(sf, " riops=%u", tg->iops_conf[READ][off]);
+
+ if (tg->iops_conf[WRITE][off] == UINT_MAX)
+ seq_printf(sf, " wiops=max");
+ else
+ seq_printf(sf, " wiops=%u", tg->iops_conf[WRITE][off]);
+
if (off == LIMIT_LOW) {
if (tg->idletime_threshold_conf == ULONG_MAX)
- strcpy(idle_time, " idle=max");
+ seq_printf(sf, " idle=max");
else
- snprintf(idle_time, sizeof(idle_time), " idle=%lu",
- tg->idletime_threshold_conf);
+ seq_printf(sf, " idle=%lu", tg->idletime_threshold_conf);
if (tg->latency_target_conf == ULONG_MAX)
- strcpy(latency_time, " latency=max");
+ seq_printf(sf, " latency=max");
else
- snprintf(latency_time, sizeof(latency_time),
- " latency=%lu", tg->latency_target_conf);
+ seq_printf(sf, " latency=%lu", tg->latency_target_conf);
}
-
- seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
- dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
- latency_time);
+ seq_printf(sf, "\n");
return 0;
}
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index b008bcd4889c45..57d367ada1f237 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -7,6 +7,7 @@
*
* Copyright (c) 2016, Damien Le Moal
* Copyright (c) 2016, Western Digital
+ * Copyright (c) 2024, Western Digital Corporation or its affiliates.
*/
#include <linux/kernel.h>
@@ -16,8 +17,13 @@
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/mempool.h>
#include "blk.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-debugfs.h"
#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
@@ -32,6 +38,64 @@ static const char *const zone_cond_name[] = {
};
#undef ZONE_COND_NAME
+/*
+ * Per-zone write plug.
+ * @node: hlist_node structure for managing the plug using a hash table.
+ * @link: To list the plug in the zone write plug error list of the disk.
+ * @ref: Zone write plug reference counter. A zone write plug reference is
+ * always at least 1 when the plug is hashed in the disk plug hash table.
+ * The reference is incremented whenever a new BIO needing plugging is
+ * submitted and when a function needs to manipulate a plug. The
+ * reference count is decremented whenever a plugged BIO completes and
+ * when a function that referenced the plug returns. The initial
+ * reference is dropped whenever the zone of the zone write plug is reset,
+ * finished and when the zone becomes full (last write BIO to the zone
+ * completes).
+ * @lock: Spinlock to atomically manipulate the plug.
+ * @flags: Flags indicating the plug state.
+ * @zone_no: The number of the zone the plug is managing.
+ * @wp_offset: The zone write pointer location relative to the start of the zone
+ * as a number of 512B sectors.
+ * @bio_list: The list of BIOs that are currently plugged.
+ * @bio_work: Work struct to handle issuing of plugged BIOs
+ * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
+ * @disk: The gendisk the plug belongs to.
+ */
+struct blk_zone_wplug {
+ struct hlist_node node;
+ struct list_head link;
+ atomic_t ref;
+ spinlock_t lock;
+ unsigned int flags;
+ unsigned int zone_no;
+ unsigned int wp_offset;
+ struct bio_list bio_list;
+ struct work_struct bio_work;
+ struct rcu_head rcu_head;
+ struct gendisk *disk;
+};
+
+/*
+ * Zone write plug flags bits:
+ * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
+ * that is, that write BIOs are being throttled due to a write BIO already
+ * being executed or the zone write plug bio list is not empty.
+ * - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be
+ * recovered with a report zone to update the zone write pointer offset.
+ * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
+ * from the disk hash table and that the initial reference to the zone
+ * write plug set when the plug was first added to the hash table has been
+ * dropped. This flag is set when a zone is reset, finished or become full,
+ * to prevent new references to the zone write plug to be taken for
+ * newly incoming BIOs. A zone write plug flagged with this flag will be
+ * freed once all remaining references from BIOs or functions are dropped.
+ */
+#define BLK_ZONE_WPLUG_PLUGGED (1U << 0)
+#define BLK_ZONE_WPLUG_ERROR (1U << 1)
+#define BLK_ZONE_WPLUG_UNHASHED (1U << 2)
+
+#define BLK_ZONE_WPLUG_BUSY (BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR)
+
/**
* blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
* @zone_cond: BLK_ZONE_COND_XXX.
@@ -51,52 +115,6 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
-/*
- * Return true if a request is a write requests that needs zone write locking.
- */
-bool blk_req_needs_zone_write_lock(struct request *rq)
-{
- if (!rq->q->disk->seq_zones_wlock)
- return false;
-
- return blk_rq_is_seq_zoned_write(rq);
-}
-EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
-
-bool blk_req_zone_write_trylock(struct request *rq)
-{
- unsigned int zno = blk_rq_zone_no(rq);
-
- if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock))
- return false;
-
- WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
- rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
-
- return true;
-}
-EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);
-
-void __blk_req_zone_write_lock(struct request *rq)
-{
- if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
- rq->q->disk->seq_zones_wlock)))
- return;
-
- WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
- rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
-}
-EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
-
-void __blk_req_zone_write_unlock(struct request *rq)
-{
- rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
- if (rq->q->disk->seq_zones_wlock)
- WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
- rq->q->disk->seq_zones_wlock));
-}
-EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
-
/**
* bdev_nr_zones - Get number of zones
* @bdev: Target device
@@ -425,23 +443,1288 @@ fail:
return ret;
}
-void disk_free_zone_bitmaps(struct gendisk *disk)
+static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector)
+{
+ if (!disk->conv_zones_bitmap)
+ return false;
+ return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
+}
+
+static bool disk_insert_zone_wplug(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ struct blk_zone_wplug *zwplg;
+ unsigned long flags;
+ unsigned int idx =
+ hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);
+
+ /*
+ * Add the new zone write plug to the hash table, but carefully as we
+ * are racing with other submission context, so we may already have a
+ * zone write plug for the same zone.
+ */
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
+ if (zwplg->zone_no == zwplug->zone_no) {
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+ return false;
+ }
+ }
+ hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+ return true;
+}
+
+static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
+ sector_t sector)
+{
+ unsigned int zno = disk_zone_no(disk, sector);
+ unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
+ struct blk_zone_wplug *zwplug;
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
+ if (zwplug->zone_no == zno &&
+ atomic_inc_not_zero(&zwplug->ref)) {
+ rcu_read_unlock();
+ return zwplug;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
+ struct blk_zone_wplug *zwplug =
+ container_of(rcu_head, struct blk_zone_wplug, rcu_head);
+
+ mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
+}
+
+static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
+{
+ if (atomic_dec_and_test(&zwplug->ref)) {
+ WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
+ WARN_ON_ONCE(!list_empty(&zwplug->link));
+ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));
+
+ call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
+ }
+}
+
+static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ /* If the zone write plug was already removed, we are done. */
+ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
+ return false;
+
+ /* If the zone write plug is still busy, it cannot be removed. */
+ if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
+ return false;
+
+ /*
+ * Completions of BIOs with blk_zone_write_plug_bio_endio() may
+ * happen after handling a request completion with
+ * blk_zone_write_plug_finish_request() (e.g. with split BIOs
+ * that are chained). In such case, disk_zone_wplug_unplug_bio()
+ * should not attempt to remove the zone write plug until all BIO
+ * completions are seen. Check by looking at the zone write plug
+ * reference count, which is 2 when the plug is unused (one reference
+ * taken when the plug was allocated and another reference taken by the
+ * caller context).
+ */
+ if (atomic_read(&zwplug->ref) > 2)
+ return false;
+
+ /* We can remove zone write plugs for zones that are empty or full. */
+ return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity;
+}
+
+static void disk_remove_zone_wplug(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ unsigned long flags;
+
+ /* If the zone write plug was already removed, we have nothing to do. */
+ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
+ return;
+
+ /*
+ * Mark the zone write plug as unhashed and drop the extra reference we
+ * took when the plug was inserted in the hash table.
+ */
+ zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ hlist_del_init_rcu(&zwplug->node);
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+ disk_put_zone_wplug(zwplug);
+}
+
+static void blk_zone_wplug_bio_work(struct work_struct *work);
+
+/*
+ * Get a reference on the write plug for the zone containing @sector.
+ * If the plug does not exist, it is allocated and hashed.
+ * Return a pointer to the zone write plug with the plug spinlock held.
+ */
+static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
+ sector_t sector, gfp_t gfp_mask,
+ unsigned long *flags)
+{
+ unsigned int zno = disk_zone_no(disk, sector);
+ struct blk_zone_wplug *zwplug;
+
+again:
+ zwplug = disk_get_zone_wplug(disk, sector);
+ if (zwplug) {
+ /*
+ * Check that a BIO completion or a zone reset or finish
+ * operation has not already removed the zone write plug from
+ * the hash table and dropped its reference count. In such case,
+ * we need to get a new plug so start over from the beginning.
+ */
+ spin_lock_irqsave(&zwplug->lock, *flags);
+ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
+ spin_unlock_irqrestore(&zwplug->lock, *flags);
+ disk_put_zone_wplug(zwplug);
+ goto again;
+ }
+ return zwplug;
+ }
+
+ /*
+ * Allocate and initialize a zone write plug with an extra reference
+ * so that it is not freed when the zone write plug becomes idle without
+ * the zone being full.
+ */
+ zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
+ if (!zwplug)
+ return NULL;
+
+ INIT_HLIST_NODE(&zwplug->node);
+ INIT_LIST_HEAD(&zwplug->link);
+ atomic_set(&zwplug->ref, 2);
+ spin_lock_init(&zwplug->lock);
+ zwplug->flags = 0;
+ zwplug->zone_no = zno;
+ zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
+ bio_list_init(&zwplug->bio_list);
+ INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
+ zwplug->disk = disk;
+
+ spin_lock_irqsave(&zwplug->lock, *flags);
+
+ /*
+ * Insert the new zone write plug in the hash table. This can fail only
+ * if another context already inserted a plug. Retry from the beginning
+ * in such case.
+ */
+ if (!disk_insert_zone_wplug(disk, zwplug)) {
+ spin_unlock_irqrestore(&zwplug->lock, *flags);
+ mempool_free(zwplug, disk->zone_wplugs_pool);
+ goto again;
+ }
+
+ return zwplug;
+}
+
+static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
+ struct bio *bio)
+{
+ struct request_queue *q = zwplug->disk->queue;
+
+ bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+ bio_io_error(bio);
+ disk_put_zone_wplug(zwplug);
+ blk_queue_exit(q);
+}
+
+/*
+ * Abort (fail) all plugged BIOs of a zone write plug.
+ */
+static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
+{
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&zwplug->bio_list)))
+ blk_zone_wplug_bio_io_error(zwplug, bio);
+}
+
+/*
+ * Abort (fail) all plugged BIOs of a zone write plug that are not aligned
+ * with the assumed write pointer location of the zone when the BIO will
+ * be unplugged.
+ */
+static void disk_zone_wplug_abort_unaligned(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ unsigned int zone_capacity = disk->zone_capacity;
+ unsigned int wp_offset = zwplug->wp_offset;
+ struct bio_list bl = BIO_EMPTY_LIST;
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&zwplug->bio_list))) {
+ if (wp_offset >= zone_capacity ||
+ (bio_op(bio) != REQ_OP_ZONE_APPEND &&
+ bio_offset_from_zone_start(bio) != wp_offset)) {
+ blk_zone_wplug_bio_io_error(zwplug, bio);
+ continue;
+ }
+
+ wp_offset += bio_sectors(bio);
+ bio_list_add(&bl, bio);
+ }
+
+ bio_list_merge(&zwplug->bio_list, &bl);
+}
+
+static inline void disk_zone_wplug_set_error(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ unsigned long flags;
+
+ if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
+ return;
+
+ /*
+ * At this point, we already have a reference on the zone write plug.
+ * However, since we are going to add the plug to the disk zone write
+ * plugs work list, increase its reference count. This reference will
+ * be dropped in disk_zone_wplugs_work() once the error state is
+ * handled, or in disk_zone_wplug_clear_error() if the zone is reset or
+ * finished.
+ */
+ zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
+ atomic_inc(&zwplug->ref);
+
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list);
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+}
+
+static inline void disk_zone_wplug_clear_error(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ unsigned long flags;
+
+ if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
+ return;
+
+ /*
+ * We are racing with the error handling work which drops the reference
+ * on the zone write plug after handling the error state. So remove the
+ * plug from the error list and drop its reference count only if the
+ * error handling has not yet started, that is, if the zone write plug
+ * is still listed.
+ */
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ if (!list_empty(&zwplug->link)) {
+ list_del_init(&zwplug->link);
+ zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
+ disk_put_zone_wplug(zwplug);
+ }
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+}
+
+/*
+ * Set a zone write plug write pointer offset to either 0 (zone reset case)
+ * or to the zone size (zone finish case). This aborts all plugged BIOs, which
+ * is fine to do as doing a zone reset or zone finish while writes are in-flight
+ * is a mistake from the user which will most likely cause all plugged BIOs to
+ * fail anyway.
+ */
+static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug,
+ unsigned int wp_offset)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ /*
+ * Make sure that a BIO completion or another zone reset or finish
+ * operation has not already removed the plug from the hash table.
+ */
+ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ return;
+ }
+
+ /* Update the zone write pointer and abort all plugged BIOs. */
+ zwplug->wp_offset = wp_offset;
+ disk_zone_wplug_abort(zwplug);
+
+ /*
+ * Updating the write pointer offset puts back the zone
+ * in a good state. So clear the error flag and decrement the
+ * error count if we were in error state.
+ */
+ disk_zone_wplug_clear_error(disk, zwplug);
+
+ /*
+ * The zone write plug now has no BIO plugged: remove it from the
+ * hash table so that it cannot be seen. The plug will be freed
+ * when the last reference is dropped.
+ */
+ if (disk_should_remove_zone_wplug(disk, zwplug))
+ disk_remove_zone_wplug(disk, zwplug);
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
+ unsigned int wp_offset)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ sector_t sector = bio->bi_iter.bi_sector;
+ struct blk_zone_wplug *zwplug;
+
+ /* Conventional zones cannot be reset nor finished. */
+ if (disk_zone_is_conv(disk, sector)) {
+ bio_io_error(bio);
+ return true;
+ }
+
+ /*
+ * If we have a zone write plug, set its write pointer offset to 0
+ * (reset case) or to the zone size (finish case). This will abort all
+ * BIOs plugged for the target zone. It is fine as resetting or
+ * finishing zones while writes are still in-flight will result in the
+ * writes failing anyway.
+ */
+ zwplug = disk_get_zone_wplug(disk, sector);
+ if (zwplug) {
+ disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
+ disk_put_zone_wplug(zwplug);
+ }
+
+ return false;
+}
+
+static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ struct blk_zone_wplug *zwplug;
+ sector_t sector;
+
+ /*
+ * Set the write pointer offset of all zone write plugs to 0. This will
+ * abort all plugged BIOs. It is fine as resetting zones while writes
+ * are still in-flight will result in the writes failing anyway.
+ */
+ for (sector = 0; sector < get_capacity(disk);
+ sector += disk->queue->limits.chunk_sectors) {
+ zwplug = disk_get_zone_wplug(disk, sector);
+ if (zwplug) {
+ disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
+ disk_put_zone_wplug(zwplug);
+ }
+ }
+
+ return false;
+}
+
+static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
+ struct bio *bio, unsigned int nr_segs)
+{
+ /*
+ * Grab an extra reference on the BIO request queue usage counter.
+ * This reference will be reused to submit a request for the BIO for
+ * blk-mq devices and dropped when the BIO is failed and after
+ * it is issued in the case of BIO-based devices.
+ */
+ percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);
+
+ /*
+ * The BIO is being plugged and thus will have to wait for the on-going
+ * write and for all other writes already plugged. So polling makes
+ * no sense.
+ */
+ bio_clear_polled(bio);
+
+ /*
+ * Reuse the poll cookie field to store the number of segments when
+ * split to the hardware limits.
+ */
+ bio->__bi_nr_segments = nr_segs;
+
+ /*
+ * We always receive BIOs after they are split and ready to be issued.
+ * The block layer passes the parts of a split BIO in order, and the
+ * user must also issue write sequentially. So simply add the new BIO
+ * at the tail of the list to preserve the sequential write order.
+ */
+ bio_list_add(&zwplug->bio_list, bio);
+}
+
+/*
+ * Called from bio_attempt_back_merge() when a BIO was merged with a request.
+ */
+void blk_zone_write_plug_bio_merged(struct bio *bio)
+{
+ struct blk_zone_wplug *zwplug;
+ unsigned long flags;
+
+ /*
+ * If the BIO was already plugged, then we were called through
+ * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
+ * For this case, we already hold a reference on the zone write plug for
+ * the BIO and blk_zone_write_plug_init_request() will handle the
+ * zone write pointer offset update.
+ */
+ if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
+ return;
+
+ bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+ /*
+ * Get a reference on the zone write plug of the target zone and advance
+ * the zone write pointer offset. Given that this is a merge, we already
+ * have at least one request and one BIO referencing the zone write
+ * plug. So this should not fail.
+ */
+ zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
+ bio->bi_iter.bi_sector);
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+ zwplug->wp_offset += bio_sectors(bio);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+/*
+ * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
+ * already went through zone write plugging (either a new BIO or one that was
+ * unplugged).
+ */
+void blk_zone_write_plug_init_request(struct request *req)
+{
+ sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
+ struct request_queue *q = req->q;
+ struct gendisk *disk = q->disk;
+ unsigned int zone_capacity = disk->zone_capacity;
+ struct blk_zone_wplug *zwplug =
+ disk_get_zone_wplug(disk, blk_rq_pos(req));
+ unsigned long flags;
+ struct bio *bio;
+
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ /*
+ * Indicate that completion of this request needs to be handled with
+ * blk_zone_write_plug_finish_request(), which will drop the reference
+ * on the zone write plug we took above on entry to this function.
+ */
+ req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;
+
+ if (blk_queue_nomerges(q))
+ return;
+
+ /*
+ * Walk through the list of plugged BIOs to check if they can be merged
+ * into the back of the request.
+ */
+ spin_lock_irqsave(&zwplug->lock, flags);
+ while (zwplug->wp_offset < zone_capacity) {
+ bio = bio_list_peek(&zwplug->bio_list);
+ if (!bio)
+ break;
+
+ if (bio->bi_iter.bi_sector != req_back_sector ||
+ !blk_rq_merge_ok(req, bio))
+ break;
+
+ WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
+ !bio->__bi_nr_segments);
+
+ bio_list_pop(&zwplug->bio_list);
+ if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
+ BIO_MERGE_OK) {
+ bio_list_add_head(&zwplug->bio_list, bio);
+ break;
+ }
+
+ /*
+ * Drop the extra reference on the queue usage we got when
+ * plugging the BIO and advance the write pointer offset.
+ */
+ blk_queue_exit(q);
+ zwplug->wp_offset += bio_sectors(bio);
+
+ req_back_sector += bio_sectors(bio);
+ }
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+/*
+ * Check and prepare a BIO for submission by incrementing the write pointer
+ * offset of its zone write plug and changing zone append operations into
+ * regular write when zone append emulation is needed.
+ */
+static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
+ struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+
+ /*
+ * Check that the user is not attempting to write to a full zone.
+ * We know such BIO will fail, and that would potentially overflow our
+ * write pointer offset beyond the end of the zone.
+ */
+ if (zwplug->wp_offset >= disk->zone_capacity)
+ goto err;
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ /*
+ * Use a regular write starting at the current write pointer.
+ * Similarly to native zone append operations, do not allow
+ * merging.
+ */
+ bio->bi_opf &= ~REQ_OP_MASK;
+ bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
+ bio->bi_iter.bi_sector += zwplug->wp_offset;
+
+ /*
+ * Remember that this BIO is in fact a zone append operation
+ * so that we can restore its operation code on completion.
+ */
+ bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
+ } else {
+ /*
+ * Check for non-sequential writes early because we avoid a
+ * whole lot of error handling trouble if we don't send it off
+ * to the driver.
+ */
+ if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
+ goto err;
+ }
+
+ /* Advance the zone write pointer offset. */
+ zwplug->wp_offset += bio_sectors(bio);
+
+ return true;
+
+err:
+ /* We detected an invalid write BIO: schedule error recovery. */
+ disk_zone_wplug_set_error(disk, zwplug);
+ kblockd_schedule_work(&disk->zone_wplugs_work);
+ return false;
+}
+
+static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ sector_t sector = bio->bi_iter.bi_sector;
+ struct blk_zone_wplug *zwplug;
+ gfp_t gfp_mask = GFP_NOIO;
+ unsigned long flags;
+
+ /*
+ * BIOs must be fully contained within a zone so that we use the correct
+ * zone write plug for the entire BIO. For blk-mq devices, the block
+ * layer should already have done any splitting required to ensure this
+ * and this BIO should thus not be straddling zone boundaries. For
+ * BIO-based devices, it is the responsibility of the driver to split
+ * the bio before submitting it.
+ */
+ if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
+ bio_io_error(bio);
+ return true;
+ }
+
+ /* Conventional zones do not need write plugging. */
+ if (disk_zone_is_conv(disk, sector)) {
+ /* Zone append to conventional zones is not allowed. */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ bio_io_error(bio);
+ return true;
+ }
+ return false;
+ }
+
+ if (bio->bi_opf & REQ_NOWAIT)
+ gfp_mask = GFP_NOWAIT;
+
+ zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
+ if (!zwplug) {
+ bio_io_error(bio);
+ return true;
+ }
+
+ /* Indicate that this BIO is being handled using zone write plugging. */
+ bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+ /*
+ * If the zone is already plugged or has a pending error, add the BIO
+ * to the plug BIO list. Otherwise, plug and let the BIO execute.
+ */
+ if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
+ goto plug;
+
+ /*
+ * If an error is detected when preparing the BIO, add it to the BIO
+ * list so that error recovery can deal with it.
+ */
+ if (!blk_zone_wplug_prepare_bio(zwplug, bio))
+ goto plug;
+
+ zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ return false;
+
+plug:
+ zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
+ blk_zone_wplug_add_bio(zwplug, bio, nr_segs);
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ return true;
+}
+
+/**
+ * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
+ * @bio: The BIO being submitted
+ * @nr_segs: The number of physical segments of @bio
+ *
+ * Handle write, write zeroes and zone append operations requiring emulation
+ * using zone write plugging.
+ *
+ * Return true whenever @bio execution needs to be delayed through the zone
+ * write plug. Otherwise, return false to let the submission path process
+ * @bio normally.
+ */
+bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
+{
+ struct block_device *bdev = bio->bi_bdev;
+
+ if (!bdev->bd_disk->zone_wplugs_hash)
+ return false;
+
+ /*
+ * If the BIO already has the plugging flag set, then it was already
+ * handled through this path and this is a submission from the zone
+ * plug bio submit work.
+ */
+ if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
+ return false;
+
+ /*
+ * We do not need to do anything special for empty flush BIOs, e.g
+ * BIOs such as issued by blkdev_issue_flush(). The is because it is
+ * the responsibility of the user to first wait for the completion of
+ * write operations for flush to have any effect on the persistence of
+ * the written data.
+ */
+ if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
+ return false;
+
+ /*
+ * Regular writes and write zeroes need to be handled through the target
+ * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
+ * which may need to go through the flush machinery depending on the
+ * target device capabilities. Plugging such writes is fine as the flush
+ * machinery operates at the request level, below the plug, and
+ * completion of the flush sequence will go through the regular BIO
+ * completion, which will handle zone write plugging.
+ * Zone append operations for devices that requested emulation must
+ * also be plugged so that these BIOs can be changed into regular
+ * write BIOs.
+ * Zone reset, reset all and finish commands need special treatment
+ * to correctly track the write pointer offset of zones. These commands
+ * are not plugged as we do not need serialization with write
+ * operations. It is the responsibility of the user to not issue reset
+ * and finish commands when write operations are in flight.
+ */
+ switch (bio_op(bio)) {
+ case REQ_OP_ZONE_APPEND:
+ if (!bdev_emulates_zone_append(bdev))
+ return false;
+ fallthrough;
+ case REQ_OP_WRITE:
+ case REQ_OP_WRITE_ZEROES:
+ return blk_zone_wplug_handle_write(bio, nr_segs);
+ case REQ_OP_ZONE_RESET:
+ return blk_zone_wplug_handle_reset_or_finish(bio, 0);
+ case REQ_OP_ZONE_FINISH:
+ return blk_zone_wplug_handle_reset_or_finish(bio,
+ bdev_zone_sectors(bdev));
+ case REQ_OP_ZONE_RESET_ALL:
+ return blk_zone_wplug_handle_reset_all(bio);
+ default:
+ return false;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
+
+static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ /*
+ * Take a reference on the zone write plug and schedule the submission
+ * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
+ * reference we take here.
+ */
+ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
+ atomic_inc(&zwplug->ref);
+ queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
+}
+
+static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ /*
+ * If we had an error, schedule error recovery. The recovery work
+ * will restart submission of plugged BIOs.
+ */
+ if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) {
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ kblockd_schedule_work(&disk->zone_wplugs_work);
+ return;
+ }
+
+ /* Schedule submission of the next plugged BIO if we have one. */
+ if (!bio_list_empty(&zwplug->bio_list)) {
+ disk_zone_wplug_schedule_bio_work(disk, zwplug);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ return;
+ }
+
+ zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+
+ /*
+ * If the zone is full (it was fully written or finished, or empty
+ * (it was reset), remove its zone write plug from the hash table.
+ */
+ if (disk_should_remove_zone_wplug(disk, zwplug))
+ disk_remove_zone_wplug(disk, zwplug);
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+void blk_zone_write_plug_bio_endio(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ struct blk_zone_wplug *zwplug =
+ disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ /* Make sure we do not see this BIO again by clearing the plug flag. */
+ bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+ /*
+ * If this is a regular write emulating a zone append operation,
+ * restore the original operation code.
+ */
+ if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
+ bio->bi_opf &= ~REQ_OP_MASK;
+ bio->bi_opf |= REQ_OP_ZONE_APPEND;
+ }
+
+ /*
+ * If the BIO failed, mark the plug as having an error to trigger
+ * recovery.
+ */
+ if (bio->bi_status != BLK_STS_OK) {
+ spin_lock_irqsave(&zwplug->lock, flags);
+ disk_zone_wplug_set_error(disk, zwplug);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ }
+
+ /* Drop the reference we took when the BIO was issued. */
+ disk_put_zone_wplug(zwplug);
+
+ /*
+ * For BIO-based devices, blk_zone_write_plug_finish_request()
+ * is not called. So we need to schedule execution of the next
+ * plugged BIO here.
+ */
+ if (bio->bi_bdev->bd_has_submit_bio)
+ disk_zone_wplug_unplug_bio(disk, zwplug);
+
+ /* Drop the reference we took when entering this function. */
+ disk_put_zone_wplug(zwplug);
+}
+
+void blk_zone_write_plug_finish_request(struct request *req)
+{
+ struct gendisk *disk = req->q->disk;
+ struct blk_zone_wplug *zwplug;
+
+ zwplug = disk_get_zone_wplug(disk, req->__sector);
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;
+
+ /*
+ * Drop the reference we took when the request was initialized in
+ * blk_zone_write_plug_init_request().
+ */
+ disk_put_zone_wplug(zwplug);
+
+ disk_zone_wplug_unplug_bio(disk, zwplug);
+
+ /* Drop the reference we took when entering this function. */
+ disk_put_zone_wplug(zwplug);
+}
+
+static void blk_zone_wplug_bio_work(struct work_struct *work)
+{
+ struct blk_zone_wplug *zwplug =
+ container_of(work, struct blk_zone_wplug, bio_work);
+ struct block_device *bdev;
+ unsigned long flags;
+ struct bio *bio;
+
+ /*
+ * Submit the next plugged BIO. If we do not have any, clear
+ * the plugged flag.
+ */
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ bio = bio_list_pop(&zwplug->bio_list);
+ if (!bio) {
+ zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ goto put_zwplug;
+ }
+
+ if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
+ /* Error recovery will decide what to do with the BIO. */
+ bio_list_add_head(&zwplug->bio_list, bio);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ goto put_zwplug;
+ }
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ bdev = bio->bi_bdev;
+ submit_bio_noacct_nocheck(bio);
+
+ /*
+ * blk-mq devices will reuse the extra reference on the request queue
+ * usage counter we took when the BIO was plugged, but the submission
+ * path for BIO-based devices will not do that. So drop this extra
+ * reference here.
+ */
+ if (bdev->bd_has_submit_bio)
+ blk_queue_exit(bdev->bd_disk->queue);
+
+put_zwplug:
+ /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
+ disk_put_zone_wplug(zwplug);
+}
+
+static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
+{
+ switch (zone->cond) {
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ return zone->wp - zone->start;
+ case BLK_ZONE_COND_FULL:
+ return zone->len;
+ case BLK_ZONE_COND_EMPTY:
+ return 0;
+ case BLK_ZONE_COND_NOT_WP:
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ default:
+ /*
+ * Conventional, offline and read-only zones do not have a valid
+ * write pointer.
+ */
+ return UINT_MAX;
+ }
+}
+
+static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
+ unsigned int idx, void *data)
+{
+ struct blk_zone *zonep = data;
+
+ *zonep = *zone;
+ return 0;
+}
+
+static void disk_zone_wplug_handle_error(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ sector_t zone_start_sector =
+ bdev_zone_sectors(disk->part0) * zwplug->zone_no;
+ unsigned int noio_flag;
+ struct blk_zone zone;
+ unsigned long flags;
+ int ret;
+
+ /* Get the current zone information from the device. */
+ noio_flag = memalloc_noio_save();
+ ret = disk->fops->report_zones(disk, zone_start_sector, 1,
+ blk_zone_wplug_report_zone_cb, &zone);
+ memalloc_noio_restore(noio_flag);
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ /*
+ * A zone reset or finish may have cleared the error already. In such
+ * case, do nothing as the report zones may have seen the "old" write
+ * pointer value before the reset/finish operation completed.
+ */
+ if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
+ goto unlock;
+
+ zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
+
+ if (ret != 1) {
+ /*
+ * We failed to get the zone information, meaning that something
+ * is likely really wrong with the device. Abort all remaining
+ * plugged BIOs as otherwise we could endup waiting forever on
+ * plugged BIOs to complete if there is a queue freeze on-going.
+ */
+ disk_zone_wplug_abort(zwplug);
+ goto unplug;
+ }
+
+ /* Update the zone write pointer offset. */
+ zwplug->wp_offset = blk_zone_wp_offset(&zone);
+ disk_zone_wplug_abort_unaligned(disk, zwplug);
+
+ /* Restart BIO submission if we still have any BIO left. */
+ if (!bio_list_empty(&zwplug->bio_list)) {
+ disk_zone_wplug_schedule_bio_work(disk, zwplug);
+ goto unlock;
+ }
+
+unplug:
+ zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+ if (disk_should_remove_zone_wplug(disk, zwplug))
+ disk_remove_zone_wplug(disk, zwplug);
+
+unlock:
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+static void disk_zone_wplugs_work(struct work_struct *work)
+{
+ struct gendisk *disk =
+ container_of(work, struct gendisk, zone_wplugs_work);
+ struct blk_zone_wplug *zwplug;
+ unsigned long flags;
+
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+
+ while (!list_empty(&disk->zone_wplugs_err_list)) {
+ zwplug = list_first_entry(&disk->zone_wplugs_err_list,
+ struct blk_zone_wplug, link);
+ list_del_init(&zwplug->link);
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+ disk_zone_wplug_handle_error(disk, zwplug);
+ disk_put_zone_wplug(zwplug);
+
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ }
+
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+}
+
+static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
+{
+ return 1U << disk->zone_wplugs_hash_bits;
+}
+
+void disk_init_zone_resources(struct gendisk *disk)
+{
+ spin_lock_init(&disk->zone_wplugs_lock);
+ INIT_LIST_HEAD(&disk->zone_wplugs_err_list);
+ INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work);
+}
+
+/*
+ * For the size of a disk zone write plug hash table, use the size of the
+ * zone write plug mempool, which is the maximum of the disk open zones and
+ * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
+ * 9 bits. For a disk that has no limits, mempool size defaults to 128.
+ */
+#define BLK_ZONE_WPLUG_MAX_HASH_BITS 9
+#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128
+
+static int disk_alloc_zone_resources(struct gendisk *disk,
+ unsigned int pool_size)
+{
+ unsigned int i;
+
+ disk->zone_wplugs_hash_bits =
+ min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);
+
+ disk->zone_wplugs_hash =
+ kcalloc(disk_zone_wplugs_hash_size(disk),
+ sizeof(struct hlist_head), GFP_KERNEL);
+ if (!disk->zone_wplugs_hash)
+ return -ENOMEM;
+
+ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
+ INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);
+
+ disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
+ sizeof(struct blk_zone_wplug));
+ if (!disk->zone_wplugs_pool)
+ goto free_hash;
+
+ disk->zone_wplugs_wq =
+ alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
+ pool_size, disk->disk_name);
+ if (!disk->zone_wplugs_wq)
+ goto destroy_pool;
+
+ return 0;
+
+destroy_pool:
+ mempool_destroy(disk->zone_wplugs_pool);
+ disk->zone_wplugs_pool = NULL;
+free_hash:
+ kfree(disk->zone_wplugs_hash);
+ disk->zone_wplugs_hash = NULL;
+ disk->zone_wplugs_hash_bits = 0;
+ return -ENOMEM;
+}
+
+static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
+{
+ struct blk_zone_wplug *zwplug;
+ unsigned int i;
+
+ if (!disk->zone_wplugs_hash)
+ return;
+
+ /* Free all the zone write plugs we have. */
+ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
+ while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
+ zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
+ struct blk_zone_wplug, node);
+ atomic_inc(&zwplug->ref);
+ disk_remove_zone_wplug(disk, zwplug);
+ disk_put_zone_wplug(zwplug);
+ }
+ }
+
+ kfree(disk->zone_wplugs_hash);
+ disk->zone_wplugs_hash = NULL;
+ disk->zone_wplugs_hash_bits = 0;
+}
+
+void disk_free_zone_resources(struct gendisk *disk)
+{
+ cancel_work_sync(&disk->zone_wplugs_work);
+
+ if (disk->zone_wplugs_wq) {
+ destroy_workqueue(disk->zone_wplugs_wq);
+ disk->zone_wplugs_wq = NULL;
+ }
+
+ disk_destroy_zone_wplugs_hash_table(disk);
+
+ /*
+ * Wait for the zone write plugs to be RCU-freed before
+ * destorying the mempool.
+ */
+ rcu_barrier();
+
+ mempool_destroy(disk->zone_wplugs_pool);
+ disk->zone_wplugs_pool = NULL;
+
kfree(disk->conv_zones_bitmap);
disk->conv_zones_bitmap = NULL;
- kfree(disk->seq_zones_wlock);
- disk->seq_zones_wlock = NULL;
+ disk->zone_capacity = 0;
+ disk->nr_zones = 0;
+}
+
+static inline bool disk_need_zone_resources(struct gendisk *disk)
+{
+ /*
+ * All mq zoned devices need zone resources so that the block layer
+ * can automatically handle write BIO plugging. BIO-based device drivers
+ * (e.g. DM devices) are normally responsible for handling zone write
+ * ordering and do not need zone resources, unless the driver requires
+ * zone append emulation.
+ */
+ return queue_is_mq(disk->queue) ||
+ queue_emulates_zone_append(disk->queue);
+}
+
+static int disk_revalidate_zone_resources(struct gendisk *disk,
+ unsigned int nr_zones)
+{
+ struct queue_limits *lim = &disk->queue->limits;
+ unsigned int pool_size;
+
+ if (!disk_need_zone_resources(disk))
+ return 0;
+
+ /*
+ * If the device has no limit on the maximum number of open and active
+ * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
+ */
+ pool_size = max(lim->max_open_zones, lim->max_active_zones);
+ if (!pool_size)
+ pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);
+
+ if (!disk->zone_wplugs_hash)
+ return disk_alloc_zone_resources(disk, pool_size);
+
+ return 0;
}
struct blk_revalidate_zone_args {
struct gendisk *disk;
unsigned long *conv_zones_bitmap;
- unsigned long *seq_zones_wlock;
unsigned int nr_zones;
+ unsigned int zone_capacity;
sector_t sector;
};
/*
+ * Update the disk zone resources information and device queue limits.
+ * The disk queue is frozen when this is executed.
+ */
+static int disk_update_zone_resources(struct gendisk *disk,
+ struct blk_revalidate_zone_args *args)
+{
+ struct request_queue *q = disk->queue;
+ unsigned int nr_seq_zones, nr_conv_zones = 0;
+ unsigned int pool_size;
+ struct queue_limits lim;
+
+ disk->nr_zones = args->nr_zones;
+ disk->zone_capacity = args->zone_capacity;
+ swap(disk->conv_zones_bitmap, args->conv_zones_bitmap);
+ if (disk->conv_zones_bitmap)
+ nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap,
+ disk->nr_zones);
+ if (nr_conv_zones >= disk->nr_zones) {
+ pr_warn("%s: Invalid number of conventional zones %u / %u\n",
+ disk->disk_name, nr_conv_zones, disk->nr_zones);
+ return -ENODEV;
+ }
+
+ if (!disk->zone_wplugs_pool)
+ return 0;
+
+ /*
+ * If the device has no limit on the maximum number of open and active
+ * zones, set its max open zone limit to the mempool size to indicate
+ * to the user that there is a potential performance impact due to
+ * dynamic zone write plug allocation when simultaneously writing to
+ * more zones than the size of the mempool.
+ */
+ lim = queue_limits_start_update(q);
+
+ nr_seq_zones = disk->nr_zones - nr_conv_zones;
+ pool_size = max(lim.max_open_zones, lim.max_active_zones);
+ if (!pool_size)
+ pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);
+
+ mempool_resize(disk->zone_wplugs_pool, pool_size);
+
+ if (!lim.max_open_zones && !lim.max_active_zones) {
+ if (pool_size < nr_seq_zones)
+ lim.max_open_zones = pool_size;
+ else
+ lim.max_open_zones = 0;
+ }
+
+ return queue_limits_commit_update(q, &lim);
+}
+
+static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
+ struct blk_revalidate_zone_args *args)
+{
+ struct gendisk *disk = args->disk;
+ struct request_queue *q = disk->queue;
+
+ if (zone->capacity != zone->len) {
+ pr_warn("%s: Invalid conventional zone capacity\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+
+ if (!disk_need_zone_resources(disk))
+ return 0;
+
+ if (!args->conv_zones_bitmap) {
+ args->conv_zones_bitmap =
+ blk_alloc_zone_bitmap(q->node, args->nr_zones);
+ if (!args->conv_zones_bitmap)
+ return -ENOMEM;
+ }
+
+ set_bit(idx, args->conv_zones_bitmap);
+
+ return 0;
+}
+
+static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
+ struct blk_revalidate_zone_args *args)
+{
+ struct gendisk *disk = args->disk;
+ struct blk_zone_wplug *zwplug;
+ unsigned int wp_offset;
+ unsigned long flags;
+
+ /*
+ * Remember the capacity of the first sequential zone and check
+ * if it is constant for all zones.
+ */
+ if (!args->zone_capacity)
+ args->zone_capacity = zone->capacity;
+ if (zone->capacity != args->zone_capacity) {
+ pr_warn("%s: Invalid variable zone capacity\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+
+ /*
+ * We need to track the write pointer of all zones that are not
+ * empty nor full. So make sure we have a zone write plug for
+ * such zone if the device has a zone write plug hash table.
+ */
+ if (!disk->zone_wplugs_hash)
+ return 0;
+
+ wp_offset = blk_zone_wp_offset(zone);
+ if (!wp_offset || wp_offset >= zone->capacity)
+ return 0;
+
+ zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
+ if (!zwplug)
+ return -ENOMEM;
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ disk_put_zone_wplug(zwplug);
+
+ return 0;
+}
+
+/*
* Helper function to check the validity of zones of a zoned block device.
*/
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
@@ -449,9 +1732,9 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
{
struct blk_revalidate_zone_args *args = data;
struct gendisk *disk = args->disk;
- struct request_queue *q = disk->queue;
sector_t capacity = get_capacity(disk);
- sector_t zone_sectors = q->limits.chunk_sectors;
+ sector_t zone_sectors = disk->queue->limits.chunk_sectors;
+ int ret;
/* Check for bad zones and holes in the zone report */
if (zone->start != args->sector) {
@@ -482,66 +1765,57 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
return -ENODEV;
}
+ if (!zone->capacity || zone->capacity > zone->len) {
+ pr_warn("%s: Invalid zone capacity\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+
/* Check zone type */
switch (zone->type) {
case BLK_ZONE_TYPE_CONVENTIONAL:
- if (!args->conv_zones_bitmap) {
- args->conv_zones_bitmap =
- blk_alloc_zone_bitmap(q->node, args->nr_zones);
- if (!args->conv_zones_bitmap)
- return -ENOMEM;
- }
- set_bit(idx, args->conv_zones_bitmap);
+ ret = blk_revalidate_conv_zone(zone, idx, args);
break;
case BLK_ZONE_TYPE_SEQWRITE_REQ:
- if (!args->seq_zones_wlock) {
- args->seq_zones_wlock =
- blk_alloc_zone_bitmap(q->node, args->nr_zones);
- if (!args->seq_zones_wlock)
- return -ENOMEM;
- }
+ ret = blk_revalidate_seq_zone(zone, idx, args);
break;
case BLK_ZONE_TYPE_SEQWRITE_PREF:
default:
pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
disk->disk_name, (int)zone->type, zone->start);
- return -ENODEV;
+ ret = -ENODEV;
}
- args->sector += zone->len;
- return 0;
+ if (!ret)
+ args->sector += zone->len;
+
+ return ret;
}
/**
- * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
+ * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
* @disk: Target disk
- * @update_driver_data: Callback to update driver data on the frozen disk
*
- * Helper function for low-level device drivers to check and (re) allocate and
- * initialize a disk request queue zone bitmaps. This functions should normally
- * be called within the disk ->revalidate method for blk-mq based drivers.
+ * Helper function for low-level device drivers to check, (re) allocate and
+ * initialize resources used for managing zoned disks. This function should
+ * normally be called by blk-mq based drivers when a zoned gendisk is probed
+ * and when the zone configuration of the gendisk changes (e.g. after a format).
* Before calling this function, the device driver must already have set the
* device zone size (chunk_sector limit) and the max zone append limit.
- * For BIO based drivers, this function cannot be used. BIO based device drivers
- * only need to set disk->nr_zones so that the sysfs exposed value is correct.
- * If the @update_driver_data callback function is not NULL, the callback is
- * executed with the device request queue frozen after all zones have been
- * checked.
+ * BIO based drivers can also use this function as long as the device queue
+ * can be safely frozen.
*/
-int blk_revalidate_disk_zones(struct gendisk *disk,
- void (*update_driver_data)(struct gendisk *disk))
+int blk_revalidate_disk_zones(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
sector_t zone_sectors = q->limits.chunk_sectors;
sector_t capacity = get_capacity(disk);
struct blk_revalidate_zone_args args = { };
unsigned int noio_flag;
- int ret;
+ int ret = -ENOMEM;
if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
return -EIO;
- if (WARN_ON_ONCE(!queue_is_mq(q)))
- return -EIO;
if (!capacity)
return -ENODEV;
@@ -556,7 +1830,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
return -ENODEV;
}
- if (!q->limits.max_zone_append_sectors) {
+ if (!queue_max_zone_append_sectors(q)) {
pr_warn("%s: Invalid 0 maximum zone append limit\n",
disk->disk_name);
return -ENODEV;
@@ -569,6 +1843,11 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
args.disk = disk;
args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
noio_flag = memalloc_noio_save();
+ ret = disk_revalidate_zone_resources(disk, args.nr_zones);
+ if (ret) {
+ memalloc_noio_restore(noio_flag);
+ return ret;
+ }
ret = disk->fops->report_zones(disk, 0, UINT_MAX,
blk_revalidate_zone_cb, &args);
if (!ret) {
@@ -588,26 +1867,59 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
}
/*
- * Install the new bitmaps and update nr_zones only once the queue is
- * stopped and all I/Os are completed (i.e. a scheduler is not
- * referencing the bitmaps).
+ * Set the new disk zone parameters only once the queue is frozen and
+ * all I/Os are completed.
*/
blk_mq_freeze_queue(q);
- if (ret > 0) {
- disk->nr_zones = args.nr_zones;
- swap(disk->seq_zones_wlock, args.seq_zones_wlock);
- swap(disk->conv_zones_bitmap, args.conv_zones_bitmap);
- if (update_driver_data)
- update_driver_data(disk);
- ret = 0;
- } else {
+ if (ret > 0)
+ ret = disk_update_zone_resources(disk, &args);
+ else
pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
- disk_free_zone_bitmaps(disk);
- }
+ if (ret)
+ disk_free_zone_resources(disk);
blk_mq_unfreeze_queue(q);
- kfree(args.seq_zones_wlock);
kfree(args.conv_zones_bitmap);
+
return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
+
+#ifdef CONFIG_BLK_DEBUG_FS
+
+int queue_zone_wplugs_show(void *data, struct seq_file *m)
+{
+ struct request_queue *q = data;
+ struct gendisk *disk = q->disk;
+ struct blk_zone_wplug *zwplug;
+ unsigned int zwp_wp_offset, zwp_flags;
+ unsigned int zwp_zone_no, zwp_ref;
+ unsigned int zwp_bio_list_size, i;
+ unsigned long flags;
+
+ if (!disk->zone_wplugs_hash)
+ return 0;
+
+ rcu_read_lock();
+ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
+ hlist_for_each_entry_rcu(zwplug,
+ &disk->zone_wplugs_hash[i], node) {
+ spin_lock_irqsave(&zwplug->lock, flags);
+ zwp_zone_no = zwplug->zone_no;
+ zwp_flags = zwplug->flags;
+ zwp_ref = atomic_read(&zwplug->ref);
+ zwp_wp_offset = zwplug->wp_offset;
+ zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ seq_printf(m, "%u 0x%x %u %u %u\n",
+ zwp_zone_no, zwp_flags, zwp_ref,
+ zwp_wp_offset, zwp_bio_list_size);
+ }
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+#endif
diff --git a/block/blk.h b/block/blk.h
index d9f584984bc44b..ee4f782d149662 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -269,6 +269,14 @@ static inline void bio_integrity_free(struct bio *bio)
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
+enum bio_merge_status {
+ BIO_MERGE_OK,
+ BIO_MERGE_NONE,
+ BIO_MERGE_FAILED,
+};
+
+enum bio_merge_status bio_attempt_back_merge(struct request *req,
+ struct bio *bio, unsigned int nr_segs);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
@@ -407,13 +415,85 @@ static inline struct bio *blk_queue_bounce(struct bio *bio,
}
#ifdef CONFIG_BLK_DEV_ZONED
-void disk_free_zone_bitmaps(struct gendisk *disk);
+void disk_init_zone_resources(struct gendisk *disk);
+void disk_free_zone_resources(struct gendisk *disk);
+static inline bool bio_zone_write_plugging(struct bio *bio)
+{
+ return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
+}
+static inline bool bio_is_zone_append(struct bio *bio)
+{
+ return bio_op(bio) == REQ_OP_ZONE_APPEND ||
+ bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
+}
+void blk_zone_write_plug_bio_merged(struct bio *bio);
+void blk_zone_write_plug_init_request(struct request *rq);
+static inline void blk_zone_update_request_bio(struct request *rq,
+ struct bio *bio)
+{
+ /*
+ * For zone append requests, the request sector indicates the location
+ * at which the BIO data was written. Return this value to the BIO
+ * issuer through the BIO iter sector.
+ * For plugged zone writes, which include emulated zone append, we need
+ * the original BIO sector so that blk_zone_write_plug_bio_endio() can
+ * lookup the zone write plug.
+ */
+ if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
+ bio->bi_iter.bi_sector = rq->__sector;
+}
+void blk_zone_write_plug_bio_endio(struct bio *bio);
+static inline void blk_zone_bio_endio(struct bio *bio)
+{
+ /*
+ * For write BIOs to zoned devices, signal the completion of the BIO so
+ * that the next write BIO can be submitted by zone write plugging.
+ */
+ if (bio_zone_write_plugging(bio))
+ blk_zone_write_plug_bio_endio(bio);
+}
+
+void blk_zone_write_plug_finish_request(struct request *rq);
+static inline void blk_zone_finish_request(struct request *rq)
+{
+ if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+ blk_zone_write_plug_finish_request(rq);
+}
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
unsigned long arg);
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg);
#else /* CONFIG_BLK_DEV_ZONED */
-static inline void disk_free_zone_bitmaps(struct gendisk *disk) {}
+static inline void disk_init_zone_resources(struct gendisk *disk)
+{
+}
+static inline void disk_free_zone_resources(struct gendisk *disk)
+{
+}
+static inline bool bio_zone_write_plugging(struct bio *bio)
+{
+ return false;
+}
+static inline bool bio_is_zone_append(struct bio *bio)
+{
+ return false;
+}
+static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
+{
+}
+static inline void blk_zone_write_plug_init_request(struct request *rq)
+{
+}
+static inline void blk_zone_update_request_bio(struct request *rq,
+ struct bio *bio)
+{
+}
+static inline void blk_zone_bio_endio(struct bio *bio)
+{
+}
+static inline void blk_zone_finish_request(struct request *rq)
+{
+}
static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
unsigned int cmd, unsigned long arg)
{
diff --git a/block/elevator.c b/block/elevator.c
index 5ff093cb3cf8f5..f64ebd726e588a 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -83,13 +83,6 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_bio_merge_ok);
-static inline bool elv_support_features(struct request_queue *q,
- const struct elevator_type *e)
-{
- return (q->required_elevator_features & e->elevator_features) ==
- q->required_elevator_features;
-}
-
/**
* elevator_match - Check whether @e's name or alias matches @name
* @e: Scheduler to test
@@ -120,7 +113,7 @@ static struct elevator_type *elevator_find_get(struct request_queue *q,
spin_lock(&elv_list_lock);
e = __elevator_find(name);
- if (e && (!elv_support_features(q, e) || !elevator_tryget(e)))
+ if (e && (!elevator_tryget(e)))
e = NULL;
spin_unlock(&elv_list_lock);
return e;
@@ -580,34 +573,8 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
}
/*
- * Get the first elevator providing the features required by the request queue.
- * Default to "none" if no matching elevator is found.
- */
-static struct elevator_type *elevator_get_by_features(struct request_queue *q)
-{
- struct elevator_type *e, *found = NULL;
-
- spin_lock(&elv_list_lock);
-
- list_for_each_entry(e, &elv_list, list) {
- if (elv_support_features(q, e)) {
- found = e;
- break;
- }
- }
-
- if (found && !elevator_tryget(found))
- found = NULL;
-
- spin_unlock(&elv_list_lock);
- return found;
-}
-
-/*
- * For a device queue that has no required features, use the default elevator
- * settings. Otherwise, use the first elevator available matching the required
- * features. If no suitable elevator is find or if the chosen elevator
- * initialization fails, fall back to the "none" elevator (no elevator).
+ * Use the default elevator settings. If the chosen elevator initialization
+ * fails, fall back to the "none" elevator (no elevator).
*/
void elevator_init_mq(struct request_queue *q)
{
@@ -622,10 +589,7 @@ void elevator_init_mq(struct request_queue *q)
if (unlikely(q->elevator))
return;
- if (!q->required_elevator_features)
- e = elevator_get_default(q);
- else
- e = elevator_get_by_features(q);
+ e = elevator_get_default(q);
if (!e)
return;
@@ -781,7 +745,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
list_for_each_entry(e, &elv_list, list) {
if (e == cur)
len += sprintf(name+len, "[%s] ", e->elevator_name);
- else if (elv_support_features(q, e))
+ else
len += sprintf(name+len, "%s ", e->elevator_name);
}
spin_unlock(&elv_list_lock);
diff --git a/block/elevator.h b/block/elevator.h
index 7ca3d7b6ed8289..e9a050a96e5305 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -74,7 +74,6 @@ struct elevator_type
struct elv_fs_entry *elevator_attrs;
const char *elevator_name;
const char *elevator_alias;
- const unsigned int elevator_features;
struct module *elevator_owner;
#ifdef CONFIG_BLK_DEBUG_FS
const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
diff --git a/block/fops.c b/block/fops.c
index 040743a3b43d27..5d4ef034fffe65 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -44,18 +44,15 @@ static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
#define DIO_INLINE_BIO_VECS 4
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
- struct iov_iter *iter, unsigned int nr_pages)
+ struct iov_iter *iter, struct block_device *bdev,
+ unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
loff_t pos = iocb->ki_pos;
bool should_dirty = false;
struct bio bio;
ssize_t ret;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (nr_pages <= DIO_INLINE_BIO_VECS)
vecs = inline_vecs;
else {
@@ -161,9 +158,8 @@ static void blkdev_bio_end_io(struct bio *bio)
}
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- unsigned int nr_pages)
+ struct block_device *bdev, unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
@@ -172,9 +168,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos = iocb->ki_pos;
int ret = 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@ -302,9 +295,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
struct iov_iter *iter,
+ struct block_device *bdev,
unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
bool is_read = iov_iter_rw(iter) == READ;
blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
struct blkdev_dio *dio;
@@ -312,9 +305,6 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
loff_t pos = iocb->ki_pos;
int ret = 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@ -368,18 +358,23 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
unsigned int nr_pages;
if (!iov_iter_count(iter))
return 0;
+ if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter))
+ return -EINVAL;
+
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (likely(nr_pages <= BIO_MAX_VECS)) {
if (is_sync_kiocb(iocb))
- return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
- return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+ return __blkdev_direct_IO_simple(iocb, iter, bdev,
+ nr_pages);
+ return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
}
- return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
+ return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
}
static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
diff --git a/block/genhd.c b/block/genhd.c
index b294d56961fba4..6b0447a35d7702 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1182,7 +1182,7 @@ static void disk_release(struct device *dev)
disk_release_events(disk);
kfree(disk->random);
- disk_free_zone_bitmaps(disk);
+ disk_free_zone_resources(disk);
xa_destroy(&disk->part_tbl);
disk->queue->disk = NULL;
@@ -1364,6 +1364,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (blkcg_init_disk(disk))
goto out_erase_part0;
+ disk_init_zone_resources(disk);
rand_initialize_disk(disk);
disk_to_dev(disk)->class = &block_class;
disk_to_dev(disk)->type = &disk_type;
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 02a916ba62ee75..94eede4fb9ebed 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -102,7 +102,6 @@ struct deadline_data {
int prio_aging_expire;
spinlock_t lock;
- spinlock_t zone_lock;
};
/* Maps an I/O priority class to a deadline scheduler priority. */
@@ -129,36 +128,7 @@ static u8 dd_rq_ioclass(struct request *rq)
}
/*
- * get the request before `rq' in sector-sorted order
- */
-static inline struct request *
-deadline_earlier_request(struct request *rq)
-{
- struct rb_node *node = rb_prev(&rq->rb_node);
-
- if (node)
- return rb_entry_rq(node);
-
- return NULL;
-}
-
-/*
- * get the request after `rq' in sector-sorted order
- */
-static inline struct request *
-deadline_latter_request(struct request *rq)
-{
- struct rb_node *node = rb_next(&rq->rb_node);
-
- if (node)
- return rb_entry_rq(node);
-
- return NULL;
-}
-
-/*
- * Return the first request for which blk_rq_pos() >= @pos. For zoned devices,
- * return the first request after the start of the zone containing @pos.
+ * Return the first request for which blk_rq_pos() >= @pos.
*/
static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
enum dd_data_dir data_dir, sector_t pos)
@@ -170,14 +140,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
return NULL;
rq = rb_entry_rq(node);
- /*
- * A zoned write may have been requeued with a starting position that
- * is below that of the most recently dispatched request. Hence, for
- * zoned writes, start searching from the start of a zone.
- */
- if (blk_rq_is_seq_zoned_write(rq))
- pos = round_down(pos, rq->q->limits.chunk_sectors);
-
while (node) {
rq = rb_entry_rq(node);
if (blk_rq_pos(rq) >= pos) {
@@ -309,36 +271,6 @@ static inline bool deadline_check_fifo(struct dd_per_prio *per_prio,
}
/*
- * Check if rq has a sequential request preceding it.
- */
-static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq)
-{
- struct request *prev = deadline_earlier_request(rq);
-
- if (!prev)
- return false;
-
- return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq);
-}
-
-/*
- * Skip all write requests that are sequential from @rq, even if we cross
- * a zone boundary.
- */
-static struct request *deadline_skip_seq_writes(struct deadline_data *dd,
- struct request *rq)
-{
- sector_t pos = blk_rq_pos(rq);
-
- do {
- pos += blk_rq_sectors(rq);
- rq = deadline_latter_request(rq);
- } while (rq && blk_rq_pos(rq) == pos);
-
- return rq;
-}
-
-/*
* For the specified data direction, return the next request to
* dispatch using arrival ordered lists.
*/
@@ -346,40 +278,10 @@ static struct request *
deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
enum dd_data_dir data_dir)
{
- struct request *rq, *rb_rq, *next;
- unsigned long flags;
-
if (list_empty(&per_prio->fifo_list[data_dir]))
return NULL;
- rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
- if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
- return rq;
-
- /*
- * Look for a write request that can be dispatched, that is one with
- * an unlocked target zone. For some HDDs, breaking a sequential
- * write stream can lead to lower throughput, so make sure to preserve
- * sequential write streams, even if that stream crosses into the next
- * zones and these zones are unlocked.
- */
- spin_lock_irqsave(&dd->zone_lock, flags);
- list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE],
- queuelist) {
- /* Check whether a prior request exists for the same zone. */
- rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq));
- if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq))
- rq = rb_rq;
- if (blk_req_can_dispatch_to_zone(rq) &&
- (blk_queue_nonrot(rq->q) ||
- !deadline_is_seq_write(dd, rq)))
- goto out;
- }
- rq = NULL;
-out:
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- return rq;
+ return rq_entry_fifo(per_prio->fifo_list[data_dir].next);
}
/*
@@ -390,36 +292,8 @@ static struct request *
deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
enum dd_data_dir data_dir)
{
- struct request *rq;
- unsigned long flags;
-
- rq = deadline_from_pos(per_prio, data_dir,
- per_prio->latest_pos[data_dir]);
- if (!rq)
- return NULL;
-
- if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
- return rq;
-
- /*
- * Look for a write request that can be dispatched, that is one with
- * an unlocked target zone. For some HDDs, breaking a sequential
- * write stream can lead to lower throughput, so make sure to preserve
- * sequential write streams, even if that stream crosses into the next
- * zones and these zones are unlocked.
- */
- spin_lock_irqsave(&dd->zone_lock, flags);
- while (rq) {
- if (blk_req_can_dispatch_to_zone(rq))
- break;
- if (blk_queue_nonrot(rq->q))
- rq = deadline_latter_request(rq);
- else
- rq = deadline_skip_seq_writes(dd, rq);
- }
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- return rq;
+ return deadline_from_pos(per_prio, data_dir,
+ per_prio->latest_pos[data_dir]);
}
/*
@@ -525,10 +399,6 @@ dispatch_find_request:
rq = next_rq;
}
- /*
- * For a zoned block device, if we only have writes queued and none of
- * them can be dispatched, rq will be NULL.
- */
if (!rq)
return NULL;
@@ -549,10 +419,6 @@ done:
prio = ioprio_class_to_prio[ioprio_class];
dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
dd->per_prio[prio].stats.dispatched++;
- /*
- * If the request needs its target zone locked, do it.
- */
- blk_req_zone_write_lock(rq);
rq->rq_flags |= RQF_STARTED;
return rq;
}
@@ -722,7 +588,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
dd->fifo_batch = fifo_batch;
dd->prio_aging_expire = prio_aging_expire;
spin_lock_init(&dd->lock);
- spin_lock_init(&dd->zone_lock);
/* We dispatch from request queue wide instead of hw queue */
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
@@ -804,12 +669,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
lockdep_assert_held(&dd->lock);
- /*
- * This may be a requeue of a write request that has locked its
- * target zone. If it is the case, this releases the zone lock.
- */
- blk_req_zone_write_unlock(rq);
-
prio = ioprio_class_to_prio[ioprio_class];
per_prio = &dd->per_prio[prio];
if (!rq->elv.priv[0]) {
@@ -841,18 +700,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
*/
rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
insert_before = &per_prio->fifo_list[data_dir];
-#ifdef CONFIG_BLK_DEV_ZONED
- /*
- * Insert zoned writes such that requests are sorted by
- * position per zone.
- */
- if (blk_rq_is_seq_zoned_write(rq)) {
- struct request *rq2 = deadline_latter_request(rq);
-
- if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq))
- insert_before = &rq2->queuelist;
- }
-#endif
list_add_tail(&rq->queuelist, insert_before);
}
}
@@ -887,33 +734,8 @@ static void dd_prepare_request(struct request *rq)
rq->elv.priv[0] = NULL;
}
-static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx)
-{
- struct deadline_data *dd = hctx->queue->elevator->elevator_data;
- enum dd_prio p;
-
- for (p = 0; p <= DD_PRIO_MAX; p++)
- if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE]))
- return true;
-
- return false;
-}
-
/*
* Callback from inside blk_mq_free_request().
- *
- * For zoned block devices, write unlock the target zone of
- * completed write requests. Do this while holding the zone lock
- * spinlock so that the zone is never unlocked while deadline_fifo_request()
- * or deadline_next_request() are executing. This function is called for
- * all requests, whether or not these requests complete successfully.
- *
- * For a zoned block device, __dd_dispatch_request() may have stopped
- * dispatching requests if all the queued requests are write requests directed
- * at zones that are already locked due to on-going write requests. To ensure
- * write request dispatch progress in this case, mark the queue as needing a
- * restart to ensure that the queue is run again after completion of the
- * request and zones being unlocked.
*/
static void dd_finish_request(struct request *rq)
{
@@ -928,21 +750,8 @@ static void dd_finish_request(struct request *rq)
* called dd_insert_requests(). Skip requests that bypassed I/O
* scheduling. See also blk_mq_request_bypass_insert().
*/
- if (!rq->elv.priv[0])
- return;
-
- atomic_inc(&per_prio->stats.completed);
-
- if (blk_queue_is_zoned(q)) {
- unsigned long flags;
-
- spin_lock_irqsave(&dd->zone_lock, flags);
- blk_req_zone_write_unlock(rq);
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- if (dd_has_write_work(rq->mq_hctx))
- blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
- }
+ if (rq->elv.priv[0])
+ atomic_inc(&per_prio->stats.completed);
}
static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
@@ -1266,7 +1075,6 @@ static struct elevator_type mq_deadline = {
.elevator_attrs = deadline_attrs,
.elevator_name = "mq-deadline",
.elevator_alias = "deadline",
- .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
.elevator_owner = THIS_MODULE,
};
MODULE_ALIAS("mq-deadline-iosched");
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index e322cef6596bfa..b900fe9e003080 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -29,10 +29,7 @@
/*
* Each block ramdisk device has a xarray brd_pages of pages that stores
- * the pages containing the block device's contents. A brd page's ->index is
- * its offset in PAGE_SIZE units. This is similar to, but in no way connected
- * with, the kernel's pagecache or buffer cache (which sit above our block
- * device).
+ * the pages containing the block device's contents.
*/
struct brd_device {
int brd_number;
@@ -51,15 +48,7 @@ struct brd_device {
*/
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
- pgoff_t idx;
- struct page *page;
-
- idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
- page = xa_load(&brd->brd_pages, idx);
-
- BUG_ON(page && page->index != idx);
-
- return page;
+ return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
}
/*
@@ -67,8 +56,8 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
*/
static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
{
- pgoff_t idx;
- struct page *page, *cur;
+ pgoff_t idx = sector >> PAGE_SECTORS_SHIFT;
+ struct page *page;
int ret = 0;
page = brd_lookup_page(brd, sector);
@@ -80,23 +69,16 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
return -ENOMEM;
xa_lock(&brd->brd_pages);
-
- idx = sector >> PAGE_SECTORS_SHIFT;
- page->index = idx;
-
- cur = __xa_cmpxchg(&brd->brd_pages, idx, NULL, page, gfp);
-
- if (unlikely(cur)) {
- __free_page(page);
- ret = xa_err(cur);
- if (!ret && (cur->index != idx))
- ret = -EIO;
- } else {
+ ret = __xa_insert(&brd->brd_pages, idx, page, gfp);
+ if (!ret)
brd->brd_nr_pages++;
- }
-
xa_unlock(&brd->brd_pages);
+ if (ret < 0) {
+ __free_page(page);
+ if (ret == -EBUSY)
+ ret = 0;
+ }
return ret;
}
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index ed33cf7192d216..241bcfad20cf6e 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -225,6 +225,10 @@ static unsigned long g_cache_size;
module_param_named(cache_size, g_cache_size, ulong, 0444);
MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)");
+static bool g_fua = true;
+module_param_named(fua, g_fua, bool, 0444);
+MODULE_PARM_DESC(zoned, "Enable/disable FUA support when cache_size is used. Default: true");
+
static unsigned int g_mbps;
module_param_named(mbps, g_mbps, uint, 0444);
MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");
@@ -253,6 +257,11 @@ static unsigned int g_zone_max_active;
module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");
+static int g_zone_append_max_sectors = INT_MAX;
+module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444);
+MODULE_PARM_DESC(zone_append_max_sectors,
+ "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation");
+
static struct nullb_device *null_alloc_dev(void);
static void null_free_dev(struct nullb_device *dev);
static void null_del_dev(struct nullb *nullb);
@@ -436,10 +445,12 @@ NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
+NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL);
NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
NULLB_DEVICE_ATTR(no_sched, bool, NULL);
NULLB_DEVICE_ATTR(shared_tags, bool, NULL);
NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
+NULLB_DEVICE_ATTR(fua, bool, NULL);
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
{
@@ -580,12 +591,14 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_zone_nr_conv,
&nullb_device_attr_zone_max_open,
&nullb_device_attr_zone_max_active,
+ &nullb_device_attr_zone_append_max_sectors,
&nullb_device_attr_zone_readonly,
&nullb_device_attr_zone_offline,
&nullb_device_attr_virt_boundary,
&nullb_device_attr_no_sched,
&nullb_device_attr_shared_tags,
&nullb_device_attr_shared_tag_bitmap,
+ &nullb_device_attr_fua,
NULL,
};
@@ -664,14 +677,14 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
static ssize_t memb_group_features_show(struct config_item *item, char *page)
{
return snprintf(page, PAGE_SIZE,
- "badblocks,blocking,blocksize,cache_size,"
+ "badblocks,blocking,blocksize,cache_size,fua,"
"completion_nsec,discard,home_node,hw_queue_depth,"
"irqmode,max_sectors,mbps,memory_backed,no_sched,"
"poll_queues,power,queue_mode,shared_tag_bitmap,"
"shared_tags,size,submit_queues,use_per_node_hctx,"
"virt_boundary,zoned,zone_capacity,zone_max_active,"
"zone_max_open,zone_nr_conv,zone_offline,zone_readonly,"
- "zone_size\n");
+ "zone_size,zone_append_max_sectors\n");
}
CONFIGFS_ATTR_RO(memb_group_, features);
@@ -751,10 +764,13 @@ static struct nullb_device *null_alloc_dev(void)
dev->zone_nr_conv = g_zone_nr_conv;
dev->zone_max_open = g_zone_max_open;
dev->zone_max_active = g_zone_max_active;
+ dev->zone_append_max_sectors = g_zone_append_max_sectors;
dev->virt_boundary = g_virt_boundary;
dev->no_sched = g_no_sched;
dev->shared_tags = g_shared_tags;
dev->shared_tag_bitmap = g_shared_tag_bitmap;
+ dev->fua = g_fua;
+
return dev;
}
@@ -1151,7 +1167,7 @@ blk_status_t null_handle_discard(struct nullb_device *dev,
return BLK_STS_OK;
}
-static int null_handle_flush(struct nullb *nullb)
+static blk_status_t null_handle_flush(struct nullb *nullb)
{
int err;
@@ -1168,7 +1184,7 @@ static int null_handle_flush(struct nullb *nullb)
WARN_ON(!radix_tree_empty(&nullb->dev->cache));
spin_unlock_irq(&nullb->lock);
- return err;
+ return errno_to_blk_status(err);
}
static int null_transfer(struct nullb *nullb, struct page *page,
@@ -1206,7 +1222,7 @@ static int null_handle_rq(struct nullb_cmd *cmd)
{
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct nullb *nullb = cmd->nq->dev->nullb;
- int err;
+ int err = 0;
unsigned int len;
sector_t sector = blk_rq_pos(rq);
struct req_iterator iter;
@@ -1218,15 +1234,13 @@ static int null_handle_rq(struct nullb_cmd *cmd)
err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
op_is_write(req_op(rq)), sector,
rq->cmd_flags & REQ_FUA);
- if (err) {
- spin_unlock_irq(&nullb->lock);
- return err;
- }
+ if (err)
+ break;
sector += len >> SECTOR_SHIFT;
}
spin_unlock_irq(&nullb->lock);
- return 0;
+ return errno_to_blk_status(err);
}
static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
@@ -1273,8 +1287,8 @@ static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
if (op == REQ_OP_DISCARD)
return null_handle_discard(dev, sector, nr_sectors);
- return errno_to_blk_status(null_handle_rq(cmd));
+ return null_handle_rq(cmd);
}
static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
@@ -1343,7 +1357,7 @@ static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector,
blk_status_t sts;
if (op == REQ_OP_FLUSH) {
- cmd->error = errno_to_blk_status(null_handle_flush(nullb));
+ cmd->error = null_handle_flush(nullb);
goto out;
}
@@ -1912,7 +1926,7 @@ static int null_add_dev(struct nullb_device *dev)
if (dev->cache_size > 0) {
set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
- blk_queue_write_cache(nullb->q, true, true);
+ blk_queue_write_cache(nullb->q, true, dev->fua);
}
nullb->q->queuedata = nullb;
@@ -2113,6 +2127,8 @@ static void __exit null_exit(void)
if (tag_set.ops)
blk_mq_free_tag_set(&tag_set);
+
+ mutex_destroy(&lock);
}
module_init(null_init);
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 477b9774682346..3234e6c85eed8c 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -82,6 +82,7 @@ struct nullb_device {
unsigned int zone_nr_conv; /* number of conventional zones */
unsigned int zone_max_open; /* max number of open zones */
unsigned int zone_max_active; /* max number of active zones */
+ unsigned int zone_append_max_sectors; /* Max sectors per zone append command */
unsigned int submit_queues; /* number of submission queues */
unsigned int prev_submit_queues; /* number of submission queues before change */
unsigned int poll_queues; /* number of IOPOLL submission queues */
@@ -104,6 +105,7 @@ struct nullb_device {
bool no_sched; /* no IO scheduler for the device */
bool shared_tags; /* share tag set between devices for blk-mq */
bool shared_tag_bitmap; /* use hostwide shared tags */
+ bool fua; /* Support FUA */
};
struct nullb {
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 1689e258410483..5b5a63adacc17c 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -9,6 +9,8 @@
#undef pr_fmt
#define pr_fmt(fmt) "null_blk: " fmt
+#define NULL_ZONE_INVALID_WP ((sector_t)-1)
+
static inline sector_t mb_to_sects(unsigned long mb)
{
return ((sector_t)mb * SZ_1M) >> SECTOR_SHIFT;
@@ -19,18 +21,6 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
return sect >> ilog2(dev->zone_size_sects);
}
-static inline void null_lock_zone_res(struct nullb_device *dev)
-{
- if (dev->need_zone_res_mgmt)
- spin_lock_irq(&dev->zone_res_lock);
-}
-
-static inline void null_unlock_zone_res(struct nullb_device *dev)
-{
- if (dev->need_zone_res_mgmt)
- spin_unlock_irq(&dev->zone_res_lock);
-}
-
static inline void null_init_zone_lock(struct nullb_device *dev,
struct nullb_zone *zone)
{
@@ -103,6 +93,11 @@ int null_init_zoned_dev(struct nullb_device *dev,
dev->zone_nr_conv);
}
+ dev->zone_append_max_sectors =
+ min(ALIGN_DOWN(dev->zone_append_max_sectors,
+ dev->blocksize >> SECTOR_SHIFT),
+ zone_capacity_sects);
+
/* Max active zones has to be < nbr of seq zones in order to be enforceable */
if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) {
dev->zone_max_active = 0;
@@ -154,7 +149,7 @@ int null_init_zoned_dev(struct nullb_device *dev,
lim->zoned = true;
lim->chunk_sectors = dev->zone_size_sects;
- lim->max_zone_append_sectors = dev->zone_size_sects;
+ lim->max_zone_append_sectors = dev->zone_append_max_sectors;
lim->max_open_zones = dev->zone_max_open;
lim->max_active_zones = dev->zone_max_active;
return 0;
@@ -163,11 +158,16 @@ int null_init_zoned_dev(struct nullb_device *dev,
int null_register_zoned_dev(struct nullb *nullb)
{
struct request_queue *q = nullb->q;
+ struct gendisk *disk = nullb->disk;
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
- blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
- nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0);
- return blk_revalidate_disk_zones(nullb->disk, NULL);
+ disk->nr_zones = bdev_nr_zones(disk->part0);
+
+ pr_info("%s: using %s zone append\n",
+ disk->disk_name,
+ queue_emulates_zone_append(q) ? "emulated" : "native");
+
+ return blk_revalidate_disk_zones(disk);
}
void null_free_zoned_dev(struct nullb_device *dev)
@@ -241,35 +241,6 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
return (zone->wp - sector) << SECTOR_SHIFT;
}
-static blk_status_t __null_close_zone(struct nullb_device *dev,
- struct nullb_zone *zone)
-{
- switch (zone->cond) {
- case BLK_ZONE_COND_CLOSED:
- /* close operation on closed is not an error */
- return BLK_STS_OK;
- case BLK_ZONE_COND_IMP_OPEN:
- dev->nr_zones_imp_open--;
- break;
- case BLK_ZONE_COND_EXP_OPEN:
- dev->nr_zones_exp_open--;
- break;
- case BLK_ZONE_COND_EMPTY:
- case BLK_ZONE_COND_FULL:
- default:
- return BLK_STS_IOERR;
- }
-
- if (zone->wp == zone->start) {
- zone->cond = BLK_ZONE_COND_EMPTY;
- } else {
- zone->cond = BLK_ZONE_COND_CLOSED;
- dev->nr_zones_closed++;
- }
-
- return BLK_STS_OK;
-}
-
static void null_close_imp_open_zone(struct nullb_device *dev)
{
struct nullb_zone *zone;
@@ -286,7 +257,13 @@ static void null_close_imp_open_zone(struct nullb_device *dev)
zno = dev->zone_nr_conv;
if (zone->cond == BLK_ZONE_COND_IMP_OPEN) {
- __null_close_zone(dev, zone);
+ dev->nr_zones_imp_open--;
+ if (zone->wp == zone->start) {
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ } else {
+ zone->cond = BLK_ZONE_COND_CLOSED;
+ dev->nr_zones_closed++;
+ }
dev->imp_close_zone_no = zno;
return;
}
@@ -374,73 +351,73 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
null_lock_zone(dev, zone);
- if (zone->cond == BLK_ZONE_COND_FULL ||
- zone->cond == BLK_ZONE_COND_READONLY ||
- zone->cond == BLK_ZONE_COND_OFFLINE) {
- /* Cannot write to the zone */
- ret = BLK_STS_IOERR;
- goto unlock;
- }
-
/*
- * Regular writes must be at the write pointer position.
- * Zone append writes are automatically issued at the write
- * pointer and the position returned using the request or BIO
- * sector.
+ * Regular writes must be at the write pointer position. Zone append
+ * writes are automatically issued at the write pointer and the position
+ * returned using the request sector. Note that we do not check the zone
+ * condition because for FULL, READONLY and OFFLINE zones, the sector
+ * check against the zone write pointer will always result in failing
+ * the command.
*/
if (append) {
+ if (WARN_ON_ONCE(!dev->zone_append_max_sectors) ||
+ zone->wp == NULL_ZONE_INVALID_WP) {
+ ret = BLK_STS_IOERR;
+ goto unlock_zone;
+ }
sector = zone->wp;
blk_mq_rq_from_pdu(cmd)->__sector = sector;
- } else if (sector != zone->wp) {
- ret = BLK_STS_IOERR;
- goto unlock;
}
- if (zone->wp + nr_sectors > zone->start + zone->capacity) {
+ if (sector != zone->wp ||
+ zone->wp + nr_sectors > zone->start + zone->capacity) {
ret = BLK_STS_IOERR;
- goto unlock;
+ goto unlock_zone;
}
if (zone->cond == BLK_ZONE_COND_CLOSED ||
zone->cond == BLK_ZONE_COND_EMPTY) {
- null_lock_zone_res(dev);
+ if (dev->need_zone_res_mgmt) {
+ spin_lock(&dev->zone_res_lock);
- ret = null_check_zone_resources(dev, zone);
- if (ret != BLK_STS_OK) {
- null_unlock_zone_res(dev);
- goto unlock;
- }
- if (zone->cond == BLK_ZONE_COND_CLOSED) {
- dev->nr_zones_closed--;
- dev->nr_zones_imp_open++;
- } else if (zone->cond == BLK_ZONE_COND_EMPTY) {
- dev->nr_zones_imp_open++;
- }
+ ret = null_check_zone_resources(dev, zone);
+ if (ret != BLK_STS_OK) {
+ spin_unlock(&dev->zone_res_lock);
+ goto unlock_zone;
+ }
+ if (zone->cond == BLK_ZONE_COND_CLOSED) {
+ dev->nr_zones_closed--;
+ dev->nr_zones_imp_open++;
+ } else if (zone->cond == BLK_ZONE_COND_EMPTY) {
+ dev->nr_zones_imp_open++;
+ }
- if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
- zone->cond = BLK_ZONE_COND_IMP_OPEN;
+ spin_unlock(&dev->zone_res_lock);
+ }
- null_unlock_zone_res(dev);
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
}
ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
if (ret != BLK_STS_OK)
- goto unlock;
+ goto unlock_zone;
zone->wp += nr_sectors;
if (zone->wp == zone->start + zone->capacity) {
- null_lock_zone_res(dev);
- if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
- dev->nr_zones_exp_open--;
- else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
- dev->nr_zones_imp_open--;
+ if (dev->need_zone_res_mgmt) {
+ spin_lock(&dev->zone_res_lock);
+ if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
+ dev->nr_zones_exp_open--;
+ else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
+ dev->nr_zones_imp_open--;
+ spin_unlock(&dev->zone_res_lock);
+ }
zone->cond = BLK_ZONE_COND_FULL;
- null_unlock_zone_res(dev);
}
ret = BLK_STS_OK;
-unlock:
+unlock_zone:
null_unlock_zone(dev, zone);
return ret;
@@ -454,54 +431,100 @@ static blk_status_t null_open_zone(struct nullb_device *dev,
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
return BLK_STS_IOERR;
- null_lock_zone_res(dev);
-
switch (zone->cond) {
case BLK_ZONE_COND_EXP_OPEN:
- /* open operation on exp open is not an error */
- goto unlock;
+ /* Open operation on exp open is not an error */
+ return BLK_STS_OK;
case BLK_ZONE_COND_EMPTY:
- ret = null_check_zone_resources(dev, zone);
- if (ret != BLK_STS_OK)
- goto unlock;
- break;
case BLK_ZONE_COND_IMP_OPEN:
- dev->nr_zones_imp_open--;
- break;
case BLK_ZONE_COND_CLOSED:
- ret = null_check_zone_resources(dev, zone);
- if (ret != BLK_STS_OK)
- goto unlock;
- dev->nr_zones_closed--;
break;
case BLK_ZONE_COND_FULL:
default:
- ret = BLK_STS_IOERR;
- goto unlock;
+ return BLK_STS_IOERR;
}
- zone->cond = BLK_ZONE_COND_EXP_OPEN;
- dev->nr_zones_exp_open++;
+ if (dev->need_zone_res_mgmt) {
+ spin_lock(&dev->zone_res_lock);
-unlock:
- null_unlock_zone_res(dev);
+ switch (zone->cond) {
+ case BLK_ZONE_COND_EMPTY:
+ ret = null_check_zone_resources(dev, zone);
+ if (ret != BLK_STS_OK) {
+ spin_unlock(&dev->zone_res_lock);
+ return ret;
+ }
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ dev->nr_zones_imp_open--;
+ break;
+ case BLK_ZONE_COND_CLOSED:
+ ret = null_check_zone_resources(dev, zone);
+ if (ret != BLK_STS_OK) {
+ spin_unlock(&dev->zone_res_lock);
+ return ret;
+ }
+ dev->nr_zones_closed--;
+ break;
+ default:
+ break;
+ }
- return ret;
+ dev->nr_zones_exp_open++;
+
+ spin_unlock(&dev->zone_res_lock);
+ }
+
+ zone->cond = BLK_ZONE_COND_EXP_OPEN;
+
+ return BLK_STS_OK;
}
static blk_status_t null_close_zone(struct nullb_device *dev,
struct nullb_zone *zone)
{
- blk_status_t ret;
-
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
return BLK_STS_IOERR;
- null_lock_zone_res(dev);
- ret = __null_close_zone(dev, zone);
- null_unlock_zone_res(dev);
+ switch (zone->cond) {
+ case BLK_ZONE_COND_CLOSED:
+ /* close operation on closed is not an error */
+ return BLK_STS_OK;
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_FULL:
+ default:
+ return BLK_STS_IOERR;
+ }
+
+ if (dev->need_zone_res_mgmt) {
+ spin_lock(&dev->zone_res_lock);
- return ret;
+ switch (zone->cond) {
+ case BLK_ZONE_COND_IMP_OPEN:
+ dev->nr_zones_imp_open--;
+ break;
+ case BLK_ZONE_COND_EXP_OPEN:
+ dev->nr_zones_exp_open--;
+ break;
+ default:
+ break;
+ }
+
+ if (zone->wp > zone->start)
+ dev->nr_zones_closed++;
+
+ spin_unlock(&dev->zone_res_lock);
+ }
+
+ if (zone->wp == zone->start)
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ else
+ zone->cond = BLK_ZONE_COND_CLOSED;
+
+ return BLK_STS_OK;
}
static blk_status_t null_finish_zone(struct nullb_device *dev,
@@ -512,41 +535,47 @@ static blk_status_t null_finish_zone(struct nullb_device *dev,
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
return BLK_STS_IOERR;
- null_lock_zone_res(dev);
+ if (dev->need_zone_res_mgmt) {
+ spin_lock(&dev->zone_res_lock);
- switch (zone->cond) {
- case BLK_ZONE_COND_FULL:
- /* finish operation on full is not an error */
- goto unlock;
- case BLK_ZONE_COND_EMPTY:
- ret = null_check_zone_resources(dev, zone);
- if (ret != BLK_STS_OK)
- goto unlock;
- break;
- case BLK_ZONE_COND_IMP_OPEN:
- dev->nr_zones_imp_open--;
- break;
- case BLK_ZONE_COND_EXP_OPEN:
- dev->nr_zones_exp_open--;
- break;
- case BLK_ZONE_COND_CLOSED:
- ret = null_check_zone_resources(dev, zone);
- if (ret != BLK_STS_OK)
- goto unlock;
- dev->nr_zones_closed--;
- break;
- default:
- ret = BLK_STS_IOERR;
- goto unlock;
+ switch (zone->cond) {
+ case BLK_ZONE_COND_FULL:
+ /* Finish operation on full is not an error */
+ spin_unlock(&dev->zone_res_lock);
+ return BLK_STS_OK;
+ case BLK_ZONE_COND_EMPTY:
+ ret = null_check_zone_resources(dev, zone);
+ if (ret != BLK_STS_OK) {
+ spin_unlock(&dev->zone_res_lock);
+ return ret;
+ }
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ dev->nr_zones_imp_open--;
+ break;
+ case BLK_ZONE_COND_EXP_OPEN:
+ dev->nr_zones_exp_open--;
+ break;
+ case BLK_ZONE_COND_CLOSED:
+ ret = null_check_zone_resources(dev, zone);
+ if (ret != BLK_STS_OK) {
+ spin_unlock(&dev->zone_res_lock);
+ return ret;
+ }
+ dev->nr_zones_closed--;
+ break;
+ default:
+ spin_unlock(&dev->zone_res_lock);
+ return BLK_STS_IOERR;
+ }
+
+ spin_unlock(&dev->zone_res_lock);
}
zone->cond = BLK_ZONE_COND_FULL;
zone->wp = zone->start + zone->len;
-unlock:
- null_unlock_zone_res(dev);
-
- return ret;
+ return BLK_STS_OK;
}
static blk_status_t null_reset_zone(struct nullb_device *dev,
@@ -555,34 +584,33 @@ static blk_status_t null_reset_zone(struct nullb_device *dev,
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
return BLK_STS_IOERR;
- null_lock_zone_res(dev);
+ if (dev->need_zone_res_mgmt) {
+ spin_lock(&dev->zone_res_lock);
- switch (zone->cond) {
- case BLK_ZONE_COND_EMPTY:
- /* reset operation on empty is not an error */
- null_unlock_zone_res(dev);
- return BLK_STS_OK;
- case BLK_ZONE_COND_IMP_OPEN:
- dev->nr_zones_imp_open--;
- break;
- case BLK_ZONE_COND_EXP_OPEN:
- dev->nr_zones_exp_open--;
- break;
- case BLK_ZONE_COND_CLOSED:
- dev->nr_zones_closed--;
- break;
- case BLK_ZONE_COND_FULL:
- break;
- default:
- null_unlock_zone_res(dev);
- return BLK_STS_IOERR;
+ switch (zone->cond) {
+ case BLK_ZONE_COND_IMP_OPEN:
+ dev->nr_zones_imp_open--;
+ break;
+ case BLK_ZONE_COND_EXP_OPEN:
+ dev->nr_zones_exp_open--;
+ break;
+ case BLK_ZONE_COND_CLOSED:
+ dev->nr_zones_closed--;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_FULL:
+ break;
+ default:
+ spin_unlock(&dev->zone_res_lock);
+ return BLK_STS_IOERR;
+ }
+
+ spin_unlock(&dev->zone_res_lock);
}
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
- null_unlock_zone_res(dev);
-
if (dev->memory_backed)
return null_handle_discard(dev, zone->start, zone->len);
@@ -711,7 +739,7 @@ static void null_set_zone_cond(struct nullb_device *dev,
zone->cond != BLK_ZONE_COND_OFFLINE)
null_finish_zone(dev, zone);
zone->cond = cond;
- zone->wp = (sector_t)-1;
+ zone->wp = NULL_ZONE_INVALID_WP;
}
null_unlock_zone(dev, zone);
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index bea3d5cf8a8348..851c78913de2b2 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -221,7 +221,7 @@ static int ublk_get_nr_zones(const struct ublk_device *ub)
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
- return blk_revalidate_disk_zones(ub->ub_disk, NULL);
+ return blk_revalidate_disk_zones(ub->ub_disk);
}
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
@@ -249,8 +249,7 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
- blk_queue_required_elevator_features(ub->ub_disk->queue,
- ELEVATOR_F_ZBD_SEQ_WRITE);
+
ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
}
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 42dea7601d8799..c1af0a7d56c80a 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1543,7 +1543,7 @@ static int virtblk_probe(struct virtio_device *vdev)
*/
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) {
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue);
- err = blk_revalidate_disk_zones(vblk->disk, NULL);
+ err = blk_revalidate_disk_zones(vblk->disk);
if (err)
goto out_cleanup_disk;
}
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
index fd852981ef9ccb..cf433b0cf74275 100644
--- a/drivers/md/dm-bio-prison-v2.c
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -321,8 +321,7 @@ static bool __unlock(struct dm_bio_prison_v2 *prison,
{
BUG_ON(!cell->exclusive_lock);
- bio_list_merge(bios, &cell->bios);
- bio_list_init(&cell->bios);
+ bio_list_merge_init(bios, &cell->bios);
if (cell->shared_count) {
cell->exclusive_lock = false;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 911f73f7ebbaa0..0fcbf8603846b5 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -115,8 +115,7 @@ static void __commit(struct work_struct *_ws)
*/
spin_lock_irq(&b->lock);
list_splice_init(&b->work_items, &work_items);
- bio_list_merge(&bios, &b->bios);
- bio_list_init(&b->bios);
+ bio_list_merge_init(&bios, &b->bios);
b->commit_scheduled = false;
spin_unlock_irq(&b->lock);
@@ -565,8 +564,7 @@ static void defer_bio(struct cache *cache, struct bio *bio)
static void defer_bios(struct cache *cache, struct bio_list *bios)
{
spin_lock_irq(&cache->lock);
- bio_list_merge(&cache->deferred_bios, bios);
- bio_list_init(bios);
+ bio_list_merge_init(&cache->deferred_bios, bios);
spin_unlock_irq(&cache->lock);
wake_deferred_bio_worker(cache);
@@ -1816,8 +1814,7 @@ static void process_deferred_bios(struct work_struct *ws)
bio_list_init(&bios);
spin_lock_irq(&cache->lock);
- bio_list_merge(&bios, &cache->deferred_bios);
- bio_list_init(&cache->deferred_bios);
+ bio_list_merge_init(&bios, &cache->deferred_bios);
spin_unlock_irq(&cache->lock);
while ((bio = bio_list_pop(&bios))) {
@@ -1847,8 +1844,7 @@ static void requeue_deferred_bios(struct cache *cache)
struct bio_list bios;
bio_list_init(&bios);
- bio_list_merge(&bios, &cache->deferred_bios);
- bio_list_init(&cache->deferred_bios);
+ bio_list_merge_init(&bios, &cache->deferred_bios);
while ((bio = bio_list_pop(&bios))) {
bio->bi_status = BLK_STS_DM_REQUEUE;
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index 94b2fc33f64be3..3f68672ab7c938 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -1181,8 +1181,7 @@ static void process_deferred_discards(struct clone *clone)
struct bio_list discards = BIO_EMPTY_LIST;
spin_lock_irq(&clone->lock);
- bio_list_merge(&discards, &clone->deferred_discard_bios);
- bio_list_init(&clone->deferred_discard_bios);
+ bio_list_merge_init(&discards, &clone->deferred_discard_bios);
spin_unlock_irq(&clone->lock);
if (bio_list_empty(&discards))
@@ -1215,8 +1214,7 @@ static void process_deferred_bios(struct clone *clone)
struct bio_list bios = BIO_EMPTY_LIST;
spin_lock_irq(&clone->lock);
- bio_list_merge(&bios, &clone->deferred_bios);
- bio_list_init(&clone->deferred_bios);
+ bio_list_merge_init(&bios, &clone->deferred_bios);
spin_unlock_irq(&clone->lock);
if (bio_list_empty(&bios))
@@ -1237,11 +1235,9 @@ static void process_deferred_flush_bios(struct clone *clone)
* before issuing them or signaling their completion.
*/
spin_lock_irq(&clone->lock);
- bio_list_merge(&bios, &clone->deferred_flush_bios);
- bio_list_init(&clone->deferred_flush_bios);
-
- bio_list_merge(&bio_completions, &clone->deferred_flush_completions);
- bio_list_init(&clone->deferred_flush_completions);
+ bio_list_merge_init(&bios, &clone->deferred_flush_bios);
+ bio_list_merge_init(&bio_completions,
+ &clone->deferred_flush_completions);
spin_unlock_irq(&clone->lock);
if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index e6757a30dccad1..08700bfc3e2343 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -140,7 +140,7 @@ struct mapped_device {
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int nr_zones;
- unsigned int *zwp_offset;
+ void *zone_revalidate_map;
#endif
#ifdef CONFIG_IMA
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index 6acfa5bf97a400..8f81e597858d5c 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -1272,8 +1272,7 @@ static void process_deferred_bios(struct era *era)
bio_list_init(&marked_bios);
spin_lock(&era->deferred_lock);
- bio_list_merge(&deferred_bios, &era->deferred_bios);
- bio_list_init(&era->deferred_bios);
+ bio_list_merge_init(&deferred_bios, &era->deferred_bios);
spin_unlock(&era->deferred_lock);
if (bio_list_empty(&deferred_bios))
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 05d1328d18119c..15b681b901531f 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -704,8 +704,7 @@ static void process_queued_bios(struct work_struct *work)
return;
}
- bio_list_merge(&bios, &m->queued_bios);
- bio_list_init(&m->queued_bios);
+ bio_list_merge_init(&bios, &m->queued_bios);
spin_unlock_irqrestore(&m->lock, flags);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 41f1d731ae5ac2..2c6fbd87363f29 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -2042,7 +2042,8 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
r = dm_set_zones_restrictions(t, q);
if (r)
return r;
- if (!static_key_enabled(&zoned_enabled.key))
+ if (blk_queue_is_zoned(q) &&
+ !static_key_enabled(&zoned_enabled.key))
static_branch_enable(&zoned_enabled);
}
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 4793ad2aa1f7e8..f359984c8ef27e 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -592,12 +592,6 @@ struct dm_thin_endio_hook {
struct dm_bio_prison_cell *cell;
};
-static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
-{
- bio_list_merge(bios, master);
- bio_list_init(master);
-}
-
static void error_bio_list(struct bio_list *bios, blk_status_t error)
{
struct bio *bio;
@@ -616,7 +610,7 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
bio_list_init(&bios);
spin_lock_irq(&tc->lock);
- __merge_bio_list(&bios, master);
+ bio_list_merge_init(&bios, master);
spin_unlock_irq(&tc->lock);
error_bio_list(&bios, error);
@@ -645,8 +639,8 @@ static void requeue_io(struct thin_c *tc)
bio_list_init(&bios);
spin_lock_irq(&tc->lock);
- __merge_bio_list(&bios, &tc->deferred_bio_list);
- __merge_bio_list(&bios, &tc->retry_on_resume_list);
+ bio_list_merge_init(&bios, &tc->deferred_bio_list);
+ bio_list_merge_init(&bios, &tc->retry_on_resume_list);
spin_unlock_irq(&tc->lock);
error_bio_list(&bios, BLK_STS_DM_REQUEUE);
diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c
index 94f6f1ccfb7d6d..ab3ea833780946 100644
--- a/drivers/md/dm-vdo/data-vio.c
+++ b/drivers/md/dm-vdo/data-vio.c
@@ -604,8 +604,7 @@ static void assign_discard_permit(struct limiter *limiter)
static void get_waiters(struct limiter *limiter)
{
- bio_list_merge(&limiter->waiters, &limiter->new_waiters);
- bio_list_init(&limiter->new_waiters);
+ bio_list_merge_init(&limiter->waiters, &limiter->new_waiters);
}
static inline struct data_vio *get_available_data_vio(struct data_vio_pool *pool)
diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c
index 57e87f0d706970..dd4fdee2ca0c5a 100644
--- a/drivers/md/dm-vdo/flush.c
+++ b/drivers/md/dm-vdo/flush.c
@@ -369,8 +369,7 @@ void vdo_dump_flusher(const struct flusher *flusher)
static void initialize_flush(struct vdo_flush *flush, struct vdo *vdo)
{
bio_list_init(&flush->bios);
- bio_list_merge(&flush->bios, &vdo->flusher->waiting_flush_bios);
- bio_list_init(&vdo->flusher->waiting_flush_bios);
+ bio_list_merge_init(&flush->bios, &vdo->flusher->waiting_flush_bios);
}
static void launch_flush(struct vdo_flush *flush)
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index eb9832b22b14e2..8e6bcb0d786a1a 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -60,16 +60,23 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
struct dm_table *map;
int srcu_idx, ret;
- if (dm_suspended_md(md))
- return -EAGAIN;
+ if (!md->zone_revalidate_map) {
+ /* Regular user context */
+ if (dm_suspended_md(md))
+ return -EAGAIN;
- map = dm_get_live_table(md, &srcu_idx);
- if (!map)
- return -EIO;
+ map = dm_get_live_table(md, &srcu_idx);
+ if (!map)
+ return -EIO;
+ } else {
+ /* Zone revalidation during __bind() */
+ map = md->zone_revalidate_map;
+ }
ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);
- dm_put_live_table(md, srcu_idx);
+ if (!md->zone_revalidate_map)
+ dm_put_live_table(md, srcu_idx);
return ret;
}
@@ -138,80 +145,47 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
}
}
-void dm_cleanup_zoned_dev(struct mapped_device *md)
+/*
+ * Count conventional zones of a mapped zoned device. If the device
+ * only has conventional zones, do not expose it as zoned.
+ */
+static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx,
+ void *data)
{
- if (md->disk) {
- bitmap_free(md->disk->conv_zones_bitmap);
- md->disk->conv_zones_bitmap = NULL;
- bitmap_free(md->disk->seq_zones_wlock);
- md->disk->seq_zones_wlock = NULL;
- }
+ unsigned int *nr_conv_zones = data;
- kvfree(md->zwp_offset);
- md->zwp_offset = NULL;
- md->nr_zones = 0;
-}
+ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+ (*nr_conv_zones)++;
-static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone)
-{
- switch (zone->cond) {
- case BLK_ZONE_COND_IMP_OPEN:
- case BLK_ZONE_COND_EXP_OPEN:
- case BLK_ZONE_COND_CLOSED:
- return zone->wp - zone->start;
- case BLK_ZONE_COND_FULL:
- return zone->len;
- case BLK_ZONE_COND_EMPTY:
- case BLK_ZONE_COND_NOT_WP:
- case BLK_ZONE_COND_OFFLINE:
- case BLK_ZONE_COND_READONLY:
- default:
- /*
- * Conventional, offline and read-only zones do not have a valid
- * write pointer. Use 0 as for an empty zone.
- */
- return 0;
- }
+ return 0;
}
-static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
- void *data)
+static int dm_check_zoned(struct mapped_device *md, struct dm_table *t)
{
- struct mapped_device *md = data;
struct gendisk *disk = md->disk;
+ unsigned int nr_conv_zones = 0;
+ int ret;
- switch (zone->type) {
- case BLK_ZONE_TYPE_CONVENTIONAL:
- if (!disk->conv_zones_bitmap) {
- disk->conv_zones_bitmap = bitmap_zalloc(disk->nr_zones,
- GFP_NOIO);
- if (!disk->conv_zones_bitmap)
- return -ENOMEM;
- }
- set_bit(idx, disk->conv_zones_bitmap);
- break;
- case BLK_ZONE_TYPE_SEQWRITE_REQ:
- case BLK_ZONE_TYPE_SEQWRITE_PREF:
- if (!disk->seq_zones_wlock) {
- disk->seq_zones_wlock = bitmap_zalloc(disk->nr_zones,
- GFP_NOIO);
- if (!disk->seq_zones_wlock)
- return -ENOMEM;
- }
- if (!md->zwp_offset) {
- md->zwp_offset =
- kvcalloc(disk->nr_zones, sizeof(unsigned int),
- GFP_KERNEL);
- if (!md->zwp_offset)
- return -ENOMEM;
- }
- md->zwp_offset[idx] = dm_get_zone_wp_offset(zone);
-
- break;
- default:
- DMERR("Invalid zone type 0x%x at sectors %llu",
- (int)zone->type, zone->start);
- return -ENODEV;
+ /* Count conventional zones */
+ md->zone_revalidate_map = t;
+ ret = dm_blk_report_zones(disk, 0, UINT_MAX,
+ dm_check_zoned_cb, &nr_conv_zones);
+ md->zone_revalidate_map = NULL;
+ if (ret < 0) {
+ DMERR("Check zoned failed %d", ret);
+ return ret;
+ }
+
+ /*
+ * If we only have conventional zones, expose the mapped device as
+ * a regular device.
+ */
+ if (nr_conv_zones >= ret) {
+ disk->queue->limits.max_open_zones = 0;
+ disk->queue->limits.max_active_zones = 0;
+ disk->queue->limits.zoned = false;
+ clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ disk->nr_zones = 0;
}
return 0;
@@ -226,41 +200,32 @@ static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
{
struct gendisk *disk = md->disk;
- unsigned int noio_flag;
int ret;
- /*
- * Check if something changed. If yes, cleanup the current resources
- * and reallocate everything.
- */
+ /* Revalidate only if something changed. */
if (!disk->nr_zones || disk->nr_zones != md->nr_zones)
- dm_cleanup_zoned_dev(md);
+ md->nr_zones = 0;
+
if (md->nr_zones)
return 0;
/*
- * Scan all zones to initialize everything. Ensure that all vmalloc
- * operations in this context are done as if GFP_NOIO was specified.
+ * Our table is not live yet. So the call to dm_get_live_table()
+ * in dm_blk_report_zones() will fail. Set a temporary pointer to
+ * our table for dm_blk_report_zones() to use directly.
*/
- noio_flag = memalloc_noio_save();
- ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones,
- dm_zone_revalidate_cb, md);
- memalloc_noio_restore(noio_flag);
- if (ret < 0)
- goto err;
- if (ret != disk->nr_zones) {
- ret = -EIO;
- goto err;
+ md->zone_revalidate_map = t;
+ ret = blk_revalidate_disk_zones(disk);
+ md->zone_revalidate_map = NULL;
+
+ if (ret) {
+ DMERR("Revalidate zones failed %d", ret);
+ return ret;
}
md->nr_zones = disk->nr_zones;
return 0;
-
-err:
- DMERR("Revalidate zones failed %d", ret);
- dm_cleanup_zoned_dev(md);
- return ret;
}
static int device_not_zone_append_capable(struct dm_target *ti,
@@ -289,294 +254,40 @@ static bool dm_table_supports_zone_append(struct dm_table *t)
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
{
struct mapped_device *md = t->md;
+ int ret;
/*
- * For a zoned target, the number of zones should be updated for the
- * correct value to be exposed in sysfs queue/nr_zones.
+ * Check if zone append is natively supported, and if not, set the
+ * mapped device queue as needing zone append emulation.
*/
WARN_ON_ONCE(queue_is_mq(q));
- md->disk->nr_zones = bdev_nr_zones(md->disk->part0);
-
- /* Check if zone append is natively supported */
if (dm_table_supports_zone_append(t)) {
clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
- dm_cleanup_zoned_dev(md);
- return 0;
+ } else {
+ set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ blk_queue_max_zone_append_sectors(q, 0);
}
- /*
- * Mark the mapped device as needing zone append emulation and
- * initialize the emulation resources once the capacity is set.
- */
- set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
if (!get_capacity(md->disk))
return 0;
- return dm_revalidate_zones(md, t);
-}
-
-static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
- void *data)
-{
- unsigned int *wp_offset = data;
-
- *wp_offset = dm_get_zone_wp_offset(zone);
-
- return 0;
-}
-
-static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno,
- unsigned int *wp_ofst)
-{
- sector_t sector = zno * bdev_zone_sectors(md->disk->part0);
- unsigned int noio_flag;
- struct dm_table *t;
- int srcu_idx, ret;
-
- t = dm_get_live_table(md, &srcu_idx);
- if (!t)
- return -EIO;
-
- /*
- * Ensure that all memory allocations in this context are done as if
- * GFP_NOIO was specified.
- */
- noio_flag = memalloc_noio_save();
- ret = dm_blk_do_report_zones(md, t, sector, 1,
- dm_update_zone_wp_offset_cb, wp_ofst);
- memalloc_noio_restore(noio_flag);
-
- dm_put_live_table(md, srcu_idx);
-
- if (ret != 1)
- return -EIO;
-
- return 0;
-}
-
-struct orig_bio_details {
- enum req_op op;
- unsigned int nr_sectors;
-};
-
-/*
- * First phase of BIO mapping for targets with zone append emulation:
- * check all BIO that change a zone writer pointer and change zone
- * append operations into regular write operations.
- */
-static bool dm_zone_map_bio_begin(struct mapped_device *md,
- unsigned int zno, struct bio *clone)
-{
- sector_t zsectors = bdev_zone_sectors(md->disk->part0);
- unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
-
- /*
- * If the target zone is in an error state, recover by inspecting the
- * zone to get its current write pointer position. Note that since the
- * target zone is already locked, a BIO issuing context should never
- * see the zone write in the DM_ZONE_UPDATING_WP_OFST state.
- */
- if (zwp_offset == DM_ZONE_INVALID_WP_OFST) {
- if (dm_update_zone_wp_offset(md, zno, &zwp_offset))
- return false;
- WRITE_ONCE(md->zwp_offset[zno], zwp_offset);
- }
-
- switch (bio_op(clone)) {
- case REQ_OP_ZONE_RESET:
- case REQ_OP_ZONE_FINISH:
- return true;
- case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE:
- /* Writes must be aligned to the zone write pointer */
- if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset)
- return false;
- break;
- case REQ_OP_ZONE_APPEND:
- /*
- * Change zone append operations into a non-mergeable regular
- * writes directed at the current write pointer position of the
- * target zone.
- */
- clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE |
- (clone->bi_opf & (~REQ_OP_MASK));
- clone->bi_iter.bi_sector += zwp_offset;
- break;
- default:
- DMWARN_LIMIT("Invalid BIO operation");
- return false;
- }
-
- /* Cannot write to a full zone */
- if (zwp_offset >= zsectors)
- return false;
-
- return true;
-}
-
-/*
- * Second phase of BIO mapping for targets with zone append emulation:
- * update the zone write pointer offset array to account for the additional
- * data written to a zone. Note that at this point, the remapped clone BIO
- * may already have completed, so we do not touch it.
- */
-static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int zno,
- struct orig_bio_details *orig_bio_details,
- unsigned int nr_sectors)
-{
- unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
-
- /* The clone BIO may already have been completed and failed */
- if (zwp_offset == DM_ZONE_INVALID_WP_OFST)
- return BLK_STS_IOERR;
-
- /* Update the zone wp offset */
- switch (orig_bio_details->op) {
- case REQ_OP_ZONE_RESET:
- WRITE_ONCE(md->zwp_offset[zno], 0);
- return BLK_STS_OK;
- case REQ_OP_ZONE_FINISH:
- WRITE_ONCE(md->zwp_offset[zno],
- bdev_zone_sectors(md->disk->part0));
- return BLK_STS_OK;
- case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE:
- WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
- return BLK_STS_OK;
- case REQ_OP_ZONE_APPEND:
- /*
- * Check that the target did not truncate the write operation
- * emulating a zone append.
- */
- if (nr_sectors != orig_bio_details->nr_sectors) {
- DMWARN_LIMIT("Truncated write for zone append");
- return BLK_STS_IOERR;
- }
- WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
- return BLK_STS_OK;
- default:
- DMWARN_LIMIT("Invalid BIO operation");
- return BLK_STS_IOERR;
- }
-}
-
-static inline void dm_zone_lock(struct gendisk *disk, unsigned int zno,
- struct bio *clone)
-{
- if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)))
- return;
-
- wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE);
- bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED);
-}
-
-static inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno,
- struct bio *clone)
-{
- if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
- return;
-
- WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock));
- clear_bit_unlock(zno, disk->seq_zones_wlock);
- smp_mb__after_atomic();
- wake_up_bit(disk->seq_zones_wlock, zno);
-
- bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED);
-}
-
-static bool dm_need_zone_wp_tracking(struct bio *bio)
-{
/*
- * Special processing is not needed for operations that do not need the
- * zone write lock, that is, all operations that target conventional
- * zones and all operations that do not modify directly a sequential
- * zone write pointer.
+ * Check that the mapped device will indeed be zoned, that is, that it
+ * has sequential write required zones.
*/
- if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
- return false;
- switch (bio_op(bio)) {
- case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE:
- case REQ_OP_ZONE_RESET:
- case REQ_OP_ZONE_FINISH:
- case REQ_OP_ZONE_APPEND:
- return bio_zone_is_seq(bio);
- default:
- return false;
- }
-}
-
-/*
- * Special IO mapping for targets needing zone append emulation.
- */
-int dm_zone_map_bio(struct dm_target_io *tio)
-{
- struct dm_io *io = tio->io;
- struct dm_target *ti = tio->ti;
- struct mapped_device *md = io->md;
- struct bio *clone = &tio->clone;
- struct orig_bio_details orig_bio_details;
- unsigned int zno;
- blk_status_t sts;
- int r;
-
- /*
- * IOs that do not change a zone write pointer do not need
- * any additional special processing.
- */
- if (!dm_need_zone_wp_tracking(clone))
- return ti->type->map(ti, clone);
-
- /* Lock the target zone */
- zno = bio_zone_no(clone);
- dm_zone_lock(md->disk, zno, clone);
-
- orig_bio_details.nr_sectors = bio_sectors(clone);
- orig_bio_details.op = bio_op(clone);
+ ret = dm_check_zoned(md, t);
+ if (ret)
+ return ret;
+ if (!blk_queue_is_zoned(q))
+ return 0;
- /*
- * Check that the bio and the target zone write pointer offset are
- * both valid, and if the bio is a zone append, remap it to a write.
- */
- if (!dm_zone_map_bio_begin(md, zno, clone)) {
- dm_zone_unlock(md->disk, zno, clone);
- return DM_MAPIO_KILL;
+ if (!md->disk->nr_zones) {
+ DMINFO("%s using %s zone append",
+ md->disk->disk_name,
+ queue_emulates_zone_append(q) ? "emulated" : "native");
}
- /* Let the target do its work */
- r = ti->type->map(ti, clone);
- switch (r) {
- case DM_MAPIO_SUBMITTED:
- /*
- * The target submitted the clone BIO. The target zone will
- * be unlocked on completion of the clone.
- */
- sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
- *tio->len_ptr);
- break;
- case DM_MAPIO_REMAPPED:
- /*
- * The target only remapped the clone BIO. In case of error,
- * unlock the target zone here as the clone will not be
- * submitted.
- */
- sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
- *tio->len_ptr);
- if (sts != BLK_STS_OK)
- dm_zone_unlock(md->disk, zno, clone);
- break;
- case DM_MAPIO_REQUEUE:
- case DM_MAPIO_KILL:
- default:
- dm_zone_unlock(md->disk, zno, clone);
- sts = BLK_STS_IOERR;
- break;
- }
-
- if (sts != BLK_STS_OK)
- return DM_MAPIO_KILL;
-
- return r;
+ return dm_revalidate_zones(md, t);
}
/*
@@ -587,61 +298,17 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone)
struct mapped_device *md = io->md;
struct gendisk *disk = md->disk;
struct bio *orig_bio = io->orig_bio;
- unsigned int zwp_offset;
- unsigned int zno;
/*
- * For targets that do not emulate zone append, we only need to
- * handle native zone-append bios.
+ * Get the offset within the zone of the written sector
+ * and add that to the original bio sector position.
*/
- if (!dm_emulate_zone_append(md)) {
- /*
- * Get the offset within the zone of the written sector
- * and add that to the original bio sector position.
- */
- if (clone->bi_status == BLK_STS_OK &&
- bio_op(clone) == REQ_OP_ZONE_APPEND) {
- sector_t mask =
- (sector_t)bdev_zone_sectors(disk->part0) - 1;
-
- orig_bio->bi_iter.bi_sector +=
- clone->bi_iter.bi_sector & mask;
- }
-
- return;
- }
+ if (clone->bi_status == BLK_STS_OK &&
+ bio_op(clone) == REQ_OP_ZONE_APPEND) {
+ sector_t mask = bdev_zone_sectors(disk->part0) - 1;
- /*
- * For targets that do emulate zone append, if the clone BIO does not
- * own the target zone write lock, we have nothing to do.
- */
- if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
- return;
-
- zno = bio_zone_no(orig_bio);
-
- if (clone->bi_status != BLK_STS_OK) {
- /*
- * BIOs that modify a zone write pointer may leave the zone
- * in an unknown state in case of failure (e.g. the write
- * pointer was only partially advanced). In this case, set
- * the target zone write pointer as invalid unless it is
- * already being updated.
- */
- WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST);
- } else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
- /*
- * Get the written sector for zone append operation that were
- * emulated using regular write operations.
- */
- zwp_offset = READ_ONCE(md->zwp_offset[zno]);
- if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio)))
- WRITE_ONCE(md->zwp_offset[zno],
- DM_ZONE_INVALID_WP_OFST);
- else
- orig_bio->bi_iter.bi_sector +=
- zwp_offset - bio_sectors(orig_bio);
+ orig_bio->bi_iter.bi_sector += clone->bi_iter.bi_sector & mask;
}
- dm_zone_unlock(disk, zno, clone);
+ return;
}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7d0746b37c8ec7..597dd7a258234a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1428,25 +1428,12 @@ static void __map_bio(struct bio *clone)
down(&md->swap_bios_semaphore);
}
- if (static_branch_unlikely(&zoned_enabled)) {
- /*
- * Check if the IO needs a special mapping due to zone append
- * emulation on zoned target. In this case, dm_zone_map_bio()
- * calls the target map operation.
- */
- if (unlikely(dm_emulate_zone_append(md)))
- r = dm_zone_map_bio(tio);
- else
- goto do_map;
- } else {
-do_map:
- if (likely(ti->type->map == linear_map))
- r = linear_map(ti, clone);
- else if (ti->type->map == stripe_map)
- r = stripe_map(ti, clone);
- else
- r = ti->type->map(ti, clone);
- }
+ if (likely(ti->type->map == linear_map))
+ r = linear_map(ti, clone);
+ else if (ti->type->map == stripe_map)
+ r = stripe_map(ti, clone);
+ else
+ r = ti->type->map(ti, clone);
switch (r) {
case DM_MAPIO_SUBMITTED:
@@ -1774,6 +1761,33 @@ static void init_clone_info(struct clone_info *ci, struct dm_io *io,
ci->sector_count = 0;
}
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
+ struct bio *bio)
+{
+ /*
+ * For mapped device that need zone append emulation, we must
+ * split any large BIO that straddles zone boundaries.
+ */
+ return dm_emulate_zone_append(md) && bio_straddles_zones(bio) &&
+ !bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
+}
+static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
+{
+ return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0);
+}
+#else
+static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
+ struct bio *bio)
+{
+ return false;
+}
+static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
+{
+ return false;
+}
+#endif
+
/*
* Entry point to split a bio into clones and submit them to the targets.
*/
@@ -1783,19 +1797,32 @@ static void dm_split_and_process_bio(struct mapped_device *md,
struct clone_info ci;
struct dm_io *io;
blk_status_t error = BLK_STS_OK;
- bool is_abnormal;
+ bool is_abnormal, need_split;
+
+ need_split = is_abnormal = is_abnormal_io(bio);
+ if (static_branch_unlikely(&zoned_enabled))
+ need_split = is_abnormal || dm_zone_bio_needs_split(md, bio);
- is_abnormal = is_abnormal_io(bio);
- if (unlikely(is_abnormal)) {
+ if (unlikely(need_split)) {
/*
* Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
* otherwise associated queue_limits won't be imposed.
+ * Also split the BIO for mapped devices needing zone append
+ * emulation to ensure that the BIO does not cross zone
+ * boundaries.
*/
bio = bio_split_to_limits(bio);
if (!bio)
return;
}
+ /*
+ * Use the block layer zone write plugging for mapped devices that
+ * need zone append emulation (e.g. dm-crypt).
+ */
+ if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio))
+ return;
+
/* Only support nowait for normal IO */
if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) {
io = alloc_io(md, bio, GFP_NOWAIT);
@@ -2016,7 +2043,6 @@ static void cleanup_mapped_device(struct mapped_device *md)
md->dax_dev = NULL;
}
- dm_cleanup_zoned_dev(md);
if (md->disk) {
spin_lock(&_minor_lock);
md->disk->private_data = NULL;
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 7f1acbf6bd9ec4..e0c57f19839b29 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -104,13 +104,11 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q);
void dm_zone_endio(struct dm_io *io, struct bio *clone);
#ifdef CONFIG_BLK_DEV_ZONED
-void dm_cleanup_zoned_dev(struct mapped_device *md);
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
bool dm_is_zone_write(struct mapped_device *md, struct bio *bio);
int dm_zone_map_bio(struct dm_target_io *io);
#else
-static inline void dm_cleanup_zoned_dev(struct mapped_device *md) {}
#define dm_blk_report_zones NULL
static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
{
diff --git a/drivers/md/md.c b/drivers/md/md.c
index e575e74aabf5ef..00bbafcd27bbdd 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8087,7 +8087,8 @@ void md_wakeup_thread(struct md_thread __rcu *thread)
if (t) {
pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
set_bit(THREAD_WAKEUP, &t->flags);
- wake_up(&t->wqueue);
+ if (wq_has_sleeper(&t->wqueue))
+ wake_up(&t->wqueue);
}
rcu_read_unlock();
}
@@ -8576,14 +8577,19 @@ static int is_mddev_idle(struct mddev *mddev, int init)
{
struct md_rdev *rdev;
int idle;
- int curr_events;
+ long long curr_events;
idle = 1;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
struct gendisk *disk = rdev->bdev->bd_disk;
- curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
- atomic_read(&disk->sync_io);
+
+ if (!init && !blk_queue_io_stat(disk->queue))
+ continue;
+
+ curr_events =
+ (long long)part_stat_read_accum(disk->part0, sectors) -
+ atomic64_read(&disk->sync_io);
/* sync IO will cause sync_io to increase before the disk_stats
* as sync_io is counted when a request starts, and
* disk_stats is counted when it completes.
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 097d9dbd69b836..029dd0491a3661 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -51,7 +51,7 @@ struct md_rdev {
sector_t sectors; /* Device size (in 512bytes sectors) */
struct mddev *mddev; /* RAID array if running */
- int last_events; /* IO event timestamp */
+ long long last_events; /* IO event timestamp */
/*
* If meta_bdev is non-NULL, it means that a separate device is
@@ -621,7 +621,8 @@ extern void mddev_unlock(struct mddev *mddev);
static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
- atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
+ if (blk_queue_io_stat(bdev->bd_disk->queue))
+ atomic64_add(nr_sectors, &bdev->bd_disk->sync_io);
}
static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d874abfc18364e..2bd1ce9b39226a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -36,7 +36,6 @@
*/
#include <linux/blkdev.h>
-#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
@@ -6734,6 +6733,9 @@ static void raid5d(struct md_thread *thread)
int batch_size, released;
unsigned int offset;
+ if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
+ break;
+
released = release_stripe_list(conf, conf->temp_inactive_list);
if (released)
clear_bit(R5_DID_ALLOC, &conf->cache_state);
@@ -6770,18 +6772,7 @@ static void raid5d(struct md_thread *thread)
spin_unlock_irq(&conf->device_lock);
md_check_recovery(mddev);
spin_lock_irq(&conf->device_lock);
-
- /*
- * Waiting on MD_SB_CHANGE_PENDING below may deadlock
- * seeing md_check_recovery() is needed to clear
- * the flag when using mdmon.
- */
- continue;
}
-
- wait_event_lock_irq(mddev->sb_wait,
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
- conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 27281a9a8951db..8ae0a2dc5eda71 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2153,7 +2153,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
blk_mq_unfreeze_queue(ns->disk->queue);
if (blk_queue_is_zoned(ns->queue)) {
- ret = blk_revalidate_disk_zones(ns->disk, NULL);
+ ret = blk_revalidate_disk_zones(ns->disk);
if (ret && !nvme_first_scan(ns->disk))
goto out;
}
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 3dfd5ae99ae05e..499a8bb7cac7d1 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -423,13 +423,20 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
/*
- * For iopoll, complete it directly.
+ * For iopoll, complete it directly. Note that using the uring_cmd
+ * helper for this is safe only because we check blk_rq_is_poll().
+ * As that returns false if we're NOT on a polled queue, then it's
+ * safe to use the polled completion helper.
+ *
* Otherwise, move the completion to task work.
*/
- if (blk_rq_is_poll(req))
- nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED);
- else
+ if (blk_rq_is_poll(req)) {
+ if (pdu->bio)
+ blk_rq_unmap_user(pdu->bio);
+ io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status);
+ } else {
io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+ }
return RQ_END_IO_FREE;
}
diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
index 3148d9f1bde66a..0021d06041c1e5 100644
--- a/drivers/nvme/target/zns.c
+++ b/drivers/nvme/target/zns.c
@@ -52,14 +52,10 @@ bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
if (get_capacity(bd_disk) & (bdev_zone_sectors(ns->bdev) - 1))
return false;
/*
- * ZNS does not define a conventional zone type. If the underlying
- * device has a bitmap set indicating the existence of conventional
- * zones, reject the device. Otherwise, use report zones to detect if
- * the device has conventional zones.
+ * ZNS does not define a conventional zone type. Use report zones
+ * to detect if the device has conventional zones and reject it if
+ * it does.
*/
- if (ns->bdev->bd_disk->conv_zones_bitmap)
- return false;
-
ret = blkdev_report_zones(ns->bdev, 0, bdev_nr_zones(ns->bdev),
validate_conv_zones_cb, NULL);
if (ret < 0)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 5b3230ef51fe61..967b6d62bb37e1 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1869,7 +1869,6 @@ out_put_budget:
case BLK_STS_OK:
break;
case BLK_STS_RESOURCE:
- case BLK_STS_ZONE_RESOURCE:
if (scsi_device_blocked(sdev))
ret = BLK_STS_DEV_RESOURCE;
break;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 65cdc8b77e3585..64c5129044b305 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1260,12 +1260,6 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd)
}
}
- if (req_op(rq) == REQ_OP_ZONE_APPEND) {
- ret = sd_zbc_prepare_zone_append(cmd, &lba, nr_blocks);
- if (ret)
- goto fail;
- }
-
fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0;
dix = scsi_prot_sg_count(cmd);
dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type);
@@ -1348,7 +1342,6 @@ static blk_status_t sd_init_command(struct scsi_cmnd *cmd)
return sd_setup_flush_cmnd(cmd);
case REQ_OP_READ:
case REQ_OP_WRITE:
- case REQ_OP_ZONE_APPEND:
return sd_setup_read_write_cmnd(cmd);
case REQ_OP_ZONE_RESET:
return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
@@ -3981,7 +3974,6 @@ static void scsi_disk_release(struct device *dev)
struct scsi_disk *sdkp = to_scsi_disk(dev);
ida_free(&sd_index_ida, sdkp->index);
- sd_zbc_free_zone_info(sdkp);
put_device(&sdkp->device->sdev_gendev);
free_opal_dev(sdkp->opal_dev);
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 5c4285a582b220..49dd600bfa4825 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -104,12 +104,6 @@ struct scsi_disk {
* between zone starting LBAs is constant.
*/
u32 zone_starting_lba_gran;
- u32 *zones_wp_offset;
- spinlock_t zones_wp_offset_lock;
- u32 *rev_wp_offset;
- struct mutex rev_mutex;
- struct work_struct zone_wp_offset_work;
- char *zone_wp_update_buf;
#endif
atomic_t openers;
sector_t capacity; /* size in logical blocks */
@@ -245,7 +239,6 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp)
#ifdef CONFIG_BLK_DEV_ZONED
-void sd_zbc_free_zone_info(struct scsi_disk *sdkp);
int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]);
int sd_zbc_revalidate_zones(struct scsi_disk *sdkp);
blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd,
@@ -255,13 +248,8 @@ unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
-blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba,
- unsigned int nr_blocks);
-
#else /* CONFIG_BLK_DEV_ZONED */
-static inline void sd_zbc_free_zone_info(struct scsi_disk *sdkp) {}
-
static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE])
{
return 0;
@@ -285,13 +273,6 @@ static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd,
return good_bytes;
}
-static inline blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd,
- sector_t *lba,
- unsigned int nr_blocks)
-{
- return BLK_STS_TARGET;
-}
-
#define sd_zbc_report_zones NULL
#endif /* CONFIG_BLK_DEV_ZONED */
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 26af5ab7d7c173..806036e48abeda 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -23,36 +23,6 @@
#define CREATE_TRACE_POINTS
#include "sd_trace.h"
-/**
- * sd_zbc_get_zone_wp_offset - Get zone write pointer offset.
- * @zone: Zone for which to return the write pointer offset.
- *
- * Return: offset of the write pointer from the start of the zone.
- */
-static unsigned int sd_zbc_get_zone_wp_offset(struct blk_zone *zone)
-{
- if (zone->type == ZBC_ZONE_TYPE_CONV)
- return 0;
-
- switch (zone->cond) {
- case BLK_ZONE_COND_IMP_OPEN:
- case BLK_ZONE_COND_EXP_OPEN:
- case BLK_ZONE_COND_CLOSED:
- return zone->wp - zone->start;
- case BLK_ZONE_COND_FULL:
- return zone->len;
- case BLK_ZONE_COND_EMPTY:
- case BLK_ZONE_COND_OFFLINE:
- case BLK_ZONE_COND_READONLY:
- default:
- /*
- * Offline and read-only zones do not have a valid
- * write pointer. Use 0 as for an empty zone.
- */
- return 0;
- }
-}
-
/* Whether or not a SCSI zone descriptor describes a gap zone. */
static bool sd_zbc_is_gap_zone(const u8 buf[64])
{
@@ -121,9 +91,6 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64],
if (ret)
return ret;
- if (sdkp->rev_wp_offset)
- sdkp->rev_wp_offset[idx] = sd_zbc_get_zone_wp_offset(&zone);
-
return 0;
}
@@ -347,123 +314,6 @@ static blk_status_t sd_zbc_cmnd_checks(struct scsi_cmnd *cmd)
return BLK_STS_OK;
}
-#define SD_ZBC_INVALID_WP_OFST (~0u)
-#define SD_ZBC_UPDATING_WP_OFST (SD_ZBC_INVALID_WP_OFST - 1)
-
-static int sd_zbc_update_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
- void *data)
-{
- struct scsi_disk *sdkp = data;
-
- lockdep_assert_held(&sdkp->zones_wp_offset_lock);
-
- sdkp->zones_wp_offset[idx] = sd_zbc_get_zone_wp_offset(zone);
-
- return 0;
-}
-
-/*
- * An attempt to append a zone triggered an invalid write pointer error.
- * Reread the write pointer of the zone(s) in which the append failed.
- */
-static void sd_zbc_update_wp_offset_workfn(struct work_struct *work)
-{
- struct scsi_disk *sdkp;
- unsigned long flags;
- sector_t zno;
- int ret;
-
- sdkp = container_of(work, struct scsi_disk, zone_wp_offset_work);
-
- spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);
- for (zno = 0; zno < sdkp->zone_info.nr_zones; zno++) {
- if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST)
- continue;
-
- spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
- ret = sd_zbc_do_report_zones(sdkp, sdkp->zone_wp_update_buf,
- SD_BUF_SIZE,
- zno * sdkp->zone_info.zone_blocks, true);
- spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);
- if (!ret)
- sd_zbc_parse_report(sdkp, sdkp->zone_wp_update_buf + 64,
- zno, sd_zbc_update_wp_offset_cb,
- sdkp);
- }
- spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
-
- scsi_device_put(sdkp->device);
-}
-
-/**
- * sd_zbc_prepare_zone_append() - Prepare an emulated ZONE_APPEND command.
- * @cmd: the command to setup
- * @lba: the LBA to patch
- * @nr_blocks: the number of LBAs to be written
- *
- * Called from sd_setup_read_write_cmnd() for REQ_OP_ZONE_APPEND.
- * @sd_zbc_prepare_zone_append() handles the necessary zone wrote locking and
- * patching of the lba for an emulated ZONE_APPEND command.
- *
- * In case the cached write pointer offset is %SD_ZBC_INVALID_WP_OFST it will
- * schedule a REPORT ZONES command and return BLK_STS_IOERR.
- */
-blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba,
- unsigned int nr_blocks)
-{
- struct request *rq = scsi_cmd_to_rq(cmd);
- struct scsi_disk *sdkp = scsi_disk(rq->q->disk);
- unsigned int wp_offset, zno = blk_rq_zone_no(rq);
- unsigned long flags;
- blk_status_t ret;
-
- ret = sd_zbc_cmnd_checks(cmd);
- if (ret != BLK_STS_OK)
- return ret;
-
- if (!blk_rq_zone_is_seq(rq))
- return BLK_STS_IOERR;
-
- /* Unlock of the write lock will happen in sd_zbc_complete() */
- if (!blk_req_zone_write_trylock(rq))
- return BLK_STS_ZONE_RESOURCE;
-
- spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);
- wp_offset = sdkp->zones_wp_offset[zno];
- switch (wp_offset) {
- case SD_ZBC_INVALID_WP_OFST:
- /*
- * We are about to schedule work to update a zone write pointer
- * offset, which will cause the zone append command to be
- * requeued. So make sure that the scsi device does not go away
- * while the work is being processed.
- */
- if (scsi_device_get(sdkp->device)) {
- ret = BLK_STS_IOERR;
- break;
- }
- sdkp->zones_wp_offset[zno] = SD_ZBC_UPDATING_WP_OFST;
- schedule_work(&sdkp->zone_wp_offset_work);
- fallthrough;
- case SD_ZBC_UPDATING_WP_OFST:
- ret = BLK_STS_DEV_RESOURCE;
- break;
- default:
- wp_offset = sectors_to_logical(sdkp->device, wp_offset);
- if (wp_offset + nr_blocks > sdkp->zone_info.zone_blocks) {
- ret = BLK_STS_IOERR;
- break;
- }
-
- trace_scsi_prepare_zone_append(cmd, *lba, wp_offset);
- *lba += wp_offset;
- }
- spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
- if (ret)
- blk_req_zone_write_unlock(rq);
- return ret;
-}
-
/**
* sd_zbc_setup_zone_mgmt_cmnd - Prepare a zone ZBC_OUT command. The operations
* can be RESET WRITE POINTER, OPEN, CLOSE or FINISH.
@@ -504,96 +354,6 @@ blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd,
return BLK_STS_OK;
}
-static bool sd_zbc_need_zone_wp_update(struct request *rq)
-{
- switch (req_op(rq)) {
- case REQ_OP_ZONE_APPEND:
- case REQ_OP_ZONE_FINISH:
- case REQ_OP_ZONE_RESET:
- case REQ_OP_ZONE_RESET_ALL:
- return true;
- case REQ_OP_WRITE:
- case REQ_OP_WRITE_ZEROES:
- return blk_rq_zone_is_seq(rq);
- default:
- return false;
- }
-}
-
-/**
- * sd_zbc_zone_wp_update - Update cached zone write pointer upon cmd completion
- * @cmd: Completed command
- * @good_bytes: Command reply bytes
- *
- * Called from sd_zbc_complete() to handle the update of the cached zone write
- * pointer value in case an update is needed.
- */
-static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd,
- unsigned int good_bytes)
-{
- int result = cmd->result;
- struct request *rq = scsi_cmd_to_rq(cmd);
- struct scsi_disk *sdkp = scsi_disk(rq->q->disk);
- unsigned int zno = blk_rq_zone_no(rq);
- enum req_op op = req_op(rq);
- unsigned long flags;
-
- /*
- * If we got an error for a command that needs updating the write
- * pointer offset cache, we must mark the zone wp offset entry as
- * invalid to force an update from disk the next time a zone append
- * command is issued.
- */
- spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);
-
- if (result && op != REQ_OP_ZONE_RESET_ALL) {
- if (op == REQ_OP_ZONE_APPEND) {
- /* Force complete completion (no retry) */
- good_bytes = 0;
- scsi_set_resid(cmd, blk_rq_bytes(rq));
- }
-
- /*
- * Force an update of the zone write pointer offset on
- * the next zone append access.
- */
- if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST)
- sdkp->zones_wp_offset[zno] = SD_ZBC_INVALID_WP_OFST;
- goto unlock_wp_offset;
- }
-
- switch (op) {
- case REQ_OP_ZONE_APPEND:
- trace_scsi_zone_wp_update(cmd, rq->__sector,
- sdkp->zones_wp_offset[zno], good_bytes);
- rq->__sector += sdkp->zones_wp_offset[zno];
- fallthrough;
- case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE:
- if (sdkp->zones_wp_offset[zno] < sd_zbc_zone_sectors(sdkp))
- sdkp->zones_wp_offset[zno] +=
- good_bytes >> SECTOR_SHIFT;
- break;
- case REQ_OP_ZONE_RESET:
- sdkp->zones_wp_offset[zno] = 0;
- break;
- case REQ_OP_ZONE_FINISH:
- sdkp->zones_wp_offset[zno] = sd_zbc_zone_sectors(sdkp);
- break;
- case REQ_OP_ZONE_RESET_ALL:
- memset(sdkp->zones_wp_offset, 0,
- sdkp->zone_info.nr_zones * sizeof(unsigned int));
- break;
- default:
- break;
- }
-
-unlock_wp_offset:
- spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
-
- return good_bytes;
-}
-
/**
* sd_zbc_complete - ZBC command post processing.
* @cmd: Completed command
@@ -619,11 +379,7 @@ unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
* so be quiet about the error.
*/
rq->rq_flags |= RQF_QUIET;
- } else if (sd_zbc_need_zone_wp_update(rq))
- good_bytes = sd_zbc_zone_wp_update(cmd, good_bytes);
-
- if (req_op(rq) == REQ_OP_ZONE_APPEND)
- blk_req_zone_write_unlock(rq);
+ }
return good_bytes;
}
@@ -780,46 +536,6 @@ static void sd_zbc_print_zones(struct scsi_disk *sdkp)
sdkp->zone_info.zone_blocks);
}
-static int sd_zbc_init_disk(struct scsi_disk *sdkp)
-{
- sdkp->zones_wp_offset = NULL;
- spin_lock_init(&sdkp->zones_wp_offset_lock);
- sdkp->rev_wp_offset = NULL;
- mutex_init(&sdkp->rev_mutex);
- INIT_WORK(&sdkp->zone_wp_offset_work, sd_zbc_update_wp_offset_workfn);
- sdkp->zone_wp_update_buf = kzalloc(SD_BUF_SIZE, GFP_KERNEL);
- if (!sdkp->zone_wp_update_buf)
- return -ENOMEM;
-
- return 0;
-}
-
-void sd_zbc_free_zone_info(struct scsi_disk *sdkp)
-{
- if (!sdkp->zone_wp_update_buf)
- return;
-
- /* Serialize against revalidate zones */
- mutex_lock(&sdkp->rev_mutex);
-
- kvfree(sdkp->zones_wp_offset);
- sdkp->zones_wp_offset = NULL;
- kfree(sdkp->zone_wp_update_buf);
- sdkp->zone_wp_update_buf = NULL;
-
- sdkp->early_zone_info = (struct zoned_disk_info){ };
- sdkp->zone_info = (struct zoned_disk_info){ };
-
- mutex_unlock(&sdkp->rev_mutex);
-}
-
-static void sd_zbc_revalidate_zones_cb(struct gendisk *disk)
-{
- struct scsi_disk *sdkp = scsi_disk(disk);
-
- swap(sdkp->zones_wp_offset, sdkp->rev_wp_offset);
-}
-
/*
* Call blk_revalidate_disk_zones() if any of the zoned disk properties have
* changed that make it necessary to call that function. Called by
@@ -831,18 +547,8 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
struct request_queue *q = disk->queue;
u32 zone_blocks = sdkp->early_zone_info.zone_blocks;
unsigned int nr_zones = sdkp->early_zone_info.nr_zones;
- int ret = 0;
unsigned int flags;
-
- /*
- * For all zoned disks, initialize zone append emulation data if not
- * already done.
- */
- if (sd_is_zoned(sdkp) && !sdkp->zone_wp_update_buf) {
- ret = sd_zbc_init_disk(sdkp);
- if (ret)
- return ret;
- }
+ int ret;
/*
* There is nothing to do for regular disks, including host-aware disks
@@ -851,50 +557,32 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
if (!blk_queue_is_zoned(q))
return 0;
- /*
- * Make sure revalidate zones are serialized to ensure exclusive
- * updates of the scsi disk data.
- */
- mutex_lock(&sdkp->rev_mutex);
-
if (sdkp->zone_info.zone_blocks == zone_blocks &&
sdkp->zone_info.nr_zones == nr_zones &&
disk->nr_zones == nr_zones)
- goto unlock;
+ return 0;
- flags = memalloc_noio_save();
sdkp->zone_info.zone_blocks = zone_blocks;
sdkp->zone_info.nr_zones = nr_zones;
- sdkp->rev_wp_offset = kvcalloc(nr_zones, sizeof(u32), GFP_KERNEL);
- if (!sdkp->rev_wp_offset) {
- ret = -ENOMEM;
- memalloc_noio_restore(flags);
- goto unlock;
- }
blk_queue_chunk_sectors(q,
logical_to_sectors(sdkp->device, zone_blocks));
- blk_queue_max_zone_append_sectors(q,
- q->limits.max_segments << PAGE_SECTORS_SHIFT);
- ret = blk_revalidate_disk_zones(disk, sd_zbc_revalidate_zones_cb);
+ /* Enable block layer zone append emulation */
+ blk_queue_max_zone_append_sectors(q, 0);
+ flags = memalloc_noio_save();
+ ret = blk_revalidate_disk_zones(disk);
memalloc_noio_restore(flags);
- kvfree(sdkp->rev_wp_offset);
- sdkp->rev_wp_offset = NULL;
-
if (ret) {
sdkp->zone_info = (struct zoned_disk_info){ };
sdkp->capacity = 0;
- goto unlock;
+ return ret;
}
sd_zbc_print_zones(sdkp);
-unlock:
- mutex_unlock(&sdkp->rev_mutex);
-
- return ret;
+ return 0;
}
/**
@@ -917,10 +605,8 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE])
if (!sd_is_zoned(sdkp)) {
/*
* Device managed or normal SCSI disk, no special handling
- * required. Nevertheless, free the disk zone information in
- * case the device type changed.
+ * required.
*/
- sd_zbc_free_zone_info(sdkp);
return 0;
}
@@ -941,7 +627,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE])
/* The drive satisfies the kernel restrictions: set it up */
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
- blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
if (sdkp->zones_max_open == U32_MAX)
disk_set_max_open_zones(disk, 0);
else
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 6f4a9cfeea44a3..831fac45e70f7d 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -331,12 +331,11 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
static void merge_rbio(struct btrfs_raid_bio *dest,
struct btrfs_raid_bio *victim)
{
- bio_list_merge(&dest->bio_list, &victim->bio_list);
+ bio_list_merge_init(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
/* Also inherit the bitmaps from @victim. */
bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
dest->stripe_nsectors);
- bio_list_init(&victim->bio_list);
}
/*
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 875d792bffff82..9b8a369f44bc6b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -615,6 +615,13 @@ static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
bl->tail = bl2->tail;
}
+static inline void bio_list_merge_init(struct bio_list *bl,
+ struct bio_list *bl2)
+{
+ bio_list_merge(bl, bl2);
+ bio_list_init(bl2);
+}
+
static inline void bio_list_merge_head(struct bio_list *bl,
struct bio_list *bl2)
{
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d3d8fd8e229b61..89ba6b16fe8b10 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -54,8 +54,8 @@ typedef __u32 __bitwise req_flags_t;
/* Look at ->special_vec for the actual data payload instead of the
bio chain. */
#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18))
-/* The per-zone write lock is held for this request */
-#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19))
+/* The request completion needs to be signaled to zone write pluging. */
+#define RQF_ZONE_WRITE_PLUGGING ((__force req_flags_t)(1 << 20))
/* ->timeout has been called, don't expire again */
#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21))
#define RQF_RESV ((__force req_flags_t)(1 << 23))
@@ -1150,85 +1150,4 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
}
void blk_dump_rq_flags(struct request *, char *);
-#ifdef CONFIG_BLK_DEV_ZONED
-static inline unsigned int blk_rq_zone_no(struct request *rq)
-{
- return disk_zone_no(rq->q->disk, blk_rq_pos(rq));
-}
-
-static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
-{
- return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
-}
-
-/**
- * blk_rq_is_seq_zoned_write() - Check if @rq requires write serialization.
- * @rq: Request to examine.
- *
- * Note: REQ_OP_ZONE_APPEND requests do not require serialization.
- */
-static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
-{
- return op_needs_zoned_write_locking(req_op(rq)) &&
- blk_rq_zone_is_seq(rq);
-}
-
-bool blk_req_needs_zone_write_lock(struct request *rq);
-bool blk_req_zone_write_trylock(struct request *rq);
-void __blk_req_zone_write_lock(struct request *rq);
-void __blk_req_zone_write_unlock(struct request *rq);
-
-static inline void blk_req_zone_write_lock(struct request *rq)
-{
- if (blk_req_needs_zone_write_lock(rq))
- __blk_req_zone_write_lock(rq);
-}
-
-static inline void blk_req_zone_write_unlock(struct request *rq)
-{
- if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
- __blk_req_zone_write_unlock(rq);
-}
-
-static inline bool blk_req_zone_is_write_locked(struct request *rq)
-{
- return rq->q->disk->seq_zones_wlock &&
- test_bit(blk_rq_zone_no(rq), rq->q->disk->seq_zones_wlock);
-}
-
-static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
-{
- if (!blk_req_needs_zone_write_lock(rq))
- return true;
- return !blk_req_zone_is_write_locked(rq);
-}
-#else /* CONFIG_BLK_DEV_ZONED */
-static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
-{
- return false;
-}
-
-static inline bool blk_req_needs_zone_write_lock(struct request *rq)
-{
- return false;
-}
-
-static inline void blk_req_zone_write_lock(struct request *rq)
-{
-}
-
-static inline void blk_req_zone_write_unlock(struct request *rq)
-{
-}
-static inline bool blk_req_zone_is_write_locked(struct request *rq)
-{
- return false;
-}
-
-static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
-{
- return true;
-}
-#endif /* CONFIG_BLK_DEV_ZONED */
-
#endif /* BLK_MQ_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6438c75cbb358f..e8f30ab600088f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -138,25 +138,13 @@ typedef u16 blk_short_t;
#define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13)
/*
- * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone
- * related resources are unavailable, but the driver can guarantee the queue
- * will be rerun in the future once the resources become available again.
- *
- * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references
- * a zone specific resource and IO to a different zone on the same device could
- * still be served. Examples of that are zones that are write-locked, but a read
- * to the same zone could be served.
- */
-#define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14)
-
-/*
* BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion
* path if the device returns a status indicating that too many zone resources
* are currently open. The same command should be successful if resubmitted
* after the number of open zones decreases below the device's limits, which is
* reported in the request_queue's max_open_zones.
*/
-#define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)15)
+#define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)14)
/*
* BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion
@@ -165,20 +153,20 @@ typedef u16 blk_short_t;
* after the number of active zones decreases below the device's limits, which
* is reported in the request_queue's max_active_zones.
*/
-#define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16)
+#define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)15)
/*
* BLK_STS_OFFLINE is returned from the driver when the target device is offline
* or is being taken offline. This could help differentiate the case where a
* device is intentionally being shut down from a real I/O error.
*/
-#define BLK_STS_OFFLINE ((__force blk_status_t)17)
+#define BLK_STS_OFFLINE ((__force blk_status_t)16)
/*
* BLK_STS_DURATION_LIMIT is returned from the driver when the target device
* aborted the command because it exceeded one of its Command Duration Limits.
*/
-#define BLK_STS_DURATION_LIMIT ((__force blk_status_t)18)
+#define BLK_STS_DURATION_LIMIT ((__force blk_status_t)17)
/**
* blk_path_error - returns true if error may be path related
@@ -235,7 +223,12 @@ struct bio {
struct bvec_iter bi_iter;
- blk_qc_t bi_cookie;
+ union {
+ /* for polled bios: */
+ blk_qc_t bi_cookie;
+ /* for plugged zoned writes only: */
+ unsigned int __bi_nr_segments;
+ };
bio_end_io_t *bi_end_io;
void *bi_private;
#ifdef CONFIG_BLK_CGROUP
@@ -305,7 +298,8 @@ enum {
BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */
BIO_QOS_MERGED, /* but went through rq_qos merge path */
BIO_REMAPPED,
- BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
+ BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
+ BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
BIO_FLAG_LAST
};
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d020541cd33e1c..c176b05901f78d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -174,27 +174,26 @@ struct gendisk {
struct list_head slave_bdevs;
#endif
struct timer_rand_state *random;
- atomic_t sync_io; /* RAID */
+ atomic64_t sync_io; /* RAID */
struct disk_events *ev;
#ifdef CONFIG_BLK_DEV_ZONED
/*
- * Zoned block device information for request dispatch control.
- * nr_zones is the total number of zones of the device. This is always
- * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones
- * bits which indicates if a zone is conventional (bit set) or
- * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones
- * bits which indicates if a zone is write locked, that is, if a write
- * request targeting the zone was dispatched.
- *
- * Reads of this information must be protected with blk_queue_enter() /
- * blk_queue_exit(). Modifying this information is only allowed while
- * no requests are being processed. See also blk_mq_freeze_queue() and
- * blk_mq_unfreeze_queue().
+ * Zoned block device information. Reads of this information must be
+ * protected with blk_queue_enter() / blk_queue_exit(). Modifying this
+ * information is only allowed while no requests are being processed.
+ * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue().
*/
unsigned int nr_zones;
+ unsigned int zone_capacity;
unsigned long *conv_zones_bitmap;
- unsigned long *seq_zones_wlock;
+ unsigned int zone_wplugs_hash_bits;
+ spinlock_t zone_wplugs_lock;
+ struct mempool_s *zone_wplugs_pool;
+ struct hlist_head *zone_wplugs_hash;
+ struct list_head zone_wplugs_err_list;
+ struct work_struct zone_wplugs_work;
+ struct workqueue_struct *zone_wplugs_wq;
#endif /* CONFIG_BLK_DEV_ZONED */
#if IS_ENABLED(CONFIG_CDROM)
@@ -326,8 +325,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
sector_t sectors, sector_t nr_sectors);
-int blk_revalidate_disk_zones(struct gendisk *disk,
- void (*update_driver_data)(struct gendisk *disk));
+int blk_revalidate_disk_zones(struct gendisk *disk);
/*
* Independent access ranges: struct blk_independent_access_range describes
@@ -444,8 +442,6 @@ struct request_queue {
atomic_t nr_active_requests_shared_tags;
- unsigned int required_elevator_features;
-
struct blk_mq_tags *sched_shared_tags;
struct list_head icq_list;
@@ -628,15 +624,6 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
return sector >> ilog2(disk->queue->limits.chunk_sectors);
}
-static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
-{
- if (!blk_queue_is_zoned(disk->queue))
- return false;
- if (!disk->conv_zones_bitmap)
- return true;
- return !test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
-}
-
static inline void disk_set_max_open_zones(struct gendisk *disk,
unsigned int max_open_zones)
{
@@ -659,6 +646,7 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
return bdev->bd_disk->queue->limits.max_active_zones;
}
+bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs);
#else /* CONFIG_BLK_DEV_ZONED */
static inline unsigned int bdev_nr_zones(struct block_device *bdev)
{
@@ -669,10 +657,6 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk)
{
return 0;
}
-static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
-{
- return false;
-}
static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
{
return 0;
@@ -686,6 +670,10 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
{
return 0;
}
+static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
+{
+ return false;
+}
#endif /* CONFIG_BLK_DEV_ZONED */
static inline unsigned int blk_queue_depth(struct request_queue *q)
@@ -850,9 +838,11 @@ static inline unsigned int bio_zone_no(struct bio *bio)
return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
}
-static inline unsigned int bio_zone_is_seq(struct bio *bio)
+static inline bool bio_straddles_zones(struct bio *bio)
{
- return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
+ return bio_sectors(bio) &&
+ bio_zone_no(bio) !=
+ disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1);
}
/*
@@ -937,14 +927,6 @@ disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges);
void disk_set_independent_access_ranges(struct gendisk *disk,
struct blk_independent_access_ranges *iars);
-/*
- * Elevator features for blk_queue_required_elevator_features:
- */
-/* Supports zoned block devices sequential write constraint */
-#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0)
-
-extern void blk_queue_required_elevator_features(struct request_queue *q,
- unsigned int features);
extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
struct device *dev);
@@ -1151,12 +1133,29 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q)
return q->limits.max_segment_size;
}
-static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q)
+static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l)
+{
+ unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
+
+ return min_not_zero(l->max_zone_append_sectors, max_sectors);
+}
+
+static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
{
+ if (!blk_queue_is_zoned(q))
+ return 0;
- const struct queue_limits *l = &q->limits;
+ return queue_limits_max_zone_append_sectors(&q->limits);
+}
- return min(l->max_zone_append_sectors, l->max_sectors);
+static inline bool queue_emulates_zone_append(struct request_queue *q)
+{
+ return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
+}
+
+static inline bool bdev_emulates_zone_append(struct block_device *bdev)
+{
+ return queue_emulates_zone_append(bdev_get_queue(bdev));
}
static inline unsigned int
@@ -1298,18 +1297,6 @@ static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec)
return disk_zone_no(bdev->bd_disk, sec);
}
-/* Whether write serialization is required for @op on zoned devices. */
-static inline bool op_needs_zoned_write_locking(enum req_op op)
-{
- return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES;
-}
-
-static inline bool bdev_op_is_zoned_write(struct block_device *bdev,
- enum req_op op)
-{
- return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op);
-}
-
static inline sector_t bdev_zone_sectors(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
@@ -1325,6 +1312,12 @@ static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev,
return sector & (bdev_zone_sectors(bdev) - 1);
}
+static inline sector_t bio_offset_from_zone_start(struct bio *bio)
+{
+ return bdev_offset_from_zone_start(bio->bi_bdev,
+ bio->bi_iter.bi_sector);
+}
+
static inline bool bdev_is_zone_start(struct block_device *bdev,
sector_t sector)
{
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 68ed6697fece32..e123d5e17b5261 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -11,7 +11,6 @@ void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
void io_uring_unreg_ringfd(void);
const char *io_uring_get_opcode(u8 opcode);
-int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
bool io_is_uring_fops(struct file *file);
static inline void io_uring_files_cancel(void)
@@ -45,11 +44,6 @@ static inline const char *io_uring_get_opcode(u8 opcode)
{
return "";
}
-static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
- unsigned int issue_flags)
-{
- return -EOPNOTSUPP;
-}
static inline bool io_is_uring_fops(struct file *file)
{
return false;
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index e453a997c06053..447fbfd322154a 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -26,12 +26,25 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
#if defined(CONFIG_IO_URING)
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd);
+
+/*
+ * Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd
+ * and the corresponding io_uring request.
+ *
+ * Note: the caller should never hard code @issue_flags and is only allowed
+ * to pass the mask provided by the core io_uring code.
+ */
void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
unsigned issue_flags);
+
void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned),
unsigned flags);
+/*
+ * Note: the caller should never hard code @issue_flags and only use the
+ * mask provided by the core io_uring code.
+ */
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags);
@@ -56,6 +69,17 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
}
#endif
+/*
+ * Polled completions must ensure they are coming from a poll queue, and
+ * hence are completed inside the usual poll handling loops.
+ */
+static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd,
+ ssize_t ret, ssize_t res2)
+{
+ lockdep_assert(in_task());
+ io_uring_cmd_done(ioucmd, ret, res2, 0);
+}
+
/* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */
static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
diff --git a/include/linux/io_uring/net.h b/include/linux/io_uring/net.h
new file mode 100644
index 00000000000000..b58f39fed4d539
--- /dev/null
+++ b/include/linux/io_uring/net.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_IO_URING_NET_H
+#define _LINUX_IO_URING_NET_H
+
+struct io_uring_cmd;
+
+#if defined(CONFIG_IO_URING)
+int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
+
+#else
+static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+#endif
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index ac333ea81d3195..7a6b190c7da74a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -205,6 +205,7 @@ struct io_submit_state {
bool plug_started;
bool need_plug;
+ bool cq_flush;
unsigned short submit_nr;
unsigned int cqes_count;
struct blk_plug plug;
@@ -219,7 +220,7 @@ struct io_ev_fd {
};
struct io_alloc_cache {
- struct io_wq_work_node list;
+ void **entries;
unsigned int nr_cached;
unsigned int max_cached;
size_t elem_size;
@@ -299,6 +300,8 @@ struct io_ring_ctx {
struct io_hash_table cancel_table_locked;
struct io_alloc_cache apoll_cache;
struct io_alloc_cache netmsg_cache;
+ struct io_alloc_cache rw_cache;
+ struct io_alloc_cache uring_cache;
/*
* Any cancelable uring_cmd is added to this list in
@@ -341,14 +344,8 @@ struct io_ring_ctx {
unsigned cq_last_tm_flush;
} ____cacheline_aligned_in_smp;
- struct io_uring_cqe completion_cqes[16];
-
spinlock_t completion_lock;
- /* IRQ completion list, under ->completion_lock */
- unsigned int locked_free_nr;
- struct io_wq_work_list locked_free_list;
-
struct list_head io_buffers_comp;
struct list_head cq_overflow_list;
struct io_hash_table cancel_table;
@@ -371,9 +368,6 @@ struct io_ring_ctx {
struct list_head io_buffers_cache;
- /* deferred free list, protected by ->uring_lock */
- struct hlist_head io_buf_list;
-
/* Keep this last, we don't need it for the fast path */
struct wait_queue_head poll_wq;
struct io_restriction restrictions;
@@ -438,8 +432,6 @@ struct io_ring_ctx {
};
struct io_tw_state {
- /* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */
- bool locked;
};
enum {
@@ -480,6 +472,7 @@ enum {
REQ_F_CAN_POLL_BIT,
REQ_F_BL_EMPTY_BIT,
REQ_F_BL_NO_RECYCLE_BIT,
+ REQ_F_BUFFERS_COMMIT_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -558,6 +551,8 @@ enum {
REQ_F_BL_EMPTY = IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT),
/* don't recycle provided buffers for this request */
REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
+ /* buffer ring head needs incrementing on put */
+ REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7bd10201a02bc8..f093cb2300d96e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -115,7 +115,7 @@ struct io_uring_sqe {
*/
#define IORING_FILE_INDEX_ALLOC (~0U)
-enum {
+enum io_uring_sqe_flags_bit {
IOSQE_FIXED_FILE_BIT,
IOSQE_IO_DRAIN_BIT,
IOSQE_IO_LINK_BIT,
@@ -351,11 +351,20 @@ enum io_uring_op {
* 0 is reported if zerocopy was actually possible.
* IORING_NOTIF_USAGE_ZC_COPIED if data was copied
* (at least partially).
+ *
+ * IORING_RECVSEND_BUNDLE Used with IOSQE_BUFFER_SELECT. If set, send or
+ * recv will grab as many buffers from the buffer
+ * group ID given and send them all. The completion
+ * result will be the number of buffers send, with
+ * the starting buffer ID in cqe->flags as per
+ * usual for provided buffer usage. The buffers
+ * will be contigious from the starting buffer ID.
*/
#define IORING_RECVSEND_POLL_FIRST (1U << 0)
#define IORING_RECV_MULTISHOT (1U << 1)
#define IORING_RECVSEND_FIXED_BUF (1U << 2)
#define IORING_SEND_ZC_REPORT_USAGE (1U << 3)
+#define IORING_RECVSEND_BUNDLE (1U << 4)
/*
* cqe.res for IORING_CQE_F_NOTIF if
@@ -374,7 +383,7 @@ enum io_uring_op {
/*
* IORING_OP_MSG_RING command types, stored in sqe->addr
*/
-enum {
+enum io_uring_msg_ring_flags {
IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */
IORING_MSG_SEND_FD, /* send a registered fd to another ring */
};
@@ -425,9 +434,7 @@ struct io_uring_cqe {
#define IORING_CQE_F_SOCK_NONEMPTY (1U << 2)
#define IORING_CQE_F_NOTIF (1U << 3)
-enum {
- IORING_CQE_BUFFER_SHIFT = 16,
-};
+#define IORING_CQE_BUFFER_SHIFT 16
/*
* Magic offsets for the application to mmap the data it needs
@@ -522,11 +529,12 @@ struct io_uring_params {
#define IORING_FEAT_CQE_SKIP (1U << 11)
#define IORING_FEAT_LINKED_FILE (1U << 12)
#define IORING_FEAT_REG_REG_RING (1U << 13)
+#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14)
/*
* io_uring_register(2) opcodes and arguments
*/
-enum {
+enum io_uring_register_op {
IORING_REGISTER_BUFFERS = 0,
IORING_UNREGISTER_BUFFERS = 1,
IORING_REGISTER_FILES = 2,
@@ -583,7 +591,7 @@ enum {
};
/* io-wq worker categories */
-enum {
+enum io_wq_type {
IO_WQ_BOUND,
IO_WQ_UNBOUND,
};
@@ -688,7 +696,7 @@ struct io_uring_buf_ring {
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
* to get a virtual mapping for the ring.
*/
-enum {
+enum io_uring_register_pbuf_ring_flags {
IOU_PBUF_RING_MMAP = 1,
};
@@ -719,7 +727,7 @@ struct io_uring_napi {
/*
* io_uring_restriction->opcode values
*/
-enum {
+enum io_uring_register_restriction_op {
/* Allow an io_uring_register(2) opcode */
IORING_RESTRICTION_REGISTER_OP = 0,
@@ -775,7 +783,7 @@ struct io_uring_recvmsg_out {
/*
* Argument for IORING_OP_URING_CMD when file is a socket
*/
-enum {
+enum io_uring_socket_op {
SOCKET_URING_OP_SIOCINQ = 0,
SOCKET_URING_OP_SIOCOUTQ,
SOCKET_URING_OP_GETSOCKOPT,
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 2e1d4e03799c34..fc1b23c524e83b 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -2,13 +2,14 @@
#
# Makefile for io_uring
-obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
- sync.o advise.o filetable.o \
- openclose.o uring_cmd.o epoll.o \
- statx.o net.o msg_ring.o timeout.o \
- sqpoll.o fdinfo.o tctx.o poll.o \
- cancel.o kbuf.o rsrc.o rw.o opdef.o \
- notif.o waitid.o register.o truncate.o
+obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
+ tctx.o filetable.o rw.o net.o poll.o \
+ uring_cmd.o openclose.o sqpoll.o \
+ xattr.o nop.o fs.o splice.o sync.o \
+ msg_ring.o advise.o openclose.o \
+ epoll.o statx.o timeout.o fdinfo.o \
+ cancel.o waitid.o register.o \
+ truncate.o memmap.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index bf2fb26a65398f..b7a38a2069cfe3 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -4,63 +4,58 @@
/*
* Don't allow the cache to grow beyond this size.
*/
-#define IO_ALLOC_CACHE_MAX 512
-
-struct io_cache_entry {
- struct io_wq_work_node node;
-};
+#define IO_ALLOC_CACHE_MAX 128
static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
- struct io_cache_entry *entry)
+ void *entry)
{
if (cache->nr_cached < cache->max_cached) {
- cache->nr_cached++;
- wq_stack_add_head(&entry->node, &cache->list);
- kasan_mempool_poison_object(entry);
+ if (!kasan_mempool_poison_object(entry))
+ return false;
+ cache->entries[cache->nr_cached++] = entry;
return true;
}
return false;
}
-static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache)
-{
- return !cache->list.next;
-}
-
-static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
+static inline void *io_alloc_cache_get(struct io_alloc_cache *cache)
{
- if (cache->list.next) {
- struct io_cache_entry *entry;
+ if (cache->nr_cached) {
+ void *entry = cache->entries[--cache->nr_cached];
- entry = container_of(cache->list.next, struct io_cache_entry, node);
kasan_mempool_unpoison_object(entry, cache->elem_size);
- cache->list.next = cache->list.next->next;
- cache->nr_cached--;
return entry;
}
return NULL;
}
-static inline void io_alloc_cache_init(struct io_alloc_cache *cache,
+/* returns false if the cache was initialized properly */
+static inline bool io_alloc_cache_init(struct io_alloc_cache *cache,
unsigned max_nr, size_t size)
{
- cache->list.next = NULL;
- cache->nr_cached = 0;
- cache->max_cached = max_nr;
- cache->elem_size = size;
+ cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL);
+ if (cache->entries) {
+ cache->nr_cached = 0;
+ cache->max_cached = max_nr;
+ cache->elem_size = size;
+ return false;
+ }
+ return true;
}
static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
- void (*free)(struct io_cache_entry *))
+ void (*free)(const void *))
{
- while (1) {
- struct io_cache_entry *entry = io_alloc_cache_get(cache);
+ void *entry;
+
+ if (!cache->entries)
+ return;
- if (!entry)
- break;
+ while ((entry = io_alloc_cache_get(cache)) != NULL)
free(entry);
- }
- cache->nr_cached = 0;
+
+ kvfree(cache->entries);
+ cache->entries = NULL;
}
#endif
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index acfcdd7f059afd..a6e58a20efdd27 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -184,9 +184,7 @@ static int __io_async_cancel(struct io_cancel_data *cd,
io_ring_submit_lock(ctx, issue_flags);
ret = -ENOENT;
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
- struct io_uring_task *tctx = node->task->io_uring;
-
- ret = io_async_cancel_one(tctx, cd);
+ ret = io_async_cancel_one(node->task->io_uring, cd);
if (ret != -ENOENT) {
if (!all)
break;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 8d444dd1b0a7b6..b1e0e0d8534910 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -50,9 +50,9 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
* Caller holds a reference to the file already, we don't need to do
* anything else to get an extra reference.
*/
-__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
+__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
{
- struct io_ring_ctx *ctx = f->private_data;
+ struct io_ring_ctx *ctx = file->private_data;
struct io_overflow_cqe *ocqe;
struct io_rings *r = ctx->rings;
struct rusage sq_usage;
diff --git a/io_uring/futex.c b/io_uring/futex.c
index 792a03df58deac..914848f46beb21 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -9,7 +9,7 @@
#include "../kernel/futex/futex.h"
#include "io_uring.h"
-#include "rsrc.h"
+#include "alloc_cache.h"
#include "futex.h"
struct io_futex {
@@ -27,27 +27,21 @@ struct io_futex {
};
struct io_futex_data {
- union {
- struct futex_q q;
- struct io_cache_entry cache;
- };
+ struct futex_q q;
struct io_kiocb *req;
};
-void io_futex_cache_init(struct io_ring_ctx *ctx)
-{
- io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX,
- sizeof(struct io_futex_data));
-}
+#define IO_FUTEX_ALLOC_CACHE_MAX 32
-static void io_futex_cache_entry_free(struct io_cache_entry *entry)
+bool io_futex_cache_init(struct io_ring_ctx *ctx)
{
- kfree(container_of(entry, struct io_futex_data, cache));
+ return io_alloc_cache_init(&ctx->futex_cache, IO_FUTEX_ALLOC_CACHE_MAX,
+ sizeof(struct io_futex_data));
}
void io_futex_cache_free(struct io_ring_ctx *ctx)
{
- io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free);
+ io_alloc_cache_free(&ctx->futex_cache, kfree);
}
static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
@@ -63,7 +57,7 @@ static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
struct io_ring_ctx *ctx = req->ctx;
io_tw_lock(ctx, ts);
- if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache))
+ if (!io_alloc_cache_put(&ctx->futex_cache, ifd))
kfree(ifd);
__io_futex_complete(req, ts);
}
@@ -259,11 +253,11 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q)
static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx)
{
- struct io_cache_entry *entry;
+ struct io_futex_data *ifd;
- entry = io_alloc_cache_get(&ctx->futex_cache);
- if (entry)
- return container_of(entry, struct io_futex_data, cache);
+ ifd = io_alloc_cache_get(&ctx->futex_cache);
+ if (ifd)
+ return ifd;
return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT);
}
diff --git a/io_uring/futex.h b/io_uring/futex.h
index 0847e9e8a127b2..b8bb09873d5795 100644
--- a/io_uring/futex.h
+++ b/io_uring/futex.h
@@ -13,7 +13,7 @@ int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned int issue_flags);
bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
bool cancel_all);
-void io_futex_cache_init(struct io_ring_ctx *ctx);
+bool io_futex_cache_init(struct io_ring_ctx *ctx);
void io_futex_cache_free(struct io_ring_ctx *ctx);
#else
static inline int io_futex_cancel(struct io_ring_ctx *ctx,
@@ -27,8 +27,9 @@ static inline bool io_futex_remove_all(struct io_ring_ctx *ctx,
{
return false;
}
-static inline void io_futex_cache_init(struct io_ring_ctx *ctx)
+static inline bool io_futex_cache_init(struct io_ring_ctx *ctx)
{
+ return false;
}
static inline void io_futex_cache_free(struct io_ring_ctx *ctx)
{
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 522196dfb0ff5a..d7fc6f6d447723 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -51,7 +51,6 @@ struct io_worker {
struct io_wq *wq;
struct io_wq_work *cur_work;
- struct io_wq_work *next_work;
raw_spinlock_t lock;
struct completion ref_done;
@@ -539,7 +538,6 @@ static void io_assign_current_work(struct io_worker *worker,
raw_spin_lock(&worker->lock);
worker->cur_work = work;
- worker->next_work = NULL;
raw_spin_unlock(&worker->lock);
}
@@ -564,10 +562,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
* clear the stalled flag.
*/
work = io_get_next_work(acct, worker);
- raw_spin_unlock(&acct->lock);
if (work) {
- __io_worker_busy(wq, worker);
-
/*
* Make sure cancelation can find this, even before
* it becomes the active work. That avoids a window
@@ -576,11 +571,17 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
* current work item for this worker.
*/
raw_spin_lock(&worker->lock);
- worker->next_work = work;
+ worker->cur_work = work;
raw_spin_unlock(&worker->lock);
- } else {
- break;
}
+
+ raw_spin_unlock(&acct->lock);
+
+ if (!work)
+ break;
+
+ __io_worker_busy(wq, worker);
+
io_assign_current_work(worker, work);
__set_current_state(TASK_RUNNING);
@@ -1005,8 +1006,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
* may dereference the passed in work.
*/
raw_spin_lock(&worker->lock);
- if (__io_wq_worker_cancel(worker, match, worker->cur_work) ||
- __io_wq_worker_cancel(worker, match, worker->next_work))
+ if (__io_wq_worker_cancel(worker, match, worker->cur_work))
match->nr_running++;
raw_spin_unlock(&worker->lock);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index eef8a4f90d989d..86fd72f6a1c2b6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -63,7 +63,6 @@
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
-#include <linux/highmem.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/task_work.h>
@@ -95,6 +94,8 @@
#include "waitid.h"
#include "futex.h"
#include "napi.h"
+#include "uring_cmd.h"
+#include "memmap.h"
#include "timeout.h"
#include "poll.h"
@@ -170,17 +171,9 @@ static struct ctl_table kernel_io_uring_disabled_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {},
};
#endif
-static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
-{
- if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
- ctx->submit_state.cqes_count)
- __io_submit_flush_completions(ctx);
-}
-
static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
@@ -253,14 +246,12 @@ static __cold void io_fallback_req_func(struct work_struct *work)
fallback_work.work);
struct llist_node *node = llist_del_all(&ctx->fallback_llist);
struct io_kiocb *req, *tmp;
- struct io_tw_state ts = { .locked = true, };
+ struct io_tw_state ts = {};
percpu_ref_get(&ctx->refs);
mutex_lock(&ctx->uring_lock);
llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
req->io_task_work.func(req, &ts);
- if (WARN_ON_ONCE(!ts.locked))
- return;
io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
percpu_ref_put(&ctx->refs);
@@ -284,6 +275,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
int hash_bits;
+ bool ret;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
@@ -312,14 +304,19 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache);
- INIT_HLIST_HEAD(&ctx->io_buf_list);
- io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
+ ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
sizeof(struct io_rsrc_node));
- io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
+ ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
sizeof(struct async_poll));
- io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
+ ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_async_msghdr));
- io_futex_cache_init(ctx);
+ ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
+ sizeof(struct io_async_rw));
+ ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
+ sizeof(struct uring_cache));
+ ret |= io_futex_cache_init(ctx);
+ if (ret)
+ goto err;
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
@@ -337,7 +334,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_llist_head(&ctx->work_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
- INIT_WQ_LIST(&ctx->locked_free_list);
INIT_HLIST_HEAD(&ctx->waitid_list);
#ifdef CONFIG_FUTEX
INIT_HLIST_HEAD(&ctx->futex_list);
@@ -349,6 +345,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
return ctx;
err:
+ io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
+ io_alloc_cache_free(&ctx->apoll_cache, kfree);
+ io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
+ io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
+ io_alloc_cache_free(&ctx->uring_cache, kfree);
+ io_futex_cache_free(ctx);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
xa_destroy(&ctx->io_bl_xa);
@@ -379,7 +381,7 @@ static void io_clean_op(struct io_kiocb *req)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
spin_lock(&req->ctx->completion_lock);
- io_put_kbuf_comp(req);
+ io_kbuf_drop(req);
spin_unlock(&req->ctx->completion_lock);
}
@@ -498,7 +500,7 @@ static void io_prep_async_link(struct io_kiocb *req)
}
}
-void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use)
+static void io_queue_iowq(struct io_kiocb *req)
{
struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
@@ -666,28 +668,14 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx)
io_commit_cqring_flush(ctx);
}
-static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
-{
- struct io_overflow_cqe *ocqe;
- LIST_HEAD(list);
-
- spin_lock(&ctx->completion_lock);
- list_splice_init(&ctx->cq_overflow_list, &list);
- clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
- spin_unlock(&ctx->completion_lock);
-
- while (!list_empty(&list)) {
- ocqe = list_first_entry(&list, struct io_overflow_cqe, list);
- list_del(&ocqe->list);
- kfree(ocqe);
- }
-}
-
-static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
+static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
{
size_t cqe_size = sizeof(struct io_uring_cqe);
- if (__io_cqring_events(ctx) == ctx->cq_entries)
+ lockdep_assert_held(&ctx->uring_lock);
+
+ /* don't abort if we're dying, entries must get freed */
+ if (!dying && __io_cqring_events(ctx) == ctx->cq_entries)
return;
if (ctx->flags & IORING_SETUP_CQE32)
@@ -698,11 +686,14 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
struct io_uring_cqe *cqe;
struct io_overflow_cqe *ocqe;
- if (!io_get_cqe_overflow(ctx, &cqe, true))
- break;
ocqe = list_first_entry(&ctx->cq_overflow_list,
struct io_overflow_cqe, list);
- memcpy(cqe, &ocqe->cqe, cqe_size);
+
+ if (!dying) {
+ if (!io_get_cqe_overflow(ctx, &cqe, true))
+ break;
+ memcpy(cqe, &ocqe->cqe, cqe_size);
+ }
list_del(&ocqe->list);
kfree(ocqe);
}
@@ -714,20 +705,17 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
io_cq_unlock_post(ctx);
}
-static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
+static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
{
- /* iopoll syncs against uring_lock, not completion_lock */
- if (ctx->flags & IORING_SETUP_IOPOLL)
- mutex_lock(&ctx->uring_lock);
- __io_cqring_overflow_flush(ctx);
- if (ctx->flags & IORING_SETUP_IOPOLL)
- mutex_unlock(&ctx->uring_lock);
+ if (ctx->rings)
+ __io_cqring_overflow_flush(ctx, true);
}
-static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
+static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
{
- if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
- io_cqring_do_overflow_flush(ctx);
+ mutex_lock(&ctx->uring_lock);
+ __io_cqring_overflow_flush(ctx, false);
+ mutex_unlock(&ctx->uring_lock);
}
/* can be called by any task */
@@ -817,7 +805,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
return true;
}
-void io_req_cqe_overflow(struct io_kiocb *req)
+static void io_req_cqe_overflow(struct io_kiocb *req)
{
io_cqring_event_overflow(req->ctx, req->cqe.user_data,
req->cqe.res, req->cqe.flags,
@@ -890,151 +878,71 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
return false;
}
-static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
- __must_hold(&ctx->uring_lock)
-{
- struct io_submit_state *state = &ctx->submit_state;
- unsigned int i;
-
- lockdep_assert_held(&ctx->uring_lock);
- for (i = 0; i < state->cqes_count; i++) {
- struct io_uring_cqe *cqe = &ctx->completion_cqes[i];
-
- if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
- if (ctx->lockless_cq) {
- spin_lock(&ctx->completion_lock);
- io_cqring_event_overflow(ctx, cqe->user_data,
- cqe->res, cqe->flags, 0, 0);
- spin_unlock(&ctx->completion_lock);
- } else {
- io_cqring_event_overflow(ctx, cqe->user_data,
- cqe->res, cqe->flags, 0, 0);
- }
- }
- }
- state->cqes_count = 0;
-}
-
-static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
- bool allow_overflow)
+bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
bool filled;
io_cq_lock(ctx);
filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
- if (!filled && allow_overflow)
+ if (!filled)
filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
io_cq_unlock_post(ctx);
return filled;
}
-bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
-{
- return __io_post_aux_cqe(ctx, user_data, res, cflags, true);
-}
-
/*
* A helper for multishot requests posting additional CQEs.
* Should only be used from a task_work including IO_URING_F_MULTISHOT.
*/
-bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
+bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
{
struct io_ring_ctx *ctx = req->ctx;
- u64 user_data = req->cqe.user_data;
- struct io_uring_cqe *cqe;
+ bool posted;
lockdep_assert(!io_wq_current_is_worker());
-
- if (!defer)
- return __io_post_aux_cqe(ctx, user_data, res, cflags, false);
-
lockdep_assert_held(&ctx->uring_lock);
- if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) {
- __io_cq_lock(ctx);
- __io_flush_post_cqes(ctx);
- /* no need to flush - flush is deferred */
- __io_cq_unlock_post(ctx);
- }
-
- /* For defered completions this is not as strict as it is otherwise,
- * however it's main job is to prevent unbounded posted completions,
- * and in that it works just as well.
- */
- if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
- return false;
-
- cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++];
- cqe->user_data = user_data;
- cqe->res = res;
- cqe->flags = cflags;
- return true;
+ __io_cq_lock(ctx);
+ posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
+ ctx->submit_state.cq_flush = true;
+ __io_cq_unlock_post(ctx);
+ return posted;
}
-static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
+static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_rsrc_node *rsrc_node = NULL;
+
+ /*
+ * All execution paths but io-wq use the deferred completions by
+ * passing IO_URING_F_COMPLETE_DEFER and thus should not end up here.
+ */
+ if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_IOWQ)))
+ return;
+
+ /*
+ * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires
+ * the submitter task context, IOPOLL protects with uring_lock.
+ */
+ if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) {
+ req->io_task_work.func = io_req_task_complete;
+ io_req_task_work_add(req);
+ return;
+ }
io_cq_lock(ctx);
if (!(req->flags & REQ_F_CQE_SKIP)) {
if (!io_fill_cqe_req(ctx, req))
io_req_cqe_overflow(req);
}
+ io_cq_unlock_post(ctx);
/*
- * If we're the last reference to this request, add to our locked
- * free_list cache.
+ * We don't free the request here because we know it's called from
+ * io-wq only, which holds a reference, so it cannot be the last put.
*/
- if (req_ref_put_and_test(req)) {
- if (req->flags & IO_REQ_LINK_FLAGS) {
- if (req->flags & IO_DISARM_MASK)
- io_disarm_next(req);
- if (req->link) {
- io_req_task_queue(req->link);
- req->link = NULL;
- }
- }
- io_put_kbuf_comp(req);
- if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
- io_clean_op(req);
- io_put_file(req);
-
- rsrc_node = req->rsrc_node;
- /*
- * Selected buffer deallocation in io_clean_op() assumes that
- * we don't hold ->completion_lock. Clean them here to avoid
- * deadlocks.
- */
- io_put_task_remote(req->task);
- wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
- ctx->locked_free_nr++;
- }
- io_cq_unlock_post(ctx);
-
- if (rsrc_node) {
- io_ring_submit_lock(ctx, issue_flags);
- io_put_rsrc_node(ctx, rsrc_node);
- io_ring_submit_unlock(ctx, issue_flags);
- }
-}
-
-void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (ctx->task_complete && ctx->submitter_task != current) {
- req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req);
- } else if (!(issue_flags & IO_URING_F_UNLOCKED) ||
- !(ctx->flags & IORING_SETUP_IOPOLL)) {
- __io_req_complete_post(req, issue_flags);
- } else {
- mutex_lock(&ctx->uring_lock);
- __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED);
- mutex_unlock(&ctx->uring_lock);
- }
+ req_ref_put(req);
}
void io_req_defer_failed(struct io_kiocb *req, s32 res)
@@ -1065,15 +973,6 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}
-static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
- struct io_submit_state *state)
-{
- spin_lock(&ctx->completion_lock);
- wq_list_splice(&ctx->locked_free_list, &state->free_list);
- ctx->locked_free_nr = 0;
- spin_unlock(&ctx->completion_lock);
-}
-
/*
* A request might get retired back into the request caches even before opcode
* handlers and io_issue_sqe() are done with it, e.g. inline completion path.
@@ -1085,18 +984,7 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
void *reqs[IO_REQ_ALLOC_BATCH];
- int ret, i;
-
- /*
- * If we have more than a batch's worth of requests in our IRQ side
- * locked cache, grab the lock and move them over to our submission
- * side cache.
- */
- if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
- io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
- if (!io_req_cache_empty(ctx))
- return true;
- }
+ int ret;
ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
@@ -1112,8 +1000,8 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
}
percpu_ref_get_many(&ctx->refs, ret);
- for (i = 0; i < ret; i++) {
- struct io_kiocb *req = reqs[i];
+ while (ret--) {
+ struct io_kiocb *req = reqs[ret];
io_preinit_req(req, ctx);
io_req_add_to_cache(req, ctx);
@@ -1163,11 +1051,9 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
return;
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
- if (ts->locked) {
- io_submit_flush_completions(ctx);
- mutex_unlock(&ctx->uring_lock);
- ts->locked = false;
- }
+
+ io_submit_flush_completions(ctx);
+ mutex_unlock(&ctx->uring_lock);
percpu_ref_put(&ctx->refs);
}
@@ -1191,8 +1077,7 @@ struct llist_node *io_handle_tw_list(struct llist_node *node,
if (req->ctx != ctx) {
ctx_flush_and_put(ctx, &ts);
ctx = req->ctx;
- /* if not contended, grab and improve batching */
- ts.locked = mutex_trylock(&ctx->uring_lock);
+ mutex_lock(&ctx->uring_lock);
percpu_ref_get(&ctx->refs);
}
INDIRECT_CALL_2(req->io_task_work.func,
@@ -1453,11 +1338,9 @@ again:
if (io_run_local_work_continue(ctx, ret, min_events))
goto again;
- if (ts->locked) {
- io_submit_flush_completions(ctx);
- if (io_run_local_work_continue(ctx, ret, min_events))
- goto again;
- }
+ io_submit_flush_completions(ctx);
+ if (io_run_local_work_continue(ctx, ret, min_events))
+ goto again;
trace_io_uring_local_work_run(ctx, ret, loops);
return ret;
@@ -1466,17 +1349,11 @@ again:
static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
int min_events)
{
- struct io_tw_state ts = { .locked = true, };
- int ret;
+ struct io_tw_state ts = {};
if (llist_empty(&ctx->work_llist))
return 0;
-
- ret = __io_run_local_work(ctx, &ts, min_events);
- /* shouldn't happen! */
- if (WARN_ON_ONCE(!ts.locked))
- mutex_lock(&ctx->uring_lock);
- return ret;
+ return __io_run_local_work(ctx, &ts, min_events);
}
static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
@@ -1484,11 +1361,9 @@ static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
struct io_tw_state ts = {};
int ret;
- ts.locked = mutex_trylock(&ctx->uring_lock);
+ mutex_lock(&ctx->uring_lock);
ret = __io_run_local_work(ctx, &ts, min_events);
- if (ts.locked)
- mutex_unlock(&ctx->uring_lock);
-
+ mutex_unlock(&ctx->uring_lock);
return ret;
}
@@ -1505,7 +1380,7 @@ void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts)
if (unlikely(req->task->flags & PF_EXITING))
io_req_defer_failed(req, -EFAULT);
else if (req->flags & REQ_F_FORCE_ASYNC)
- io_queue_iowq(req, ts);
+ io_queue_iowq(req);
else
io_queue_sqe(req);
}
@@ -1550,7 +1425,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
if (apoll->double_poll)
kfree(apoll->double_poll);
- if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache))
+ if (!io_alloc_cache_put(&ctx->apoll_cache, apoll))
kfree(apoll);
req->flags &= ~REQ_F_POLLED;
}
@@ -1560,10 +1435,9 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
io_clean_op(req);
}
io_put_file(req);
-
- io_req_put_rsrc_locked(req, ctx);
-
+ io_put_rsrc_node(ctx, req->rsrc_node);
io_put_task(req->task);
+
node = req->comp_list.next;
io_req_add_to_cache(req, ctx);
} while (node);
@@ -1576,9 +1450,6 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
struct io_wq_work_node *node;
__io_cq_lock(ctx);
- /* must come first to preserve CQE ordering in failure cases */
- if (state->cqes_count)
- __io_flush_post_cqes(ctx);
__wq_list_for_each(node, &state->compl_reqs) {
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
@@ -1600,6 +1471,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
io_free_batch_list(ctx, state->compl_reqs.first);
INIT_WQ_LIST(&state->compl_reqs);
}
+ ctx->submit_state.cq_flush = false;
}
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
@@ -1642,13 +1514,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
unsigned int nr_events = 0;
unsigned long check_cq;
+ lockdep_assert_held(&ctx->uring_lock);
+
if (!io_allowed_run_tw(ctx))
return -EEXIST;
check_cq = READ_ONCE(ctx->check_cq);
if (unlikely(check_cq)) {
if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
- __io_cqring_overflow_flush(ctx);
+ __io_cqring_overflow_flush(ctx, false);
/*
* Similarly do not spin if we have not informed the user of any
* dropped CQE.
@@ -1711,10 +1585,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
- if (ts->locked)
- io_req_complete_defer(req);
- else
- io_req_complete_post(req, IO_URING_F_UNLOCKED);
+ io_req_complete_defer(req);
}
/*
@@ -1785,8 +1656,10 @@ io_req_flags_t io_file_get_flags(struct file *file)
bool io_alloc_async_data(struct io_kiocb *req)
{
- WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size);
- req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL);
+ const struct io_issue_def *def = &io_issue_defs[req->opcode];
+
+ WARN_ON_ONCE(!def->async_size);
+ req->async_data = kmalloc(def->async_size, GFP_KERNEL);
if (req->async_data) {
req->flags |= REQ_F_ASYNC_DATA;
return false;
@@ -1794,25 +1667,6 @@ bool io_alloc_async_data(struct io_kiocb *req)
return true;
}
-int io_req_prep_async(struct io_kiocb *req)
-{
- const struct io_cold_def *cdef = &io_cold_defs[req->opcode];
- const struct io_issue_def *def = &io_issue_defs[req->opcode];
-
- /* assign early for deferred execution for non-fixed file */
- if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file)
- req->file = io_file_get_normal(req, req->cqe.fd);
- if (!cdef->prep_async)
- return 0;
- if (WARN_ON_ONCE(req_has_async_data(req)))
- return -EFAULT;
- if (!def->manual_alloc) {
- if (io_alloc_async_data(req))
- return -EAGAIN;
- }
- return cdef->prep_async(req);
-}
-
static u32 io_get_sequence(struct io_kiocb *req)
{
u32 seq = req->ctx->cached_sq_head;
@@ -2093,7 +1947,7 @@ static void io_queue_async(struct io_kiocb *req, int ret)
break;
case IO_APOLL_ABORTED:
io_kbuf_recycle(req, 0);
- io_queue_iowq(req, NULL);
+ io_queue_iowq(req);
break;
case IO_APOLL_OK:
break;
@@ -2130,17 +1984,10 @@ static void io_queue_sqe_fallback(struct io_kiocb *req)
req->flags |= REQ_F_LINK;
io_req_defer_failed(req, req->cqe.res);
} else {
- int ret = io_req_prep_async(req);
-
- if (unlikely(ret)) {
- io_req_defer_failed(req, ret);
- return;
- }
-
if (unlikely(req->ctx->drain_active))
io_drain_req(req);
else
- io_queue_iowq(req, NULL);
+ io_queue_iowq(req);
}
}
@@ -2346,10 +2193,6 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
* conditions are true (normal request), then just queue it.
*/
if (unlikely(link->head)) {
- ret = io_req_prep_async(req);
- if (unlikely(ret))
- return io_submit_fail_init(sqe, req, ret);
-
trace_io_uring_link(req, link->head);
link->last->link = req;
link->last = req;
@@ -2597,8 +2440,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (!llist_empty(&ctx->work_llist))
io_run_local_work(ctx, min_events);
io_run_task_work();
- io_cqring_overflow_flush(ctx);
- /* if user messes with these they will just get an early return */
+
+ if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
+ io_cqring_do_overflow_flush(ctx);
if (__io_cqring_events_user(ctx) >= min_events)
return 0;
@@ -2698,89 +2542,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
-void io_mem_free(void *ptr)
-{
- if (!ptr)
- return;
-
- folio_put(virt_to_folio(ptr));
-}
-
-static void io_pages_free(struct page ***pages, int npages)
-{
- struct page **page_array = *pages;
- int i;
-
- if (!page_array)
- return;
-
- for (i = 0; i < npages; i++)
- unpin_user_page(page_array[i]);
- kvfree(page_array);
- *pages = NULL;
-}
-
-static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
- unsigned long uaddr, size_t size)
-{
- struct page **page_array;
- unsigned int nr_pages;
- void *page_addr;
- int ret, i, pinned;
-
- *npages = 0;
-
- if (uaddr & (PAGE_SIZE - 1) || !size)
- return ERR_PTR(-EINVAL);
-
- nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (nr_pages > USHRT_MAX)
- return ERR_PTR(-EINVAL);
- page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!page_array)
- return ERR_PTR(-ENOMEM);
-
-
- pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
- page_array);
- if (pinned != nr_pages) {
- ret = (pinned < 0) ? pinned : -EFAULT;
- goto free_pages;
- }
-
- page_addr = page_address(page_array[0]);
- for (i = 0; i < nr_pages; i++) {
- ret = -EINVAL;
-
- /*
- * Can't support mapping user allocated ring memory on 32-bit
- * archs where it could potentially reside in highmem. Just
- * fail those with -EINVAL, just like we did on kernels that
- * didn't support this feature.
- */
- if (PageHighMem(page_array[i]))
- goto free_pages;
-
- /*
- * No support for discontig pages for now, should either be a
- * single normal page, or a huge page. Later on we can add
- * support for remapping discontig pages, for now we will
- * just fail them with EINVAL.
- */
- if (page_address(page_array[i]) != page_addr)
- goto free_pages;
- page_addr += PAGE_SIZE;
- }
-
- *pages = page_array;
- *npages = nr_pages;
- return page_to_virt(page_array[0]);
-
-free_pages:
- io_pages_free(&page_array, pinned > 0 ? pinned : 0);
- return ERR_PTR(ret);
-}
-
static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
size_t size)
{
@@ -2798,30 +2559,23 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
static void io_rings_free(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
- io_mem_free(ctx->rings);
- io_mem_free(ctx->sq_sqes);
+ io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages,
+ true);
+ io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages,
+ true);
} else {
io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
ctx->n_ring_pages = 0;
io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
ctx->n_sqe_pages = 0;
+ vunmap(ctx->rings);
+ vunmap(ctx->sq_sqes);
}
ctx->rings = NULL;
ctx->sq_sqes = NULL;
}
-void *io_mem_alloc(size_t size)
-{
- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
- void *ret;
-
- ret = (void *) __get_free_pages(gfp, get_order(size));
- if (ret)
- return ret;
- return ERR_PTR(-ENOMEM);
-}
-
static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
unsigned int cq_entries, size_t *sq_offset)
{
@@ -2867,7 +2621,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
int nr = 0;
mutex_lock(&ctx->uring_lock);
- io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
while (!io_req_cache_empty(ctx)) {
req = io_extract_req(ctx);
@@ -2879,11 +2632,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}
-static void io_rsrc_node_cache_free(struct io_cache_entry *entry)
-{
- kfree(container_of(entry, struct io_rsrc_node, cache));
-}
-
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
@@ -2898,8 +2646,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
__io_sqe_files_unregister(ctx);
io_cqring_overflow_kill(ctx);
io_eventfd_unregister(ctx);
- io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
+ io_alloc_cache_free(&ctx->apoll_cache, kfree);
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
+ io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
+ io_alloc_cache_free(&ctx->uring_cache, kfree);
io_futex_cache_free(ctx);
io_destroy_buffers(ctx);
mutex_unlock(&ctx->uring_lock);
@@ -2915,13 +2665,12 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
- io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
+ io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
if (ctx->mm_account) {
mmdrop(ctx->mm_account);
ctx->mm_account = NULL;
}
io_rings_free(ctx);
- io_kbuf_mmap_list_free(ctx);
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
@@ -3145,17 +2894,8 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
percpu_ref_kill(&ctx->refs);
xa_for_each(&ctx->personalities, index, creds)
io_unregister_personality(ctx, index);
- if (ctx->rings)
- io_poll_remove_all(ctx, NULL, true);
mutex_unlock(&ctx->uring_lock);
- /*
- * If we failed setting up the ctx, we might not have any rings
- * and therefore did not submit any requests
- */
- if (ctx->rings)
- io_kill_timeouts(ctx, NULL, true);
-
flush_delayed_work(&ctx->fallback_work);
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
@@ -3241,37 +2981,6 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
return ret;
}
-static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
- struct task_struct *task, bool cancel_all)
-{
- struct hlist_node *tmp;
- struct io_kiocb *req;
- bool ret = false;
-
- lockdep_assert_held(&ctx->uring_lock);
-
- hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd,
- hash_node) {
- struct io_uring_cmd *cmd = io_kiocb_to_cmd(req,
- struct io_uring_cmd);
- struct file *file = req->file;
-
- if (!cancel_all && req->task != task)
- continue;
-
- if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
- /* ->sqe isn't available if no async data */
- if (!req_has_async_data(req))
- cmd->sqe = NULL;
- file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL);
- ret = true;
- }
- }
- io_submit_flush_completions(ctx);
-
- return ret;
-}
-
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all)
@@ -3326,6 +3035,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
ret |= io_kill_timeouts(ctx, task, cancel_all);
if (task)
ret |= io_run_task_work() > 0;
+ else
+ ret |= flush_delayed_work(&ctx->fallback_work);
return ret;
}
@@ -3424,137 +3135,6 @@ void __io_uring_cancel(bool cancel_all)
io_uring_cancel_generic(cancel_all, NULL);
}
-static void *io_uring_validate_mmap_request(struct file *file,
- loff_t pgoff, size_t sz)
-{
- struct io_ring_ctx *ctx = file->private_data;
- loff_t offset = pgoff << PAGE_SHIFT;
- struct page *page;
- void *ptr;
-
- switch (offset & IORING_OFF_MMAP_MASK) {
- case IORING_OFF_SQ_RING:
- case IORING_OFF_CQ_RING:
- /* Don't allow mmap if the ring was setup without it */
- if (ctx->flags & IORING_SETUP_NO_MMAP)
- return ERR_PTR(-EINVAL);
- ptr = ctx->rings;
- break;
- case IORING_OFF_SQES:
- /* Don't allow mmap if the ring was setup without it */
- if (ctx->flags & IORING_SETUP_NO_MMAP)
- return ERR_PTR(-EINVAL);
- ptr = ctx->sq_sqes;
- break;
- case IORING_OFF_PBUF_RING: {
- struct io_buffer_list *bl;
- unsigned int bgid;
-
- bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- bl = io_pbuf_get_bl(ctx, bgid);
- if (IS_ERR(bl))
- return bl;
- ptr = bl->buf_ring;
- io_put_bl(ctx, bl);
- break;
- }
- default:
- return ERR_PTR(-EINVAL);
- }
-
- page = virt_to_head_page(ptr);
- if (sz > page_size(page))
- return ERR_PTR(-EINVAL);
-
- return ptr;
-}
-
-#ifdef CONFIG_MMU
-
-static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
-{
- size_t sz = vma->vm_end - vma->vm_start;
- unsigned long pfn;
- void *ptr;
-
- ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
-
- pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
- return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
-}
-
-static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- void *ptr;
-
- /*
- * Do not allow to map to user-provided address to avoid breaking the
- * aliasing rules. Userspace is not able to guess the offset address of
- * kernel kmalloc()ed memory area.
- */
- if (addr)
- return -EINVAL;
-
- ptr = io_uring_validate_mmap_request(filp, pgoff, len);
- if (IS_ERR(ptr))
- return -ENOMEM;
-
- /*
- * Some architectures have strong cache aliasing requirements.
- * For such architectures we need a coherent mapping which aliases
- * kernel memory *and* userspace memory. To achieve that:
- * - use a NULL file pointer to reference physical memory, and
- * - use the kernel virtual address of the shared io_uring context
- * (instead of the userspace-provided address, which has to be 0UL
- * anyway).
- * - use the same pgoff which the get_unmapped_area() uses to
- * calculate the page colouring.
- * For architectures without such aliasing requirements, the
- * architecture will return any suitable mapping because addr is 0.
- */
- filp = NULL;
- flags |= MAP_SHARED;
- pgoff = 0; /* has been translated to ptr above */
-#ifdef SHM_COLOUR
- addr = (uintptr_t) ptr;
- pgoff = addr >> PAGE_SHIFT;
-#else
- addr = 0UL;
-#endif
- return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
-}
-
-#else /* !CONFIG_MMU */
-
-static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
-{
- return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
-}
-
-static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
-{
- return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
-}
-
-static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- void *ptr;
-
- ptr = io_uring_validate_mmap_request(file, pgoff, len);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
-
- return (unsigned long) ptr;
-}
-
-#endif /* !CONFIG_MMU */
-
static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
{
if (flags & IORING_ENTER_EXT_ARG) {
@@ -3647,8 +3227,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
*/
ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
- io_cqring_overflow_flush(ctx);
-
if (unlikely(ctx->sq_data->thread == NULL)) {
ret = -EOWNERDEAD;
goto out;
@@ -3737,11 +3315,9 @@ out:
static const struct file_operations io_uring_fops = {
.release = io_uring_release,
.mmap = io_uring_mmap,
+ .get_unmapped_area = io_uring_get_unmapped_area,
#ifndef CONFIG_MMU
- .get_unmapped_area = io_uring_nommu_get_unmapped_area,
.mmap_capabilities = io_uring_nommu_mmap_capabilities,
-#else
- .get_unmapped_area = io_uring_mmu_get_unmapped_area,
#endif
.poll = io_uring_poll,
#ifdef CONFIG_PROC_FS
@@ -3770,7 +3346,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
return -EOVERFLOW;
if (!(ctx->flags & IORING_SETUP_NO_MMAP))
- rings = io_mem_alloc(size);
+ rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
else
rings = io_rings_map(ctx, p->cq_off.user_addr, size);
@@ -3795,7 +3371,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
}
if (!(ctx->flags & IORING_SETUP_NO_MMAP))
- ptr = io_mem_alloc(size);
+ ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
else
ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
@@ -3994,7 +3570,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
- IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING;
+ IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
+ IORING_FEAT_RECVSEND_BUNDLE;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 6426ee382276b5..624ca9076a50be 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -62,16 +62,12 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
}
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
-void io_req_cqe_overflow(struct io_kiocb *req);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
-void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
-bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags);
+bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
-struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
-
struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
@@ -79,7 +75,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_queue(struct io_kiocb *req);
-void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use);
void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts);
void io_req_task_queue_fail(struct io_kiocb *req, int ret);
void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts);
@@ -97,7 +92,6 @@ int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
void __io_submit_flush_completions(struct io_ring_ctx *ctx);
-int io_req_prep_async(struct io_kiocb *req);
struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
void io_wq_submit_work(struct io_wq_work *work);
@@ -110,9 +104,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all);
-void *io_mem_alloc(size_t size);
-void io_mem_free(void *ptr);
-
enum {
IO_EVENTFD_OP_SIGNAL_BIT,
IO_EVENTFD_OP_FREE_BIT,
@@ -121,9 +112,9 @@ enum {
void io_eventfd_ops(struct rcu_head *rcu);
void io_activate_pollwq(struct io_ring_ctx *ctx);
-#if defined(CONFIG_PROVE_LOCKING)
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
+#if defined(CONFIG_PROVE_LOCKING)
lockdep_assert(in_task());
if (ctx->flags & IORING_SETUP_IOPOLL) {
@@ -142,18 +133,21 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
else
lockdep_assert(current == ctx->submitter_task);
}
-}
-#else
-static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
-{
-}
#endif
+}
static inline void io_req_task_work_add(struct io_kiocb *req)
{
__io_req_task_work_add(req, 0);
}
+static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
+{
+ if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
+ ctx->submit_state.cq_flush)
+ __io_submit_flush_completions(ctx);
+}
+
#define io_for_each_link(pos, head) \
for (pos = (head); pos; pos = pos->link)
@@ -340,15 +334,12 @@ static inline int io_run_task_work(void)
static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
{
- return task_work_pending(current) || !wq_list_empty(&ctx->work_llist);
+ return task_work_pending(current) || !llist_empty(&ctx->work_llist);
}
static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts)
{
- if (!ts->locked) {
- mutex_lock(&ctx->uring_lock);
- ts->locked = true;
- }
+ lockdep_assert_held(&ctx->uring_lock);
}
/*
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 3aa16e27f5099a..d2945c9c812b5c 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
+#include <linux/vmalloc.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
@@ -14,8 +15,7 @@
#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"
-
-#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
+#include "memmap.h"
/* BIDs are addressed by a 16-bit field in a CQE */
#define MAX_BIDS_PER_BGID (1 << 16)
@@ -31,25 +31,12 @@ struct io_provide_buf {
__u16 bid;
};
-struct io_buf_free {
- struct hlist_node list;
- void *mem;
- size_t size;
- int inuse;
-};
-
-static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
- unsigned int bgid)
-{
- return xa_load(&ctx->io_bl_xa, bgid);
-}
-
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
lockdep_assert_held(&ctx->uring_lock);
- return __io_buffer_get_list(ctx, bgid);
+ return xa_load(&ctx->io_bl_xa, bgid);
}
static int io_buffer_add_list(struct io_ring_ctx *ctx,
@@ -130,6 +117,27 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
return NULL;
}
+static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
+ struct io_buffer_list *bl,
+ struct iovec *iov)
+{
+ void __user *buf;
+
+ buf = io_provided_buffer_select(req, len, bl);
+ if (unlikely(!buf))
+ return -ENOBUFS;
+
+ iov[0].iov_base = buf;
+ iov[0].iov_len = *len;
+ return 0;
+}
+
+static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br,
+ __u16 head, __u16 mask)
+{
+ return &br->bufs[head & mask];
+}
+
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
struct io_buffer_list *bl,
unsigned int issue_flags)
@@ -145,19 +153,10 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
if (head + 1 == tail)
req->flags |= REQ_F_BL_EMPTY;
- head &= bl->mask;
- /* mmaped buffers are always contig */
- if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
- buf = &br->bufs[head];
- } else {
- int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
- int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
- buf = page_address(bl->buf_pages[index]);
- buf += off;
- }
+ buf = io_ring_head_to_buf(br, head, bl->mask);
if (*len == 0 || *len > buf->len)
*len = buf->len;
- req->flags |= REQ_F_BUFFER_RING;
+ req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
req->buf_list = bl;
req->buf_index = buf->bid;
@@ -172,6 +171,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
* the transfer completes (or if we get -EAGAIN and must poll of
* retry).
*/
+ req->flags &= ~REQ_F_BUFFERS_COMMIT;
req->buf_list = NULL;
bl->head++;
}
@@ -198,22 +198,134 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
return ret;
}
-/*
- * Mark the given mapped range as free for reuse
- */
-static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+/* cap it at a reasonable 256, will be one page even for 4K */
+#define PEEK_MAX_IMPORT 256
+
+static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
+ struct io_buffer_list *bl)
+{
+ struct io_uring_buf_ring *br = bl->buf_ring;
+ struct iovec *iov = arg->iovs;
+ int nr_iovs = arg->nr_iovs;
+ __u16 nr_avail, tail, head;
+ struct io_uring_buf *buf;
+
+ tail = smp_load_acquire(&br->tail);
+ head = bl->head;
+ nr_avail = min_t(__u16, tail - head, UIO_MAXIOV);
+ if (unlikely(!nr_avail))
+ return -ENOBUFS;
+
+ buf = io_ring_head_to_buf(br, head, bl->mask);
+ if (arg->max_len) {
+ int needed;
+
+ needed = (arg->max_len + buf->len - 1) / buf->len;
+ needed = min(needed, PEEK_MAX_IMPORT);
+ if (nr_avail > needed)
+ nr_avail = needed;
+ }
+
+ /*
+ * only alloc a bigger array if we know we have data to map, eg not
+ * a speculative peek operation.
+ */
+ if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) {
+ iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL);
+ if (unlikely(!iov))
+ return -ENOMEM;
+ if (arg->mode & KBUF_MODE_FREE)
+ kfree(arg->iovs);
+ arg->iovs = iov;
+ nr_iovs = nr_avail;
+ } else if (nr_avail < nr_iovs) {
+ nr_iovs = nr_avail;
+ }
+
+ /* set it to max, if not set, so we can use it unconditionally */
+ if (!arg->max_len)
+ arg->max_len = INT_MAX;
+
+ req->buf_index = buf->bid;
+ do {
+ /* truncate end piece, if needed */
+ if (buf->len > arg->max_len)
+ buf->len = arg->max_len;
+
+ iov->iov_base = u64_to_user_ptr(buf->addr);
+ iov->iov_len = buf->len;
+ iov++;
+
+ arg->out_len += buf->len;
+ arg->max_len -= buf->len;
+ if (!arg->max_len)
+ break;
+
+ buf = io_ring_head_to_buf(br, ++head, bl->mask);
+ } while (--nr_iovs);
+
+ if (head == tail)
+ req->flags |= REQ_F_BL_EMPTY;
+
+ req->flags |= REQ_F_BUFFER_RING;
+ req->buf_list = bl;
+ return iov - arg->iovs;
+}
+
+int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
+ unsigned int issue_flags)
{
- struct io_buf_free *ibf;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ int ret = -ENOENT;
- hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
- if (bl->buf_ring == ibf->mem) {
- ibf->inuse = 0;
- return;
+ io_ring_submit_lock(ctx, issue_flags);
+ bl = io_buffer_get_list(ctx, req->buf_index);
+ if (unlikely(!bl))
+ goto out_unlock;
+
+ if (bl->is_buf_ring) {
+ ret = io_ring_buffers_peek(req, arg, bl);
+ /*
+ * Don't recycle these buffers if we need to go through poll.
+ * Nobody else can use them anyway, and holding on to provided
+ * buffers for a send/write operation would happen on the app
+ * side anyway with normal buffers. Besides, we already
+ * committed them, they cannot be put back in the queue.
+ */
+ if (ret > 0) {
+ req->flags |= REQ_F_BL_NO_RECYCLE;
+ req->buf_list->head += ret;
}
+ } else {
+ ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
+ }
+out_unlock:
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+}
+
+int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ int ret;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ bl = io_buffer_get_list(ctx, req->buf_index);
+ if (unlikely(!bl))
+ return -ENOENT;
+
+ if (bl->is_buf_ring) {
+ ret = io_ring_buffers_peek(req, arg, bl);
+ if (ret > 0)
+ req->flags |= REQ_F_BUFFERS_COMMIT;
+ return ret;
}
- /* can't happen... */
- WARN_ON_ONCE(1);
+ /* don't support multiple buffer selections for legacy */
+ return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs);
}
static int __io_remove_buffers(struct io_ring_ctx *ctx,
@@ -227,22 +339,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (bl->is_buf_ring) {
i = bl->buf_ring->tail - bl->head;
- if (bl->is_mmap) {
- /*
- * io_kbuf_list_free() will free the page(s) at
- * ->release() time.
- */
- io_kbuf_mark_free(ctx, bl);
- bl->buf_ring = NULL;
- bl->is_mmap = 0;
- } else if (bl->buf_nr_pages) {
+ if (bl->buf_nr_pages) {
int j;
- for (j = 0; j < bl->buf_nr_pages; j++)
- unpin_user_page(bl->buf_pages[j]);
- kvfree(bl->buf_pages);
- bl->buf_pages = NULL;
- bl->buf_nr_pages = 0;
+ if (!bl->is_mmap) {
+ for (j = 0; j < bl->buf_nr_pages; j++)
+ unpin_user_page(bl->buf_pages[j]);
+ }
+ io_pages_unmap(bl->buf_ring, &bl->buf_pages,
+ &bl->buf_nr_pages, bl->is_mmap);
+ bl->is_mmap = 0;
}
/* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list);
@@ -498,9 +604,9 @@ err:
static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
- struct io_uring_buf_ring *br;
+ struct io_uring_buf_ring *br = NULL;
struct page **pages;
- int i, nr_pages;
+ int nr_pages, ret;
pages = io_pin_pages(reg->ring_addr,
flex_array_size(br, bufs, reg->ring_entries),
@@ -508,18 +614,12 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
if (IS_ERR(pages))
return PTR_ERR(pages);
- /*
- * Apparently some 32-bit boxes (ARM) will return highmem pages,
- * which then need to be mapped. We could support that, but it'd
- * complicate the code and slowdown the common cases quite a bit.
- * So just error out, returning -EINVAL just like we did on kernels
- * that didn't support mapped buffer rings.
- */
- for (i = 0; i < nr_pages; i++)
- if (PageHighMem(pages[i]))
- goto error_unpin;
+ br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (!br) {
+ ret = -ENOMEM;
+ goto error_unpin;
+ }
- br = page_address(pages[0]);
#ifdef SHM_COLOUR
/*
* On platforms that have specific aliasing requirements, SHM_COLOUR
@@ -530,8 +630,10 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
* should use IOU_PBUF_RING_MMAP instead, and liburing will handle
* this transparently.
*/
- if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
+ if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
+ ret = -EINVAL;
goto error_unpin;
+ }
#endif
bl->buf_pages = pages;
bl->buf_nr_pages = nr_pages;
@@ -540,69 +642,24 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
bl->is_mmap = 0;
return 0;
error_unpin:
- for (i = 0; i < nr_pages; i++)
- unpin_user_page(pages[i]);
+ unpin_user_pages(pages, nr_pages);
kvfree(pages);
- return -EINVAL;
-}
-
-/*
- * See if we have a suitable region that we can reuse, rather than allocate
- * both a new io_buf_free and mem region again. We leave it on the list as
- * even a reused entry will need freeing at ring release.
- */
-static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
- size_t ring_size)
-{
- struct io_buf_free *ibf, *best = NULL;
- size_t best_dist;
-
- hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
- size_t dist;
-
- if (ibf->inuse || ibf->size < ring_size)
- continue;
- dist = ibf->size - ring_size;
- if (!best || dist < best_dist) {
- best = ibf;
- if (!dist)
- break;
- best_dist = dist;
- }
- }
-
- return best;
+ vunmap(br);
+ return ret;
}
static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
- struct io_buf_free *ibf;
size_t ring_size;
- void *ptr;
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
- /* Reuse existing entry, if we can */
- ibf = io_lookup_buf_free_entry(ctx, ring_size);
- if (!ibf) {
- ptr = io_mem_alloc(ring_size);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
-
- /* Allocate and store deferred free entry */
- ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
- if (!ibf) {
- io_mem_free(ptr);
- return -ENOMEM;
- }
- ibf->mem = ptr;
- ibf->size = ring_size;
- hlist_add_head(&ibf->list, &ctx->io_buf_list);
- }
- ibf->inuse = 1;
- bl->buf_ring = ibf->mem;
+ bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
+ if (!bl->buf_ring)
+ return -ENOMEM;
+
bl->is_buf_ring = 1;
bl->is_mmap = 1;
return 0;
@@ -750,18 +807,19 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
return ERR_PTR(-EINVAL);
}
-/*
- * Called at or after ->release(), free the mmap'ed buffers that we used
- * for memory mapped provided buffer rings.
- */
-void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
+int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
{
- struct io_buf_free *ibf;
- struct hlist_node *tmp;
+ struct io_ring_ctx *ctx = file->private_data;
+ loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
+ struct io_buffer_list *bl;
+ int bgid, ret;
- hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
- hlist_del(&ibf->list);
- io_mem_free(ibf->mem);
- kfree(ibf);
- }
+ bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+ bl = io_pbuf_get_bl(ctx, bgid);
+ if (IS_ERR(bl))
+ return PTR_ERR(bl);
+
+ ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
+ io_put_bl(ctx, bl);
+ return ret;
}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index df365b8860cf1e..b90aca3a57fa7c 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -41,8 +41,26 @@ struct io_buffer {
__u16 bgid;
};
+enum {
+ /* can alloc a bigger vec */
+ KBUF_MODE_EXPAND = 1,
+ /* if bigger vec allocated, free old one */
+ KBUF_MODE_FREE = 2,
+};
+
+struct buf_sel_arg {
+ struct iovec *iovs;
+ size_t out_len;
+ size_t max_len;
+ int nr_iovs;
+ int mode;
+};
+
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
unsigned int issue_flags);
+int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
+ unsigned int issue_flags);
+int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg);
void io_destroy_buffers(struct io_ring_ctx *ctx);
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
@@ -55,8 +73,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
-void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
-
void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
@@ -64,6 +80,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl);
struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
unsigned long bgid);
+int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
{
@@ -76,7 +93,7 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
*/
if (req->buf_list) {
req->buf_index = req->buf_list->bgid;
- req->flags &= ~REQ_F_BUFFER_RING;
+ req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT);
return true;
}
return false;
@@ -100,11 +117,16 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
return false;
}
-static inline void __io_put_kbuf_ring(struct io_kiocb *req)
+static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr)
{
- if (req->buf_list) {
- req->buf_index = req->buf_list->bgid;
- req->buf_list->head++;
+ struct io_buffer_list *bl = req->buf_list;
+
+ if (bl) {
+ if (req->flags & REQ_F_BUFFERS_COMMIT) {
+ bl->head += nr;
+ req->flags &= ~REQ_F_BUFFERS_COMMIT;
+ }
+ req->buf_index = bl->bgid;
}
req->flags &= ~REQ_F_BUFFER_RING;
}
@@ -113,7 +135,7 @@ static inline void __io_put_kbuf_list(struct io_kiocb *req,
struct list_head *list)
{
if (req->flags & REQ_F_BUFFER_RING) {
- __io_put_kbuf_ring(req);
+ __io_put_kbuf_ring(req, 1);
} else {
req->buf_index = req->kbuf->bgid;
list_add(&req->kbuf->list, list);
@@ -121,22 +143,18 @@ static inline void __io_put_kbuf_list(struct io_kiocb *req,
}
}
-static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
+static inline void io_kbuf_drop(struct io_kiocb *req)
{
- unsigned int ret;
-
lockdep_assert_held(&req->ctx->completion_lock);
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
- return 0;
+ return;
- ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
__io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
- return ret;
}
-static inline unsigned int io_put_kbuf(struct io_kiocb *req,
- unsigned issue_flags)
+static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs,
+ unsigned issue_flags)
{
unsigned int ret;
@@ -145,9 +163,21 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
if (req->flags & REQ_F_BUFFER_RING)
- __io_put_kbuf_ring(req);
+ __io_put_kbuf_ring(req, nbufs);
else
__io_put_kbuf(req, issue_flags);
return ret;
}
+
+static inline unsigned int io_put_kbuf(struct io_kiocb *req,
+ unsigned issue_flags)
+{
+ return __io_put_kbufs(req, 1, issue_flags);
+}
+
+static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs,
+ unsigned issue_flags)
+{
+ return __io_put_kbufs(req, nbufs, issue_flags);
+}
#endif
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
new file mode 100644
index 00000000000000..4785d6af5fee92
--- /dev/null
+++ b/io_uring/memmap.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/io_uring.h>
+#include <linux/io_uring_types.h>
+#include <asm/shmparam.h>
+
+#include "memmap.h"
+#include "kbuf.h"
+
+static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
+ size_t size, gfp_t gfp)
+{
+ struct page *page;
+ int i, order;
+
+ order = get_order(size);
+ if (order > MAX_PAGE_ORDER)
+ return ERR_PTR(-ENOMEM);
+ else if (order)
+ gfp |= __GFP_COMP;
+
+ page = alloc_pages(gfp, order);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < nr_pages; i++)
+ pages[i] = page + i;
+
+ return page_address(page);
+}
+
+static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
+ gfp_t gfp)
+{
+ void *ret;
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = alloc_page(gfp);
+ if (!pages[i])
+ goto err;
+ }
+
+ ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (ret)
+ return ret;
+err:
+ while (i--)
+ put_page(pages[i]);
+ return ERR_PTR(-ENOMEM);
+}
+
+void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+ size_t size)
+{
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
+ struct page **pages;
+ int nr_pages;
+ void *ret;
+
+ nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
+ if (!IS_ERR(ret))
+ goto done;
+
+ ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
+ if (!IS_ERR(ret)) {
+done:
+ *out_pages = pages;
+ *npages = nr_pages;
+ return ret;
+ }
+
+ kvfree(pages);
+ *out_pages = NULL;
+ *npages = 0;
+ return ret;
+}
+
+void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
+ bool put_pages)
+{
+ bool do_vunmap = false;
+
+ if (!ptr)
+ return;
+
+ if (put_pages && *npages) {
+ struct page **to_free = *pages;
+ int i;
+
+ /*
+ * Only did vmap for the non-compound multiple page case.
+ * For the compound page, we just need to put the head.
+ */
+ if (PageCompound(to_free[0]))
+ *npages = 1;
+ else if (*npages > 1)
+ do_vunmap = true;
+ for (i = 0; i < *npages; i++)
+ put_page(to_free[i]);
+ }
+ if (do_vunmap)
+ vunmap(ptr);
+ kvfree(*pages);
+ *pages = NULL;
+ *npages = 0;
+}
+
+void io_pages_free(struct page ***pages, int npages)
+{
+ struct page **page_array = *pages;
+
+ if (!page_array)
+ return;
+
+ unpin_user_pages(page_array, npages);
+ kvfree(page_array);
+ *pages = NULL;
+}
+
+struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
+{
+ unsigned long start, end, nr_pages;
+ struct page **pages;
+ int ret;
+
+ end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = uaddr >> PAGE_SHIFT;
+ nr_pages = end - start;
+ if (WARN_ON_ONCE(!nr_pages))
+ return ERR_PTR(-EINVAL);
+
+ pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+ pages);
+ /* success, mapped all pages */
+ if (ret == nr_pages) {
+ *npages = nr_pages;
+ return pages;
+ }
+
+ /* partial map, or didn't map anything */
+ if (ret >= 0) {
+ /* if we did partial map, release any pages we did get */
+ if (ret)
+ unpin_user_pages(pages, ret);
+ ret = -EFAULT;
+ }
+ kvfree(pages);
+ return ERR_PTR(ret);
+}
+
+void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
+ unsigned long uaddr, size_t size)
+{
+ struct page **page_array;
+ unsigned int nr_pages;
+ void *page_addr;
+
+ *npages = 0;
+
+ if (uaddr & (PAGE_SIZE - 1) || !size)
+ return ERR_PTR(-EINVAL);
+
+ nr_pages = 0;
+ page_array = io_pin_pages(uaddr, size, &nr_pages);
+ if (IS_ERR(page_array))
+ return page_array;
+
+ page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (page_addr) {
+ *pages = page_array;
+ *npages = nr_pages;
+ return page_addr;
+ }
+
+ io_pages_free(&page_array, nr_pages);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
+ size_t sz)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+ loff_t offset = pgoff << PAGE_SHIFT;
+
+ switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
+ case IORING_OFF_SQ_RING:
+ case IORING_OFF_CQ_RING:
+ /* Don't allow mmap if the ring was setup without it */
+ if (ctx->flags & IORING_SETUP_NO_MMAP)
+ return ERR_PTR(-EINVAL);
+ return ctx->rings;
+ case IORING_OFF_SQES:
+ /* Don't allow mmap if the ring was setup without it */
+ if (ctx->flags & IORING_SETUP_NO_MMAP)
+ return ERR_PTR(-EINVAL);
+ return ctx->sq_sqes;
+ case IORING_OFF_PBUF_RING: {
+ struct io_buffer_list *bl;
+ unsigned int bgid;
+ void *ptr;
+
+ bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+ bl = io_pbuf_get_bl(ctx, bgid);
+ if (IS_ERR(bl))
+ return bl;
+ ptr = bl->buf_ring;
+ io_put_bl(ctx, bl);
+ return ptr;
+ }
+ }
+
+ return ERR_PTR(-EINVAL);
+}
+
+int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
+ struct page **pages, int npages)
+{
+ unsigned long nr_pages = npages;
+
+ vm_flags_set(vma, VM_DONTEXPAND);
+ return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
+}
+
+#ifdef CONFIG_MMU
+
+__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+ size_t sz = vma->vm_end - vma->vm_start;
+ long offset = vma->vm_pgoff << PAGE_SHIFT;
+ void *ptr;
+
+ ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ switch (offset & IORING_OFF_MMAP_MASK) {
+ case IORING_OFF_SQ_RING:
+ case IORING_OFF_CQ_RING:
+ return io_uring_mmap_pages(ctx, vma, ctx->ring_pages,
+ ctx->n_ring_pages);
+ case IORING_OFF_SQES:
+ return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
+ ctx->n_sqe_pages);
+ case IORING_OFF_PBUF_RING:
+ return io_pbuf_mmap(file, vma);
+ }
+
+ return -EINVAL;
+}
+
+unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ void *ptr;
+
+ /*
+ * Do not allow to map to user-provided address to avoid breaking the
+ * aliasing rules. Userspace is not able to guess the offset address of
+ * kernel kmalloc()ed memory area.
+ */
+ if (addr)
+ return -EINVAL;
+
+ ptr = io_uring_validate_mmap_request(filp, pgoff, len);
+ if (IS_ERR(ptr))
+ return -ENOMEM;
+
+ /*
+ * Some architectures have strong cache aliasing requirements.
+ * For such architectures we need a coherent mapping which aliases
+ * kernel memory *and* userspace memory. To achieve that:
+ * - use a NULL file pointer to reference physical memory, and
+ * - use the kernel virtual address of the shared io_uring context
+ * (instead of the userspace-provided address, which has to be 0UL
+ * anyway).
+ * - use the same pgoff which the get_unmapped_area() uses to
+ * calculate the page colouring.
+ * For architectures without such aliasing requirements, the
+ * architecture will return any suitable mapping because addr is 0.
+ */
+ filp = NULL;
+ flags |= MAP_SHARED;
+ pgoff = 0; /* has been translated to ptr above */
+#ifdef SHM_COLOUR
+ addr = (uintptr_t) ptr;
+ pgoff = addr >> PAGE_SHIFT;
+#else
+ addr = 0UL;
+#endif
+ return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
+}
+
+#else /* !CONFIG_MMU */
+
+int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
+}
+
+unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
+{
+ return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
+}
+
+unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ void *ptr;
+
+ ptr = io_uring_validate_mmap_request(file, pgoff, len);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ return (unsigned long) ptr;
+}
+
+#endif /* !CONFIG_MMU */
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
new file mode 100644
index 00000000000000..5cec5b7ac49a45
--- /dev/null
+++ b/io_uring/memmap.h
@@ -0,0 +1,25 @@
+#ifndef IO_URING_MEMMAP_H
+#define IO_URING_MEMMAP_H
+
+struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
+void io_pages_free(struct page ***pages, int npages);
+int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
+ struct page **pages, int npages);
+
+void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+ size_t size);
+void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
+ bool put_pages);
+
+void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
+ unsigned long uaddr, size_t size);
+
+#ifndef CONFIG_MMU
+unsigned int io_uring_nommu_mmap_capabilities(struct file *file);
+#endif
+unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags);
+int io_uring_mmap(struct file *file, struct vm_area_struct *vma);
+
+#endif
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index cd6dcf634ba3cd..81c4a9d437296c 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -83,7 +83,7 @@ static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func)
return -EOWNERDEAD;
init_task_work(&msg->tw, func);
- if (task_work_add(ctx->submitter_task, &msg->tw, TWA_SIGNAL))
+ if (task_work_add(task, &msg->tw, TWA_SIGNAL))
return -EOWNERDEAD;
return IOU_ISSUE_SKIP_COMPLETE;
@@ -147,13 +147,11 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
if (target_ctx->flags & IORING_SETUP_IOPOLL) {
if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))
return -EAGAIN;
- if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
- ret = 0;
- io_double_unlock_ctx(target_ctx);
- } else {
- if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
- ret = 0;
}
+ if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
+ ret = 0;
+ if (target_ctx->flags & IORING_SETUP_IOPOLL)
+ io_double_unlock_ctx(target_ctx);
return ret;
}
diff --git a/io_uring/net.c b/io_uring/net.c
index 4afb475d41974b..b0bf8471ecb77f 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -57,7 +57,7 @@ struct io_sr_msg {
struct user_msghdr __user *umsg;
void __user *buf;
};
- unsigned len;
+ int len;
unsigned done_io;
unsigned msg_flags;
unsigned nr_multishot_loops;
@@ -115,80 +115,85 @@ static bool io_net_retry(struct socket *sock, int flags)
return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
}
+static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
+{
+ if (kmsg->free_iov) {
+ kfree(kmsg->free_iov);
+ kmsg->free_iov_nr = 0;
+ kmsg->free_iov = NULL;
+ }
+}
+
static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr *hdr = req->async_data;
+ struct iovec *iov;
- if (!req_has_async_data(req) || issue_flags & IO_URING_F_UNLOCKED)
+ /* can't recycle, ensure we free the iovec if we have one */
+ if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
+ io_netmsg_iovec_free(hdr);
return;
+ }
/* Let normal cleanup path reap it if we fail adding to the cache */
- if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) {
+ iov = hdr->free_iov;
+ if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
+ if (iov)
+ kasan_mempool_poison_object(iov);
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
}
}
-static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req,
- unsigned int issue_flags)
+static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_cache_entry *entry;
struct io_async_msghdr *hdr;
- if (!(issue_flags & IO_URING_F_UNLOCKED)) {
- entry = io_alloc_cache_get(&ctx->netmsg_cache);
- if (entry) {
- hdr = container_of(entry, struct io_async_msghdr, cache);
- hdr->free_iov = NULL;
- req->flags |= REQ_F_ASYNC_DATA;
- req->async_data = hdr;
- return hdr;
+ hdr = io_alloc_cache_get(&ctx->netmsg_cache);
+ if (hdr) {
+ if (hdr->free_iov) {
+ kasan_mempool_unpoison_object(hdr->free_iov,
+ hdr->free_iov_nr * sizeof(struct iovec));
+ req->flags |= REQ_F_NEED_CLEANUP;
}
+ req->flags |= REQ_F_ASYNC_DATA;
+ req->async_data = hdr;
+ return hdr;
}
if (!io_alloc_async_data(req)) {
hdr = req->async_data;
+ hdr->free_iov_nr = 0;
hdr->free_iov = NULL;
return hdr;
}
return NULL;
}
-static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *req)
+/* assign new iovec to kmsg, if we need to */
+static int io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg,
+ struct iovec *iov)
{
- /* ->prep_async is always called from the submission context */
- return io_msg_alloc_async(req, 0);
+ if (iov) {
+ req->flags |= REQ_F_NEED_CLEANUP;
+ kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs;
+ if (kmsg->free_iov)
+ kfree(kmsg->free_iov);
+ kmsg->free_iov = iov;
+ }
+ return 0;
}
-static int io_setup_async_msg(struct io_kiocb *req,
- struct io_async_msghdr *kmsg,
- unsigned int issue_flags)
+static inline void io_mshot_prep_retry(struct io_kiocb *req,
+ struct io_async_msghdr *kmsg)
{
- struct io_async_msghdr *async_msg;
-
- if (req_has_async_data(req))
- return -EAGAIN;
- async_msg = io_msg_alloc_async(req, issue_flags);
- if (!async_msg) {
- kfree(kmsg->free_iov);
- return -ENOMEM;
- }
- req->flags |= REQ_F_NEED_CLEANUP;
- memcpy(async_msg, kmsg, sizeof(*kmsg));
- if (async_msg->msg.msg_name)
- async_msg->msg.msg_name = &async_msg->addr;
-
- if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs)
- return -EAGAIN;
-
- /* if were using fast_iov, set it to the new one */
- if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) {
- size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov;
- async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx];
- }
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- return -EAGAIN;
+ req->flags &= ~REQ_F_BL_EMPTY;
+ sr->done_io = 0;
+ sr->len = 0; /* get from the provided buffer */
+ req->buf_index = sr->buf_group;
}
#ifdef CONFIG_COMPAT
@@ -198,7 +203,16 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req,
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct compat_iovec __user *uiov;
- int ret;
+ struct iovec *iov;
+ int ret, nr_segs;
+
+ if (iomsg->free_iov) {
+ nr_segs = iomsg->free_iov_nr;
+ iov = iomsg->free_iov;
+ } else {
+ iov = &iomsg->fast_iov;
+ nr_segs = 1;
+ }
if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
return -EFAULT;
@@ -207,9 +221,9 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req,
if (req->flags & REQ_F_BUFFER_SELECT) {
compat_ssize_t clen;
- iomsg->free_iov = NULL;
if (msg->msg_iovlen == 0) {
- sr->len = 0;
+ sr->len = iov->iov_len = 0;
+ iov->iov_base = NULL;
} else if (msg->msg_iovlen > 1) {
return -EINVAL;
} else {
@@ -225,14 +239,12 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req,
return 0;
}
- iomsg->free_iov = iomsg->fast_iov;
ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen,
- UIO_FASTIOV, &iomsg->free_iov,
- &iomsg->msg.msg_iter, true);
+ nr_segs, &iov, &iomsg->msg.msg_iter, true);
if (unlikely(ret < 0))
return ret;
- return 0;
+ return io_net_vec_assign(req, iomsg, iov);
}
#endif
@@ -240,7 +252,16 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
struct user_msghdr *msg, int ddir)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- int ret;
+ struct iovec *iov;
+ int ret, nr_segs;
+
+ if (iomsg->free_iov) {
+ nr_segs = iomsg->free_iov_nr;
+ iov = iomsg->free_iov;
+ } else {
+ iov = &iomsg->fast_iov;
+ nr_segs = 1;
+ }
if (!user_access_begin(sr->umsg, sizeof(*sr->umsg)))
return -EFAULT;
@@ -256,9 +277,8 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
if (req->flags & REQ_F_BUFFER_SELECT) {
if (msg->msg_iovlen == 0) {
- sr->len = iomsg->fast_iov[0].iov_len = 0;
- iomsg->fast_iov[0].iov_base = NULL;
- iomsg->free_iov = NULL;
+ sr->len = iov->iov_len = 0;
+ iov->iov_base = NULL;
} else if (msg->msg_iovlen > 1) {
ret = -EINVAL;
goto ua_end;
@@ -266,10 +286,9 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
/* we only need the length for provided buffers */
if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t)))
goto ua_end;
- unsafe_get_user(iomsg->fast_iov[0].iov_len,
- &msg->msg_iov[0].iov_len, ua_end);
- sr->len = iomsg->fast_iov[0].iov_len;
- iomsg->free_iov = NULL;
+ unsafe_get_user(iov->iov_len, &msg->msg_iov[0].iov_len,
+ ua_end);
+ sr->len = iov->iov_len;
}
ret = 0;
ua_end:
@@ -278,13 +297,12 @@ ua_end:
}
user_access_end();
- iomsg->free_iov = iomsg->fast_iov;
- ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, UIO_FASTIOV,
- &iomsg->free_iov, &iomsg->msg.msg_iter, false);
+ ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs,
+ &iov, &iomsg->msg.msg_iter, false);
if (unlikely(ret < 0))
return ret;
- return 0;
+ return io_net_vec_assign(req, iomsg, iov);
}
static int io_sendmsg_copy_hdr(struct io_kiocb *req,
@@ -320,60 +338,58 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req,
return ret;
}
-int io_send_prep_async(struct io_kiocb *req)
+void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
{
- struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr *io;
- int ret;
+ struct io_async_msghdr *io = req->async_data;
- if (req_has_async_data(req))
- return 0;
- zc->done_io = 0;
- if (!zc->addr)
- return 0;
- io = io_msg_alloc_async_prep(req);
- if (!io)
- return -ENOMEM;
- ret = move_addr_to_kernel(zc->addr, zc->addr_len, &io->addr);
- return ret;
+ io_netmsg_iovec_free(io);
}
-static int io_setup_async_addr(struct io_kiocb *req,
- struct sockaddr_storage *addr_storage,
- unsigned int issue_flags)
+static int io_send_setup(struct io_kiocb *req)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr *io;
+ struct io_async_msghdr *kmsg = req->async_data;
+ int ret;
- if (!sr->addr || req_has_async_data(req))
- return -EAGAIN;
- io = io_msg_alloc_async(req, issue_flags);
- if (!io)
- return -ENOMEM;
- memcpy(&io->addr, addr_storage, sizeof(io->addr));
- return -EAGAIN;
+ kmsg->msg.msg_name = NULL;
+ kmsg->msg.msg_namelen = 0;
+ kmsg->msg.msg_control = NULL;
+ kmsg->msg.msg_controllen = 0;
+ kmsg->msg.msg_ubuf = NULL;
+
+ if (sr->addr) {
+ ret = move_addr_to_kernel(sr->addr, sr->addr_len, &kmsg->addr);
+ if (unlikely(ret < 0))
+ return ret;
+ kmsg->msg.msg_name = &kmsg->addr;
+ kmsg->msg.msg_namelen = sr->addr_len;
+ }
+ if (!io_do_buffer_select(req)) {
+ ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
+ &kmsg->msg.msg_iter);
+ if (unlikely(ret < 0))
+ return ret;
+ }
+ return 0;
}
-int io_sendmsg_prep_async(struct io_kiocb *req)
+static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg)
{
- struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ struct io_async_msghdr *kmsg;
int ret;
- sr->done_io = 0;
- if (!io_msg_alloc_async_prep(req))
+ kmsg = io_msg_alloc_async(req);
+ if (unlikely(!kmsg))
return -ENOMEM;
- ret = io_sendmsg_copy_hdr(req, req->async_data);
+ if (!is_msg)
+ return io_send_setup(req);
+ ret = io_sendmsg_copy_hdr(req, kmsg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
}
-void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
-{
- struct io_async_msghdr *io = req->async_data;
-
- kfree(io->free_iov);
-}
+#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)
int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
@@ -393,34 +409,114 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
sr->flags = READ_ONCE(sqe->ioprio);
- if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
+ if (sr->flags & ~SENDMSG_FLAGS)
return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
+ if (sr->flags & IORING_RECVSEND_BUNDLE) {
+ if (req->opcode == IORING_OP_SENDMSG)
+ return -EINVAL;
+ if (!(req->flags & REQ_F_BUFFER_SELECT))
+ return -EINVAL;
+ sr->msg_flags |= MSG_WAITALL;
+ sr->buf_group = req->buf_index;
+ req->buf_list = NULL;
+ }
+ if (req->flags & REQ_F_BUFFER_SELECT && sr->len)
+ return -EINVAL;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
- return 0;
+ return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG);
}
static void io_req_msg_cleanup(struct io_kiocb *req,
- struct io_async_msghdr *kmsg,
unsigned int issue_flags)
{
req->flags &= ~REQ_F_NEED_CLEANUP;
- /* fast path, check for non-NULL to avoid function call */
- if (kmsg->free_iov)
- kfree(kmsg->free_iov);
io_netmsg_recycle(req, issue_flags);
}
+/*
+ * For bundle completions, we need to figure out how many segments we consumed.
+ * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
+ * could be using an ITER_IOVEC. If the latter, then if we consumed all of
+ * the segments, then it's a trivial questiont o answer. If we have residual
+ * data in the iter, then loop the segments to figure out how much we
+ * transferred.
+ */
+static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
+{
+ struct iovec *iov;
+ int nbufs;
+
+ /* no data is always zero segments, and a ubuf is always 1 segment */
+ if (ret <= 0)
+ return 0;
+ if (iter_is_ubuf(&kmsg->msg.msg_iter))
+ return 1;
+
+ iov = kmsg->free_iov;
+ if (!iov)
+ iov = &kmsg->fast_iov;
+
+ /* if all data was transferred, it's basic pointer math */
+ if (!iov_iter_count(&kmsg->msg.msg_iter))
+ return iter_iov(&kmsg->msg.msg_iter) - iov;
+
+ /* short transfer, count segments */
+ nbufs = 0;
+ do {
+ int this_len = min_t(int, iov[nbufs].iov_len, ret);
+
+ nbufs++;
+ ret -= this_len;
+ } while (ret);
+
+ return nbufs;
+}
+
+static inline bool io_send_finish(struct io_kiocb *req, int *ret,
+ struct io_async_msghdr *kmsg,
+ unsigned issue_flags)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ bool bundle_finished = *ret <= 0;
+ unsigned int cflags;
+
+ if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
+ cflags = io_put_kbuf(req, issue_flags);
+ goto finish;
+ }
+
+ cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags);
+
+ if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
+ goto finish;
+
+ /*
+ * Fill CQE for this receive and see if we should keep trying to
+ * receive from this socket.
+ */
+ if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
+ io_mshot_prep_retry(req, kmsg);
+ return false;
+ }
+
+ /* Otherwise stop bundle and use the current result. */
+finish:
+ io_req_set_res(req, *ret, cflags);
+ *ret = IOU_OK;
+ return true;
+}
+
int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr iomsg, *kmsg;
+ struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned flags;
int min_ret = 0;
@@ -430,19 +526,9 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- if (req_has_async_data(req)) {
- kmsg = req->async_data;
- kmsg->msg.msg_control_user = sr->msg_control;
- } else {
- ret = io_sendmsg_copy_hdr(req, &iomsg);
- if (ret)
- return ret;
- kmsg = &iomsg;
- }
-
if (!(req->flags & REQ_F_POLLED) &&
(sr->flags & IORING_RECVSEND_POLL_FIRST))
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
@@ -450,23 +536,25 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+ kmsg->msg.msg_control_user = sr->msg_control;
+
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
if (ret < min_ret) {
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
if (ret > 0 && io_net_retry(sock, flags)) {
kmsg->msg.msg_controllen = 0;
kmsg->msg.msg_control = NULL;
sr->done_io += ret;
req->flags |= REQ_F_BL_NO_RECYCLE;
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
req_set_fail(req);
}
- io_req_msg_cleanup(req, kmsg, issue_flags);
+ io_req_msg_cleanup(req, issue_flags);
if (ret >= 0)
ret += sr->done_io;
else if (sr->done_io)
@@ -477,65 +565,77 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
- struct sockaddr_storage __address;
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct msghdr msg;
+ struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned flags;
int min_ret = 0;
int ret;
- msg.msg_name = NULL;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_namelen = 0;
- msg.msg_ubuf = NULL;
-
- if (sr->addr) {
- if (req_has_async_data(req)) {
- struct io_async_msghdr *io = req->async_data;
-
- msg.msg_name = &io->addr;
- } else {
- ret = move_addr_to_kernel(sr->addr, sr->addr_len, &__address);
- if (unlikely(ret < 0))
- return ret;
- msg.msg_name = (struct sockaddr *)&__address;
- }
- msg.msg_namelen = sr->addr_len;
- }
-
- if (!(req->flags & REQ_F_POLLED) &&
- (sr->flags & IORING_RECVSEND_POLL_FIRST))
- return io_setup_async_addr(req, &__address, issue_flags);
-
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
- ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter);
- if (unlikely(ret))
- return ret;
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return -EAGAIN;
flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
- if (flags & MSG_WAITALL)
- min_ret = iov_iter_count(&msg.msg_iter);
+
+retry_bundle:
+ if (io_do_buffer_select(req)) {
+ struct buf_sel_arg arg = {
+ .iovs = &kmsg->fast_iov,
+ .max_len = INT_MAX,
+ .nr_iovs = 1,
+ .mode = KBUF_MODE_EXPAND,
+ };
+
+ if (kmsg->free_iov) {
+ arg.nr_iovs = kmsg->free_iov_nr;
+ arg.iovs = kmsg->free_iov;
+ arg.mode |= KBUF_MODE_FREE;
+ }
+
+ if (!(sr->flags & IORING_RECVSEND_BUNDLE))
+ arg.nr_iovs = 1;
+
+ ret = io_buffers_select(req, &arg, issue_flags);
+ if (unlikely(ret < 0))
+ return ret;
+
+ sr->len = arg.out_len;
+ iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret,
+ arg.out_len);
+ if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
+ kmsg->free_iov_nr = ret;
+ kmsg->free_iov = arg.iovs;
+ }
+ }
+
+ /*
+ * If MSG_WAITALL is set, or this is a bundle send, then we need
+ * the full amount. If just bundle is set, if we do a short send
+ * then we complete the bundle sequence rather than continue on.
+ */
+ if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
+ min_ret = iov_iter_count(&kmsg->msg.msg_iter);
flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
- msg.msg_flags = flags;
- ret = sock_sendmsg(sock, &msg);
+ kmsg->msg.msg_flags = flags;
+ ret = sock_sendmsg(sock, &kmsg->msg);
if (ret < min_ret) {
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
- return io_setup_async_addr(req, &__address, issue_flags);
+ return -EAGAIN;
if (ret > 0 && io_net_retry(sock, flags)) {
sr->len -= ret;
sr->buf += ret;
sr->done_io += ret;
req->flags |= REQ_F_BL_NO_RECYCLE;
- return io_setup_async_addr(req, &__address, issue_flags);
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -545,8 +645,12 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
ret += sr->done_io;
else if (sr->done_io)
ret = sr->done_io;
- io_req_set_res(req, ret, 0);
- return IOU_OK;
+
+ if (!io_send_finish(req, &ret, kmsg, issue_flags))
+ goto retry_bundle;
+
+ io_req_msg_cleanup(req, issue_flags);
+ return ret;
}
static int io_recvmsg_mshot_prep(struct io_kiocb *req,
@@ -611,23 +715,42 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
msg.msg_controllen);
}
-int io_recvmsg_prep_async(struct io_kiocb *req)
+static int io_recvmsg_prep_setup(struct io_kiocb *req)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr *iomsg;
+ struct io_async_msghdr *kmsg;
int ret;
- sr->done_io = 0;
- if (!io_msg_alloc_async_prep(req))
+ kmsg = io_msg_alloc_async(req);
+ if (unlikely(!kmsg))
return -ENOMEM;
- iomsg = req->async_data;
- ret = io_recvmsg_copy_hdr(req, iomsg);
+
+ if (req->opcode == IORING_OP_RECV) {
+ kmsg->msg.msg_name = NULL;
+ kmsg->msg.msg_namelen = 0;
+ kmsg->msg.msg_control = NULL;
+ kmsg->msg.msg_get_inq = 1;
+ kmsg->msg.msg_controllen = 0;
+ kmsg->msg.msg_iocb = NULL;
+ kmsg->msg.msg_ubuf = NULL;
+
+ if (!io_do_buffer_select(req)) {
+ ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
+ &kmsg->msg.msg_iter);
+ if (unlikely(ret))
+ return ret;
+ }
+ return 0;
+ }
+
+ ret = io_recvmsg_copy_hdr(req, kmsg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
}
-#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)
+#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
+ IORING_RECVSEND_BUNDLE)
int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
@@ -641,21 +764,14 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
sr->flags = READ_ONCE(sqe->ioprio);
- if (sr->flags & ~(RECVMSG_FLAGS))
+ if (sr->flags & ~RECVMSG_FLAGS)
return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
if (sr->msg_flags & MSG_ERRQUEUE)
req->flags |= REQ_F_CLEAR_POLLIN;
- if (sr->flags & IORING_RECV_MULTISHOT) {
- if (!(req->flags & REQ_F_BUFFER_SELECT))
- return -EINVAL;
- if (sr->msg_flags & MSG_WAITALL)
- return -EINVAL;
- if (req->opcode == IORING_OP_RECV && sr->len)
- return -EINVAL;
- req->flags |= REQ_F_APOLL_MULTISHOT;
+ if (req->flags & REQ_F_BUFFER_SELECT) {
/*
* Store the buffer group for this multishot receive separately,
* as if we end up doing an io-wq based issue that selects a
@@ -665,6 +781,20 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
* restore it.
*/
sr->buf_group = req->buf_index;
+ req->buf_list = NULL;
+ }
+ if (sr->flags & IORING_RECV_MULTISHOT) {
+ if (!(req->flags & REQ_F_BUFFER_SELECT))
+ return -EINVAL;
+ if (sr->msg_flags & MSG_WAITALL)
+ return -EINVAL;
+ if (req->opcode == IORING_OP_RECV && sr->len)
+ return -EINVAL;
+ req->flags |= REQ_F_APOLL_MULTISHOT;
+ }
+ if (sr->flags & IORING_RECVSEND_BUNDLE) {
+ if (req->opcode == IORING_OP_RECVMSG)
+ return -EINVAL;
}
#ifdef CONFIG_COMPAT
@@ -672,17 +802,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
sr->nr_multishot_loops = 0;
- return 0;
-}
-
-static inline void io_recv_prep_retry(struct io_kiocb *req)
-{
- struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
-
- req->flags &= ~REQ_F_BL_EMPTY;
- sr->done_io = 0;
- sr->len = 0; /* get from the provided buffer */
- req->buf_index = sr->buf_group;
+ return io_recvmsg_prep_setup(req);
}
/*
@@ -692,28 +812,36 @@ static inline void io_recv_prep_retry(struct io_kiocb *req)
* again (for multishot).
*/
static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
- struct msghdr *msg, bool mshot_finished,
- unsigned issue_flags)
+ struct io_async_msghdr *kmsg,
+ bool mshot_finished, unsigned issue_flags)
{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
unsigned int cflags;
- cflags = io_put_kbuf(req, issue_flags);
- if (msg->msg_inq > 0)
+ if (sr->flags & IORING_RECVSEND_BUNDLE)
+ cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
+ issue_flags);
+ else
+ cflags = io_put_kbuf(req, issue_flags);
+
+ if (kmsg->msg.msg_inq > 0)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
+ /* bundle with no more immediate buffers, we're done */
+ if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY)
+ goto finish;
+
/*
* Fill CQE for this receive and see if we should keep trying to
* receive from this socket.
*/
if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
- io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
- *ret, cflags | IORING_CQE_F_MORE)) {
- struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;
- io_recv_prep_retry(req);
+ io_mshot_prep_retry(req, kmsg);
/* Known not-empty or unknown state, retry */
- if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq < 0) {
+ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
return false;
/* mshot retries exceeded, force a requeue */
@@ -728,12 +856,14 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
}
/* Finish the request / stop multishot. */
+finish:
io_req_set_res(req, *ret, cflags);
if (issue_flags & IO_URING_F_MULTISHOT)
*ret = IOU_STOP_MULTISHOT;
else
*ret = IOU_OK;
+ io_req_msg_cleanup(req, issue_flags);
return true;
}
@@ -824,7 +954,7 @@ static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr iomsg, *kmsg;
+ struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned flags;
int ret, min_ret = 0;
@@ -835,18 +965,9 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- if (req_has_async_data(req)) {
- kmsg = req->async_data;
- } else {
- ret = io_recvmsg_copy_hdr(req, &iomsg);
- if (ret)
- return ret;
- kmsg = &iomsg;
- }
-
if (!(req->flags & REQ_F_POLLED) &&
(sr->flags & IORING_RECVSEND_POLL_FIRST))
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
flags = sr->msg_flags;
if (force_nonblock)
@@ -888,17 +1009,16 @@ retry_multishot:
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock) {
- ret = io_setup_async_msg(req, kmsg, issue_flags);
- if (ret == -EAGAIN && (issue_flags & IO_URING_F_MULTISHOT)) {
+ if (issue_flags & IO_URING_F_MULTISHOT) {
io_kbuf_recycle(req, issue_flags);
return IOU_ISSUE_SKIP_COMPLETE;
}
- return ret;
+ return -EAGAIN;
}
if (ret > 0 && io_net_retry(sock, flags)) {
sr->done_io += ret;
req->flags |= REQ_F_BL_NO_RECYCLE;
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -914,21 +1034,79 @@ retry_multishot:
else
io_kbuf_recycle(req, issue_flags);
- if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags))
+ if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
goto retry_multishot;
- if (mshot_finished)
- io_req_msg_cleanup(req, kmsg, issue_flags);
- else if (ret == -EAGAIN)
- return io_setup_async_msg(req, kmsg, issue_flags);
-
return ret;
}
+static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
+ size_t *len, unsigned int issue_flags)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ int ret;
+
+ /*
+ * If the ring isn't locked, then don't use the peek interface
+ * to grab multiple buffers as we will lock/unlock between
+ * this selection and posting the buffers.
+ */
+ if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+ sr->flags & IORING_RECVSEND_BUNDLE) {
+ struct buf_sel_arg arg = {
+ .iovs = &kmsg->fast_iov,
+ .nr_iovs = 1,
+ .mode = KBUF_MODE_EXPAND,
+ };
+
+ if (kmsg->free_iov) {
+ arg.nr_iovs = kmsg->free_iov_nr;
+ arg.iovs = kmsg->free_iov;
+ arg.mode |= KBUF_MODE_FREE;
+ }
+
+ if (kmsg->msg.msg_inq > 0)
+ arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq);
+
+ ret = io_buffers_peek(req, &arg);
+ if (unlikely(ret < 0))
+ return ret;
+
+ /* special case 1 vec, can be a fast path */
+ if (ret == 1) {
+ sr->buf = arg.iovs[0].iov_base;
+ sr->len = arg.iovs[0].iov_len;
+ goto map_ubuf;
+ }
+ iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
+ arg.out_len);
+ if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
+ kmsg->free_iov_nr = ret;
+ kmsg->free_iov = arg.iovs;
+ }
+ } else {
+ void __user *buf;
+
+ *len = sr->len;
+ buf = io_buffer_select(req, len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ sr->buf = buf;
+ sr->len = *len;
+map_ubuf:
+ ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
+ &kmsg->msg.msg_iter);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ return 0;
+}
+
int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct msghdr msg;
+ struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned flags;
int ret, min_ret = 0;
@@ -943,40 +1121,25 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- msg.msg_name = NULL;
- msg.msg_namelen = 0;
- msg.msg_control = NULL;
- msg.msg_get_inq = 1;
- msg.msg_controllen = 0;
- msg.msg_iocb = NULL;
- msg.msg_ubuf = NULL;
-
flags = sr->msg_flags;
if (force_nonblock)
flags |= MSG_DONTWAIT;
retry_multishot:
if (io_do_buffer_select(req)) {
- void __user *buf;
-
- buf = io_buffer_select(req, &len, issue_flags);
- if (!buf)
- return -ENOBUFS;
- sr->buf = buf;
- sr->len = len;
+ ret = io_recv_buf_select(req, kmsg, &len, issue_flags);
+ if (unlikely(ret))
+ goto out_free;
+ sr->buf = NULL;
}
- ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter);
- if (unlikely(ret))
- goto out_free;
-
- msg.msg_inq = -1;
- msg.msg_flags = 0;
+ kmsg->msg.msg_inq = -1;
+ kmsg->msg.msg_flags = 0;
if (flags & MSG_WAITALL)
- min_ret = iov_iter_count(&msg.msg_iter);
+ min_ret = iov_iter_count(&kmsg->msg.msg_iter);
- ret = sock_recvmsg(sock, &msg, flags);
+ ret = sock_recvmsg(sock, &kmsg->msg, flags);
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock) {
if (issue_flags & IO_URING_F_MULTISHOT) {
@@ -996,7 +1159,7 @@ retry_multishot:
if (ret == -ERESTARTSYS)
ret = -EINTR;
req_set_fail(req);
- } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+ } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
req_set_fail(req);
}
@@ -1008,7 +1171,7 @@ out_free:
else
io_kbuf_recycle(req, issue_flags);
- if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags))
+ if (!io_recv_finish(req, &ret, kmsg, ret <= 0, issue_flags))
goto retry_multishot;
return ret;
@@ -1017,14 +1180,10 @@ out_free:
void io_send_zc_cleanup(struct io_kiocb *req)
{
struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr *io;
+ struct io_async_msghdr *io = req->async_data;
- if (req_has_async_data(req)) {
- io = req->async_data;
- /* might be ->fast_iov if *msg_copy_hdr failed */
- if (io->free_iov != io->fast_iov)
- kfree(io->free_iov);
- }
+ if (req_has_async_data(req))
+ io_netmsg_iovec_free(io);
if (zc->notif) {
io_notif_flush(zc->notif);
zc->notif = NULL;
@@ -1041,6 +1200,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_kiocb *notif;
zc->done_io = 0;
+ req->flags |= REQ_F_POLL_NO_LAZY;
if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
return -EINVAL;
@@ -1061,8 +1221,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (zc->flags & ~IO_ZC_FLAGS_VALID)
return -EINVAL;
if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
- io_notif_set_extended(notif);
- io_notif_to_data(notif)->zc_report = true;
+ struct io_notif_data *nd = io_notif_to_data(notif);
+
+ nd->zc_report = true;
+ nd->zc_used = false;
+ nd->zc_copied = false;
}
}
@@ -1090,7 +1253,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
zc->len = READ_ONCE(sqe->len);
- zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+ zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
if (zc->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
@@ -1098,7 +1261,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
zc->msg_flags |= MSG_CMSG_COMPAT;
#endif
- return 0;
+ return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC);
}
static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb,
@@ -1159,11 +1322,34 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
return ret;
}
+static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ int ret;
+
+ if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
+ ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, req->imu,
+ (u64)(uintptr_t)sr->buf, sr->len);
+ if (unlikely(ret))
+ return ret;
+ kmsg->msg.sg_from_iter = io_sg_from_iter;
+ } else {
+ ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
+ if (unlikely(ret))
+ return ret;
+ ret = io_notif_account_mem(sr->notif, sr->len);
+ if (unlikely(ret))
+ return ret;
+ kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
+ }
+
+ return ret;
+}
+
int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
{
- struct sockaddr_storage __address;
struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct msghdr msg;
+ struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned msg_flags;
int ret, min_ret = 0;
@@ -1174,67 +1360,37 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
return -EOPNOTSUPP;
- msg.msg_name = NULL;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_namelen = 0;
-
- if (zc->addr) {
- if (req_has_async_data(req)) {
- struct io_async_msghdr *io = req->async_data;
-
- msg.msg_name = &io->addr;
- } else {
- ret = move_addr_to_kernel(zc->addr, zc->addr_len, &__address);
- if (unlikely(ret < 0))
- return ret;
- msg.msg_name = (struct sockaddr *)&__address;
- }
- msg.msg_namelen = zc->addr_len;
- }
-
if (!(req->flags & REQ_F_POLLED) &&
(zc->flags & IORING_RECVSEND_POLL_FIRST))
- return io_setup_async_addr(req, &__address, issue_flags);
+ return -EAGAIN;
- if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
- ret = io_import_fixed(ITER_SOURCE, &msg.msg_iter, req->imu,
- (u64)(uintptr_t)zc->buf, zc->len);
+ if (!zc->done_io) {
+ ret = io_send_zc_import(req, kmsg);
if (unlikely(ret))
return ret;
- msg.sg_from_iter = io_sg_from_iter;
- } else {
- io_notif_set_extended(zc->notif);
- ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter);
- if (unlikely(ret))
- return ret;
- ret = io_notif_account_mem(zc->notif, zc->len);
- if (unlikely(ret))
- return ret;
- msg.sg_from_iter = io_sg_from_iter_iovec;
}
- msg_flags = zc->msg_flags | MSG_ZEROCOPY;
+ msg_flags = zc->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
msg_flags |= MSG_DONTWAIT;
if (msg_flags & MSG_WAITALL)
- min_ret = iov_iter_count(&msg.msg_iter);
+ min_ret = iov_iter_count(&kmsg->msg.msg_iter);
msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
- msg.msg_flags = msg_flags;
- msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
- ret = sock_sendmsg(sock, &msg);
+ kmsg->msg.msg_flags = msg_flags;
+ kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
+ ret = sock_sendmsg(sock, &kmsg->msg);
if (unlikely(ret < min_ret)) {
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
- return io_setup_async_addr(req, &__address, issue_flags);
+ return -EAGAIN;
- if (ret > 0 && io_net_retry(sock, msg.msg_flags)) {
+ if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
zc->len -= ret;
zc->buf += ret;
zc->done_io += ret;
req->flags |= REQ_F_BL_NO_RECYCLE;
- return io_setup_async_addr(req, &__address, issue_flags);
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -1252,7 +1408,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
*/
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
io_notif_flush(zc->notif);
- req->flags &= ~REQ_F_NEED_CLEANUP;
+ io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
return IOU_OK;
@@ -1261,63 +1417,46 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr iomsg, *kmsg;
+ struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned flags;
int ret, min_ret = 0;
- io_notif_set_extended(sr->notif);
-
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
return -EOPNOTSUPP;
- if (req_has_async_data(req)) {
- kmsg = req->async_data;
- kmsg->msg.msg_control_user = sr->msg_control;
- } else {
- ret = io_sendmsg_copy_hdr(req, &iomsg);
- if (ret)
- return ret;
- kmsg = &iomsg;
- }
-
if (!(req->flags & REQ_F_POLLED) &&
(sr->flags & IORING_RECVSEND_POLL_FIRST))
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
- flags = sr->msg_flags | MSG_ZEROCOPY;
+ flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+ kmsg->msg.msg_control_user = sr->msg_control;
kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
if (unlikely(ret < min_ret)) {
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
if (ret > 0 && io_net_retry(sock, flags)) {
sr->done_io += ret;
req->flags |= REQ_F_BL_NO_RECYCLE;
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
req_set_fail(req);
}
- /* fast path, check for non-NULL to avoid function call */
- if (kmsg->free_iov) {
- kfree(kmsg->free_iov);
- kmsg->free_iov = NULL;
- }
- io_netmsg_recycle(req, issue_flags);
if (ret >= 0)
ret += sr->done_io;
else if (sr->done_io)
@@ -1329,7 +1468,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
*/
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
io_notif_flush(sr->notif);
- req->flags &= ~REQ_F_NEED_CLEANUP;
+ io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
return IOU_OK;
@@ -1429,8 +1568,7 @@ retry:
if (ret < 0)
return ret;
- if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
- ret, IORING_CQE_F_MORE))
+ if (io_req_post_cqe(req, ret, IORING_CQE_F_MORE))
goto retry;
io_req_set_res(req, ret, 0);
@@ -1491,17 +1629,10 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags)
return IOU_OK;
}
-int io_connect_prep_async(struct io_kiocb *req)
-{
- struct io_async_connect *io = req->async_data;
- struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
-
- return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
-}
-
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
+ struct io_async_msghdr *io;
if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
@@ -1509,32 +1640,26 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
conn->addr_len = READ_ONCE(sqe->addr2);
conn->in_progress = conn->seen_econnaborted = false;
- return 0;
+
+ io = io_msg_alloc_async(req);
+ if (unlikely(!io))
+ return -ENOMEM;
+
+ return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
}
int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
- struct io_async_connect __io, *io;
+ struct io_async_msghdr *io = req->async_data;
unsigned file_flags;
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- if (req_has_async_data(req)) {
- io = req->async_data;
- } else {
- ret = move_addr_to_kernel(connect->addr,
- connect->addr_len,
- &__io.address);
- if (ret)
- goto out;
- io = &__io;
- }
-
file_flags = force_nonblock ? O_NONBLOCK : 0;
- ret = __sys_connect_file(req->file, &io->address,
- connect->addr_len, file_flags);
+ ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
+ file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
&& force_nonblock) {
if (ret == -EINPROGRESS) {
@@ -1544,13 +1669,6 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
goto out;
connect->seen_econnaborted = true;
}
- if (req_has_async_data(req))
- return -EAGAIN;
- if (io_alloc_async_data(req)) {
- ret = -ENOMEM;
- goto out;
- }
- memcpy(req->async_data, &__io, sizeof(__io));
return -EAGAIN;
}
if (connect->in_progress) {
@@ -1568,12 +1686,20 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
out:
if (ret < 0)
req_set_fail(req);
+ io_req_msg_cleanup(req, issue_flags);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
-void io_netmsg_cache_free(struct io_cache_entry *entry)
+void io_netmsg_cache_free(const void *entry)
{
- kfree(container_of(entry, struct io_async_msghdr, cache));
+ struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
+
+ if (kmsg->free_iov) {
+ kasan_mempool_unpoison_object(kmsg->free_iov,
+ kmsg->free_iov_nr * sizeof(struct iovec));
+ io_netmsg_iovec_free(kmsg);
+ }
+ kfree(kmsg);
}
#endif
diff --git a/io_uring/net.h b/io_uring/net.h
index 191009979bcb08..0eb1c1920fc9a9 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -3,22 +3,15 @@
#include <linux/net.h>
#include <linux/uio.h>
-#include "alloc_cache.h"
-
struct io_async_msghdr {
#if defined(CONFIG_NET)
- union {
- struct iovec fast_iov[UIO_FASTIOV];
- struct {
- struct iovec fast_iov_one;
- __kernel_size_t controllen;
- int namelen;
- __kernel_size_t payloadlen;
- };
- struct io_cache_entry cache;
- };
+ struct iovec fast_iov;
/* points to an allocated iov, if NULL we use fast_iov instead */
struct iovec *free_iov;
+ int free_iov_nr;
+ int namelen;
+ __kernel_size_t controllen;
+ __kernel_size_t payloadlen;
struct sockaddr __user *uaddr;
struct msghdr msg;
struct sockaddr_storage addr;
@@ -27,22 +20,15 @@ struct io_async_msghdr {
#if defined(CONFIG_NET)
-struct io_async_connect {
- struct sockaddr_storage address;
-};
-
int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_shutdown(struct io_kiocb *req, unsigned int issue_flags);
-int io_sendmsg_prep_async(struct io_kiocb *req);
void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req);
int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags);
int io_send(struct io_kiocb *req, unsigned int issue_flags);
-int io_send_prep_async(struct io_kiocb *req);
-int io_recvmsg_prep_async(struct io_kiocb *req);
int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags);
int io_recv(struct io_kiocb *req, unsigned int issue_flags);
@@ -55,7 +41,6 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags);
int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_socket(struct io_kiocb *req, unsigned int issue_flags);
-int io_connect_prep_async(struct io_kiocb *req);
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_connect(struct io_kiocb *req, unsigned int issue_flags);
@@ -64,9 +49,9 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags);
int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
void io_send_zc_cleanup(struct io_kiocb *req);
-void io_netmsg_cache_free(struct io_cache_entry *entry);
+void io_netmsg_cache_free(const void *entry);
#else
-static inline void io_netmsg_cache_free(struct io_cache_entry *entry)
+static inline void io_netmsg_cache_free(const void *entry)
{
}
#endif
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 204f0fc7d96634..28859ae3ee6eb9 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -9,35 +9,36 @@
#include "notif.h"
#include "rsrc.h"
-static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state *ts)
+static const struct ubuf_info_ops io_ubuf_ops;
+
+static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts)
{
struct io_notif_data *nd = io_notif_to_data(notif);
- struct io_ring_ctx *ctx = notif->ctx;
- if (nd->zc_report && (nd->zc_copied || !nd->zc_used))
- notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED;
+ do {
+ notif = cmd_to_io_kiocb(nd);
- if (nd->account_pages && ctx->user) {
- __io_unaccount_mem(ctx->user, nd->account_pages);
- nd->account_pages = 0;
- }
- io_req_task_complete(notif, ts);
-}
+ lockdep_assert(refcount_read(&nd->uarg.refcnt) == 0);
-static void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg,
- bool success)
-{
- struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
- struct io_kiocb *notif = cmd_to_io_kiocb(nd);
+ if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used))
+ notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED;
+
+ if (nd->account_pages && notif->ctx->user) {
+ __io_unaccount_mem(notif->ctx->user, nd->account_pages);
+ nd->account_pages = 0;
+ }
- if (refcount_dec_and_test(&uarg->refcnt))
- __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE);
+ nd = nd->next;
+ io_req_task_complete(notif, ts);
+ } while (nd);
}
-static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg,
- bool success)
+void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg,
+ bool success)
{
struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
+ struct io_kiocb *notif = cmd_to_io_kiocb(nd);
+ unsigned tw_flags;
if (nd->zc_report) {
if (success && !nd->zc_used && skb)
@@ -45,31 +46,64 @@ static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg,
else if (!success && !nd->zc_copied)
WRITE_ONCE(nd->zc_copied, true);
}
- io_tx_ubuf_complete(skb, uarg, success);
-}
-static const struct ubuf_info_ops io_ubuf_ops = {
- .complete = io_tx_ubuf_complete,
-};
+ if (!refcount_dec_and_test(&uarg->refcnt))
+ return;
-static const struct ubuf_info_ops io_ubuf_ops_ext = {
- .complete = io_tx_ubuf_callback_ext,
-};
+ if (nd->head != nd) {
+ io_tx_ubuf_complete(skb, &nd->head->uarg, success);
+ return;
+ }
+
+ tw_flags = nd->next ? 0 : IOU_F_TWQ_LAZY_WAKE;
+ notif->io_task_work.func = io_notif_tw_complete;
+ __io_req_task_work_add(notif, tw_flags);
+}
-void io_notif_set_extended(struct io_kiocb *notif)
+static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg)
{
- struct io_notif_data *nd = io_notif_to_data(notif);
+ struct io_notif_data *nd, *prev_nd;
+ struct io_kiocb *prev_notif, *notif;
+ struct ubuf_info *prev_uarg = skb_zcopy(skb);
- if (nd->uarg.ops != &io_ubuf_ops_ext) {
- nd->account_pages = 0;
- nd->zc_report = false;
- nd->zc_used = false;
- nd->zc_copied = false;
- nd->uarg.ops = &io_ubuf_ops_ext;
- notif->io_task_work.func = io_notif_complete_tw_ext;
+ nd = container_of(uarg, struct io_notif_data, uarg);
+ notif = cmd_to_io_kiocb(nd);
+
+ if (!prev_uarg) {
+ net_zcopy_get(&nd->uarg);
+ skb_zcopy_init(skb, &nd->uarg);
+ return 0;
}
+ /* handle it separately as we can't link a notif to itself */
+ if (unlikely(prev_uarg == &nd->uarg))
+ return 0;
+ /* we can't join two links together, just request a fresh skb */
+ if (unlikely(nd->head != nd || nd->next))
+ return -EEXIST;
+ /* don't mix zc providers */
+ if (unlikely(prev_uarg->ops != &io_ubuf_ops))
+ return -EEXIST;
+
+ prev_nd = container_of(prev_uarg, struct io_notif_data, uarg);
+ prev_notif = cmd_to_io_kiocb(nd);
+
+ /* make sure all noifications can be finished in the same task_work */
+ if (unlikely(notif->ctx != prev_notif->ctx ||
+ notif->task != prev_notif->task))
+ return -EEXIST;
+
+ nd->head = prev_nd->head;
+ nd->next = prev_nd->next;
+ prev_nd->next = nd;
+ net_zcopy_get(&nd->head->uarg);
+ return 0;
}
+static const struct ubuf_info_ops io_ubuf_ops = {
+ .complete = io_tx_ubuf_complete,
+ .link_skb = io_link_skb,
+};
+
struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
@@ -84,9 +118,13 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
notif->task = current;
io_get_task_refs(1);
notif->rsrc_node = NULL;
- notif->io_task_work.func = io_req_task_complete;
nd = io_notif_to_data(notif);
+ nd->zc_report = false;
+ nd->account_pages = 0;
+ nd->next = NULL;
+ nd->head = nd;
+
nd->uarg.flags = IO_NOTIF_UBUF_FLAGS;
nd->uarg.ops = &io_ubuf_ops;
refcount_set(&nd->uarg.refcnt, 1);
diff --git a/io_uring/notif.h b/io_uring/notif.h
index 86d32bd9f85683..f3589cfef4a9c9 100644
--- a/io_uring/notif.h
+++ b/io_uring/notif.h
@@ -13,14 +13,19 @@
struct io_notif_data {
struct file *file;
struct ubuf_info uarg;
- unsigned long account_pages;
+
+ struct io_notif_data *next;
+ struct io_notif_data *head;
+
+ unsigned account_pages;
bool zc_report;
bool zc_used;
bool zc_copied;
};
struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx);
-void io_notif_set_extended(struct io_kiocb *notif);
+void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg,
+ bool success);
static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif)
{
@@ -32,9 +37,7 @@ static inline void io_notif_flush(struct io_kiocb *notif)
{
struct io_notif_data *nd = io_notif_to_data(notif);
- /* drop slot's master ref */
- if (refcount_dec_and_test(&nd->uarg.refcnt))
- __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE);
+ io_tx_ubuf_complete(NULL, &nd->uarg, true);
}
static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len)
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 9c080aadc5a662..2de5cca9504eb4 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -67,7 +67,8 @@ const struct io_issue_def io_issue_defs[] = {
.iopoll = 1,
.iopoll_queue = 1,
.vectored = 1,
- .prep = io_prep_rwv,
+ .async_size = sizeof(struct io_async_rw),
+ .prep = io_prep_readv,
.issue = io_read,
},
[IORING_OP_WRITEV] = {
@@ -81,7 +82,8 @@ const struct io_issue_def io_issue_defs[] = {
.iopoll = 1,
.iopoll_queue = 1,
.vectored = 1,
- .prep = io_prep_rwv,
+ .async_size = sizeof(struct io_async_rw),
+ .prep = io_prep_writev,
.issue = io_write,
},
[IORING_OP_FSYNC] = {
@@ -99,7 +101,8 @@ const struct io_issue_def io_issue_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
- .prep = io_prep_rw_fixed,
+ .async_size = sizeof(struct io_async_rw),
+ .prep = io_prep_read_fixed,
.issue = io_read,
},
[IORING_OP_WRITE_FIXED] = {
@@ -112,7 +115,8 @@ const struct io_issue_def io_issue_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
- .prep = io_prep_rw_fixed,
+ .async_size = sizeof(struct io_async_rw),
+ .prep = io_prep_write_fixed,
.issue = io_write,
},
[IORING_OP_POLL_ADD] = {
@@ -138,8 +142,8 @@ const struct io_issue_def io_issue_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.ioprio = 1,
- .manual_alloc = 1,
#if defined(CONFIG_NET)
+ .async_size = sizeof(struct io_async_msghdr),
.prep = io_sendmsg_prep,
.issue = io_sendmsg,
#else
@@ -152,8 +156,8 @@ const struct io_issue_def io_issue_defs[] = {
.pollin = 1,
.buffer_select = 1,
.ioprio = 1,
- .manual_alloc = 1,
#if defined(CONFIG_NET)
+ .async_size = sizeof(struct io_async_msghdr),
.prep = io_recvmsg_prep,
.issue = io_recvmsg,
#else
@@ -162,6 +166,7 @@ const struct io_issue_def io_issue_defs[] = {
},
[IORING_OP_TIMEOUT] = {
.audit_skip = 1,
+ .async_size = sizeof(struct io_timeout_data),
.prep = io_timeout_prep,
.issue = io_timeout,
},
@@ -191,6 +196,7 @@ const struct io_issue_def io_issue_defs[] = {
},
[IORING_OP_LINK_TIMEOUT] = {
.audit_skip = 1,
+ .async_size = sizeof(struct io_timeout_data),
.prep = io_link_timeout_prep,
.issue = io_no_issue,
},
@@ -199,6 +205,7 @@ const struct io_issue_def io_issue_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
#if defined(CONFIG_NET)
+ .async_size = sizeof(struct io_async_msghdr),
.prep = io_connect_prep,
.issue = io_connect,
#else
@@ -239,7 +246,8 @@ const struct io_issue_def io_issue_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
- .prep = io_prep_rw,
+ .async_size = sizeof(struct io_async_rw),
+ .prep = io_prep_read,
.issue = io_read,
},
[IORING_OP_WRITE] = {
@@ -252,7 +260,8 @@ const struct io_issue_def io_issue_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
- .prep = io_prep_rw,
+ .async_size = sizeof(struct io_async_rw),
+ .prep = io_prep_write,
.issue = io_write,
},
[IORING_OP_FADVISE] = {
@@ -272,8 +281,9 @@ const struct io_issue_def io_issue_defs[] = {
.pollout = 1,
.audit_skip = 1,
.ioprio = 1,
- .manual_alloc = 1,
+ .buffer_select = 1,
#if defined(CONFIG_NET)
+ .async_size = sizeof(struct io_async_msghdr),
.prep = io_sendmsg_prep,
.issue = io_send,
#else
@@ -288,6 +298,7 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
#if defined(CONFIG_NET)
+ .async_size = sizeof(struct io_async_msghdr),
.prep = io_recvmsg_prep,
.issue = io_recv,
#else
@@ -403,6 +414,7 @@ const struct io_issue_def io_issue_defs[] = {
.plug = 1,
.iopoll = 1,
.iopoll_queue = 1,
+ .async_size = 2 * sizeof(struct io_uring_sqe),
.prep = io_uring_cmd_prep,
.issue = io_uring_cmd,
},
@@ -412,8 +424,8 @@ const struct io_issue_def io_issue_defs[] = {
.pollout = 1,
.audit_skip = 1,
.ioprio = 1,
- .manual_alloc = 1,
#if defined(CONFIG_NET)
+ .async_size = sizeof(struct io_async_msghdr),
.prep = io_send_zc_prep,
.issue = io_send_zc,
#else
@@ -425,8 +437,8 @@ const struct io_issue_def io_issue_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.ioprio = 1,
- .manual_alloc = 1,
#if defined(CONFIG_NET)
+ .async_size = sizeof(struct io_async_msghdr),
.prep = io_send_zc_prep,
.issue = io_sendmsg_zc,
#else
@@ -439,10 +451,12 @@ const struct io_issue_def io_issue_defs[] = {
.pollin = 1,
.buffer_select = 1,
.audit_skip = 1,
+ .async_size = sizeof(struct io_async_rw),
.prep = io_read_mshot_prep,
.issue = io_read_mshot,
},
[IORING_OP_WAITID] = {
+ .async_size = sizeof(struct io_waitid_async),
.prep = io_waitid_prep,
.issue = io_waitid,
},
@@ -488,16 +502,12 @@ const struct io_cold_def io_cold_defs[] = {
.name = "NOP",
},
[IORING_OP_READV] = {
- .async_size = sizeof(struct io_async_rw),
.name = "READV",
- .prep_async = io_readv_prep_async,
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
[IORING_OP_WRITEV] = {
- .async_size = sizeof(struct io_async_rw),
.name = "WRITEV",
- .prep_async = io_writev_prep_async,
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
@@ -505,12 +515,10 @@ const struct io_cold_def io_cold_defs[] = {
.name = "FSYNC",
},
[IORING_OP_READ_FIXED] = {
- .async_size = sizeof(struct io_async_rw),
.name = "READ_FIXED",
.fail = io_rw_fail,
},
[IORING_OP_WRITE_FIXED] = {
- .async_size = sizeof(struct io_async_rw),
.name = "WRITE_FIXED",
.fail = io_rw_fail,
},
@@ -526,8 +534,6 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_SENDMSG] = {
.name = "SENDMSG",
#if defined(CONFIG_NET)
- .async_size = sizeof(struct io_async_msghdr),
- .prep_async = io_sendmsg_prep_async,
.cleanup = io_sendmsg_recvmsg_cleanup,
.fail = io_sendrecv_fail,
#endif
@@ -535,14 +541,11 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_RECVMSG] = {
.name = "RECVMSG",
#if defined(CONFIG_NET)
- .async_size = sizeof(struct io_async_msghdr),
- .prep_async = io_recvmsg_prep_async,
.cleanup = io_sendmsg_recvmsg_cleanup,
.fail = io_sendrecv_fail,
#endif
},
[IORING_OP_TIMEOUT] = {
- .async_size = sizeof(struct io_timeout_data),
.name = "TIMEOUT",
},
[IORING_OP_TIMEOUT_REMOVE] = {
@@ -555,15 +558,10 @@ const struct io_cold_def io_cold_defs[] = {
.name = "ASYNC_CANCEL",
},
[IORING_OP_LINK_TIMEOUT] = {
- .async_size = sizeof(struct io_timeout_data),
.name = "LINK_TIMEOUT",
},
[IORING_OP_CONNECT] = {
.name = "CONNECT",
-#if defined(CONFIG_NET)
- .async_size = sizeof(struct io_async_connect),
- .prep_async = io_connect_prep_async,
-#endif
},
[IORING_OP_FALLOCATE] = {
.name = "FALLOCATE",
@@ -583,12 +581,10 @@ const struct io_cold_def io_cold_defs[] = {
.cleanup = io_statx_cleanup,
},
[IORING_OP_READ] = {
- .async_size = sizeof(struct io_async_rw),
.name = "READ",
.fail = io_rw_fail,
},
[IORING_OP_WRITE] = {
- .async_size = sizeof(struct io_async_rw),
.name = "WRITE",
.fail = io_rw_fail,
},
@@ -601,14 +597,14 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_SEND] = {
.name = "SEND",
#if defined(CONFIG_NET)
- .async_size = sizeof(struct io_async_msghdr),
+ .cleanup = io_sendmsg_recvmsg_cleanup,
.fail = io_sendrecv_fail,
- .prep_async = io_send_prep_async,
#endif
},
[IORING_OP_RECV] = {
.name = "RECV",
#if defined(CONFIG_NET)
+ .cleanup = io_sendmsg_recvmsg_cleanup,
.fail = io_sendrecv_fail,
#endif
},
@@ -679,14 +675,10 @@ const struct io_cold_def io_cold_defs[] = {
},
[IORING_OP_URING_CMD] = {
.name = "URING_CMD",
- .async_size = 2 * sizeof(struct io_uring_sqe),
- .prep_async = io_uring_cmd_prep_async,
},
[IORING_OP_SEND_ZC] = {
.name = "SEND_ZC",
#if defined(CONFIG_NET)
- .async_size = sizeof(struct io_async_msghdr),
- .prep_async = io_send_prep_async,
.cleanup = io_send_zc_cleanup,
.fail = io_sendrecv_fail,
#endif
@@ -694,8 +686,6 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_SENDMSG_ZC] = {
.name = "SENDMSG_ZC",
#if defined(CONFIG_NET)
- .async_size = sizeof(struct io_async_msghdr),
- .prep_async = io_sendmsg_prep_async,
.cleanup = io_send_zc_cleanup,
.fail = io_sendrecv_fail,
#endif
@@ -705,7 +695,6 @@ const struct io_cold_def io_cold_defs[] = {
},
[IORING_OP_WAITID] = {
.name = "WAITID",
- .async_size = sizeof(struct io_waitid_async),
},
[IORING_OP_FUTEX_WAIT] = {
.name = "FUTEX_WAIT",
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index 9e5435ec27d00f..7ee6f5aa90aa3c 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -27,22 +27,19 @@ struct io_issue_def {
unsigned iopoll : 1;
/* have to be put into the iopoll list */
unsigned iopoll_queue : 1;
- /* opcode specific path will handle ->async_data allocation if needed */
- unsigned manual_alloc : 1;
/* vectored opcode, set if 1) vectored, and 2) handler needs to know */
unsigned vectored : 1;
+ /* size of async data needed, if any */
+ unsigned short async_size;
+
int (*issue)(struct io_kiocb *, unsigned int);
int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
};
struct io_cold_def {
- /* size of async data needed, if any */
- unsigned short async_size;
-
const char *name;
- int (*prep_async)(struct io_kiocb *);
void (*cleanup)(struct io_kiocb *);
void (*fail)(struct io_kiocb *);
};
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 6db1dcb2c79776..0a8e02944689fa 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -14,6 +14,7 @@
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
+#include "alloc_cache.h"
#include "refs.h"
#include "napi.h"
#include "opdef.h"
@@ -322,8 +323,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
__poll_t mask = mangle_poll(req->cqe.res &
req->apoll_events);
- if (!io_fill_cqe_req_aux(req, ts->locked, mask,
- IORING_CQE_F_MORE)) {
+ if (!io_req_post_cqe(req, mask, IORING_CQE_F_MORE)) {
io_req_set_res(req, mask, 0);
return IOU_POLL_REMOVE_POLL_USE_RES;
}
@@ -687,17 +687,15 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_cache_entry *entry;
struct async_poll *apoll;
if (req->flags & REQ_F_POLLED) {
apoll = req->apoll;
kfree(apoll->double_poll);
} else if (!(issue_flags & IO_URING_F_UNLOCKED)) {
- entry = io_alloc_cache_get(&ctx->apoll_cache);
- if (entry == NULL)
+ apoll = io_alloc_cache_get(&ctx->apoll_cache);
+ if (!apoll)
goto alloc_apoll;
- apoll = container_of(entry, struct async_poll, cache);
apoll->poll.retries = APOLL_MAX_RETRY;
} else {
alloc_apoll:
@@ -1056,8 +1054,3 @@ out:
io_req_set_res(req, ret, 0);
return IOU_OK;
}
-
-void io_apoll_cache_free(struct io_cache_entry *entry)
-{
- kfree(container_of(entry, struct async_poll, cache));
-}
diff --git a/io_uring/poll.h b/io_uring/poll.h
index 1dacae9e816c92..b0e3745f5a29b9 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
-#include "alloc_cache.h"
+#define IO_POLL_ALLOC_CACHE_MAX 32
enum {
IO_APOLL_OK,
@@ -17,10 +17,7 @@ struct io_poll {
};
struct async_poll {
- union {
- struct io_poll poll;
- struct io_cache_entry cache;
- };
+ struct io_poll poll;
struct io_poll *double_poll;
};
@@ -46,6 +43,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags);
bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
bool cancel_all);
-void io_apoll_cache_free(struct io_cache_entry *entry);
-
void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts);
diff --git a/io_uring/refs.h b/io_uring/refs.h
index 1336de3f2a30aa..63982ead9f7dab 100644
--- a/io_uring/refs.h
+++ b/io_uring/refs.h
@@ -33,6 +33,13 @@ static inline void req_ref_get(struct io_kiocb *req)
atomic_inc(&req->refs);
}
+static inline void req_ref_put(struct io_kiocb *req)
+{
+ WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
+ WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+ atomic_dec(&req->refs);
+}
+
static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
if (!(req->flags & REQ_F_REFCOUNT)) {
diff --git a/io_uring/register.c b/io_uring/register.c
index 99c37775f974c0..ef8c908346a4ef 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -368,8 +368,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
/* now propagate the restriction to all registered users */
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
- struct io_uring_task *tctx = node->task->io_uring;
-
+ tctx = node->task->io_uring;
if (WARN_ON_ONCE(!tctx->io_wq))
continue;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 4818b79231ddb0..65417c9553b1d8 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -13,8 +13,10 @@
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
+#include "alloc_cache.h"
#include "openclose.h"
#include "rsrc.h"
+#include "memmap.h"
struct io_rsrc_update {
struct file *file;
@@ -169,7 +171,7 @@ static void io_rsrc_put_work(struct io_rsrc_node *node)
void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
- if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
+ if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
kfree(node);
}
@@ -197,12 +199,9 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
struct io_rsrc_node *ref_node;
- struct io_cache_entry *entry;
- entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
- if (entry) {
- ref_node = container_of(entry, struct io_rsrc_node, cache);
- } else {
+ ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache);
+ if (!ref_node) {
ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
if (!ref_node)
return NULL;
@@ -872,42 +871,6 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
return ret;
}
-struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
-{
- unsigned long start, end, nr_pages;
- struct page **pages = NULL;
- int ret;
-
- end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = ubuf >> PAGE_SHIFT;
- nr_pages = end - start;
- WARN_ON(!nr_pages);
-
- pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!pages)
- return ERR_PTR(-ENOMEM);
-
- mmap_read_lock(current->mm);
- ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages);
- mmap_read_unlock(current->mm);
-
- /* success, mapped all pages */
- if (ret == nr_pages) {
- *npages = nr_pages;
- return pages;
- }
-
- /* partial map, or didn't map anything */
- if (ret >= 0) {
- /* if we did partial map, release any pages we did get */
- if (ret)
- unpin_user_pages(pages, ret);
- ret = -EFAULT;
- }
- kvfree(pages);
- return ERR_PTR(ret);
-}
-
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
struct io_mapped_ubuf **pimu,
struct page **last_hpage)
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index e210002389540b..c032ca3436ca99 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -2,8 +2,6 @@
#ifndef IOU_RSRC_H
#define IOU_RSRC_H
-#include "alloc_cache.h"
-
#define IO_NODE_ALLOC_CACHE_MAX 32
#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
@@ -36,10 +34,7 @@ struct io_rsrc_data {
};
struct io_rsrc_node {
- union {
- struct io_cache_entry cache;
- struct io_ring_ctx *ctx;
- };
+ struct io_ring_ctx *ctx;
int refs;
bool empty;
u16 type;
@@ -88,12 +83,6 @@ static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node
io_rsrc_node_ref_zero(node);
}
-static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
- struct io_ring_ctx *ctx)
-{
- io_put_rsrc_node(ctx, req->rsrc_node);
-}
-
static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx,
struct io_rsrc_node *node)
{
diff --git a/io_uring/rw.c b/io_uring/rw.c
index d517a1a2927493..1a2128459cb4c8 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -18,6 +18,7 @@
#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"
+#include "alloc_cache.h"
#include "rsrc.h"
#include "poll.h"
#include "rw.h"
@@ -75,7 +76,179 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req)
return 0;
}
-int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int __io_import_iovec(int ddir, struct io_kiocb *req,
+ struct io_async_rw *io,
+ unsigned int issue_flags)
+{
+ const struct io_issue_def *def = &io_issue_defs[req->opcode];
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ struct iovec *iov;
+ void __user *buf;
+ int nr_segs, ret;
+ size_t sqe_len;
+
+ buf = u64_to_user_ptr(rw->addr);
+ sqe_len = rw->len;
+
+ if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) {
+ if (io_do_buffer_select(req)) {
+ buf = io_buffer_select(req, &sqe_len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ rw->addr = (unsigned long) buf;
+ rw->len = sqe_len;
+ }
+
+ return import_ubuf(ddir, buf, sqe_len, &io->iter);
+ }
+
+ if (io->free_iovec) {
+ nr_segs = io->free_iov_nr;
+ iov = io->free_iovec;
+ } else {
+ iov = &io->fast_iov;
+ nr_segs = 1;
+ }
+ ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter,
+ req->ctx->compat);
+ if (unlikely(ret < 0))
+ return ret;
+ if (iov) {
+ req->flags |= REQ_F_NEED_CLEANUP;
+ io->free_iov_nr = io->iter.nr_segs;
+ kfree(io->free_iovec);
+ io->free_iovec = iov;
+ }
+ return 0;
+}
+
+static inline int io_import_iovec(int rw, struct io_kiocb *req,
+ struct io_async_rw *io,
+ unsigned int issue_flags)
+{
+ int ret;
+
+ ret = __io_import_iovec(rw, req, io, issue_flags);
+ if (unlikely(ret < 0))
+ return ret;
+
+ iov_iter_save_state(&io->iter, &io->iter_state);
+ return 0;
+}
+
+static void io_rw_iovec_free(struct io_async_rw *rw)
+{
+ if (rw->free_iovec) {
+ kfree(rw->free_iovec);
+ rw->free_iov_nr = 0;
+ rw->free_iovec = NULL;
+ }
+}
+
+static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_async_rw *rw = req->async_data;
+ struct iovec *iov;
+
+ if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
+ io_rw_iovec_free(rw);
+ return;
+ }
+ iov = rw->free_iovec;
+ if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) {
+ if (iov)
+ kasan_mempool_poison_object(iov);
+ req->async_data = NULL;
+ req->flags &= ~REQ_F_ASYNC_DATA;
+ }
+}
+
+static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags)
+{
+ /*
+ * Disable quick recycling for anything that's gone through io-wq.
+ * In theory, this should be fine to cleanup. However, some read or
+ * write iter handling touches the iovec AFTER having called into the
+ * handler, eg to reexpand or revert. This means we can have:
+ *
+ * task io-wq
+ * issue
+ * punt to io-wq
+ * issue
+ * blkdev_write_iter()
+ * ->ki_complete()
+ * io_complete_rw()
+ * queue tw complete
+ * run tw
+ * req_rw_cleanup
+ * iov_iter_count() <- look at iov_iter again
+ *
+ * which can lead to a UAF. This is only possible for io-wq offload
+ * as the cleanup can run in parallel. As io-wq is not the fast path,
+ * just leave cleanup to the end.
+ *
+ * This is really a bug in the core code that does this, any issue
+ * path should assume that a successful (or -EIOCBQUEUED) return can
+ * mean that the underlying data can be gone at any time. But that
+ * should be fixed seperately, and then this check could be killed.
+ */
+ if (!(req->flags & REQ_F_REFCOUNT)) {
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ io_rw_recycle(req, issue_flags);
+ }
+}
+
+static int io_rw_alloc_async(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_async_rw *rw;
+
+ rw = io_alloc_cache_get(&ctx->rw_cache);
+ if (rw) {
+ if (rw->free_iovec) {
+ kasan_mempool_unpoison_object(rw->free_iovec,
+ rw->free_iov_nr * sizeof(struct iovec));
+ req->flags |= REQ_F_NEED_CLEANUP;
+ }
+ req->flags |= REQ_F_ASYNC_DATA;
+ req->async_data = rw;
+ goto done;
+ }
+
+ if (!io_alloc_async_data(req)) {
+ rw = req->async_data;
+ rw->free_iovec = NULL;
+ rw->free_iov_nr = 0;
+done:
+ rw->bytes_done = 0;
+ return 0;
+ }
+
+ return -ENOMEM;
+}
+
+static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
+{
+ struct io_async_rw *rw;
+ int ret;
+
+ if (io_rw_alloc_async(req))
+ return -ENOMEM;
+
+ if (!do_import || io_do_buffer_select(req))
+ return 0;
+
+ rw = req->async_data;
+ ret = io_import_iovec(ddir, req, rw, 0);
+ if (unlikely(ret < 0))
+ return ret;
+
+ iov_iter_save_state(&rw->iter, &rw->iter_state);
+ return 0;
+}
+
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ int ddir, bool do_import)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
unsigned ioprio;
@@ -100,34 +273,58 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
rw->flags = READ_ONCE(sqe->rw_flags);
- return 0;
+ return io_prep_rw_setup(req, ddir, do_import);
+}
+
+int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ return io_prep_rw(req, sqe, ITER_DEST, true);
}
-int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
+ return io_prep_rw(req, sqe, ITER_SOURCE, true);
+}
+
+static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ int ddir)
+{
+ const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT);
int ret;
- ret = io_prep_rw(req, sqe);
+ ret = io_prep_rw(req, sqe, ddir, do_import);
if (unlikely(ret))
return ret;
+ if (do_import)
+ return 0;
/*
* Have to do this validation here, as this is in io_read() rw->len
* might have chanaged due to buffer selection
*/
- if (req->flags & REQ_F_BUFFER_SELECT)
- return io_iov_buffer_select_prep(req);
+ return io_iov_buffer_select_prep(req);
+}
- return 0;
+int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ return io_prep_rwv(req, sqe, ITER_DEST);
+}
+
+int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ return io_prep_rwv(req, sqe, ITER_SOURCE);
}
-int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ int ddir)
{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
struct io_ring_ctx *ctx = req->ctx;
+ struct io_async_rw *io;
u16 index;
int ret;
- ret = io_prep_rw(req, sqe);
+ ret = io_prep_rw(req, sqe, ddir, false);
if (unlikely(ret))
return ret;
@@ -136,7 +333,21 @@ int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
req->imu = ctx->user_bufs[index];
io_req_set_rsrc_node(req, ctx, 0);
- return 0;
+
+ io = req->async_data;
+ ret = io_import_fixed(ddir, &io->iter, req->imu, rw->addr, rw->len);
+ iov_iter_save_state(&io->iter, &io->iter_state);
+ return ret;
+}
+
+int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ return io_prep_rw_fixed(req, sqe, ITER_DEST);
+}
+
+int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ return io_prep_rw_fixed(req, sqe, ITER_SOURCE);
}
/*
@@ -152,7 +363,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (!(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
- ret = io_prep_rw(req, sqe);
+ ret = io_prep_rw(req, sqe, ITER_DEST, false);
if (unlikely(ret))
return ret;
@@ -165,9 +376,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
void io_readv_writev_cleanup(struct io_kiocb *req)
{
- struct io_async_rw *io = req->async_data;
-
- kfree(io->free_iovec);
+ io_rw_iovec_free(req->async_data);
}
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
@@ -187,21 +396,12 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
return NULL;
}
-static void io_req_task_queue_reissue(struct io_kiocb *req)
-{
- req->io_task_work.func = io_queue_iowq;
- io_req_task_work_add(req);
-}
-
#ifdef CONFIG_BLOCK
-static bool io_resubmit_prep(struct io_kiocb *req)
+static void io_resubmit_prep(struct io_kiocb *req)
{
struct io_async_rw *io = req->async_data;
- if (!req_has_async_data(req))
- return !io_req_prep_async(req);
- iov_iter_restore(&io->s.iter, &io->s.iter_state);
- return true;
+ iov_iter_restore(&io->iter, &io->iter_state);
}
static bool io_rw_should_reissue(struct io_kiocb *req)
@@ -230,9 +430,8 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
return true;
}
#else
-static bool io_resubmit_prep(struct io_kiocb *req)
+static void io_resubmit_prep(struct io_kiocb *req)
{
- return false;
}
static bool io_rw_should_reissue(struct io_kiocb *req)
{
@@ -311,11 +510,10 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
io_req_io_end(req);
- if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
- unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;
+ if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
+ req->cqe.flags |= io_put_kbuf(req, 0);
- req->cqe.flags |= io_put_kbuf(req, issue_flags);
- }
+ io_req_rw_cleanup(req, 0);
io_req_task_complete(req, ts);
}
@@ -396,6 +594,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
io_req_io_end(req);
io_req_set_res(req, final_ret,
io_put_kbuf(req, issue_flags));
+ io_req_rw_cleanup(req, issue_flags);
return IOU_OK;
}
} else {
@@ -404,71 +603,12 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
if (req->flags & REQ_F_REISSUE) {
req->flags &= ~REQ_F_REISSUE;
- if (io_resubmit_prep(req))
- io_req_task_queue_reissue(req);
- else
- io_req_task_queue_fail(req, final_ret);
+ io_resubmit_prep(req);
+ return -EAGAIN;
}
return IOU_ISSUE_SKIP_COMPLETE;
}
-static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
- struct io_rw_state *s,
- unsigned int issue_flags)
-{
- struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
- struct iov_iter *iter = &s->iter;
- u8 opcode = req->opcode;
- struct iovec *iovec;
- void __user *buf;
- size_t sqe_len;
- ssize_t ret;
-
- if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
- ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
- if (ret)
- return ERR_PTR(ret);
- return NULL;
- }
-
- buf = u64_to_user_ptr(rw->addr);
- sqe_len = rw->len;
-
- if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) {
- if (io_do_buffer_select(req)) {
- buf = io_buffer_select(req, &sqe_len, issue_flags);
- if (!buf)
- return ERR_PTR(-ENOBUFS);
- rw->addr = (unsigned long) buf;
- rw->len = sqe_len;
- }
-
- ret = import_ubuf(ddir, buf, sqe_len, iter);
- if (ret)
- return ERR_PTR(ret);
- return NULL;
- }
-
- iovec = s->fast_iov;
- ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
- req->ctx->compat);
- if (unlikely(ret < 0))
- return ERR_PTR(ret);
- return iovec;
-}
-
-static inline int io_import_iovec(int rw, struct io_kiocb *req,
- struct iovec **iovec, struct io_rw_state *s,
- unsigned int issue_flags)
-{
- *iovec = __io_import_iovec(rw, req, s, issue_flags);
- if (IS_ERR(*iovec))
- return PTR_ERR(*iovec);
-
- iov_iter_save_state(&s->iter, &s->iter_state);
- return 0;
-}
-
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
@@ -540,89 +680,6 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
return ret;
}
-static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
- const struct iovec *fast_iov, struct iov_iter *iter)
-{
- struct io_async_rw *io = req->async_data;
-
- memcpy(&io->s.iter, iter, sizeof(*iter));
- io->free_iovec = iovec;
- io->bytes_done = 0;
- /* can only be fixed buffers, no need to do anything */
- if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter))
- return;
- if (!iovec) {
- unsigned iov_off = 0;
-
- io->s.iter.__iov = io->s.fast_iov;
- if (iter->__iov != fast_iov) {
- iov_off = iter_iov(iter) - fast_iov;
- io->s.iter.__iov += iov_off;
- }
- if (io->s.fast_iov != fast_iov)
- memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off,
- sizeof(struct iovec) * iter->nr_segs);
- } else {
- req->flags |= REQ_F_NEED_CLEANUP;
- }
-}
-
-static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
- struct io_rw_state *s, bool force)
-{
- if (!force && !io_cold_defs[req->opcode].prep_async)
- return 0;
- /* opcode type doesn't need async data */
- if (!io_cold_defs[req->opcode].async_size)
- return 0;
- if (!req_has_async_data(req)) {
- struct io_async_rw *iorw;
-
- if (io_alloc_async_data(req)) {
- kfree(iovec);
- return -ENOMEM;
- }
-
- io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
- iorw = req->async_data;
- /* we've copied and mapped the iter, ensure state is saved */
- iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
- }
- return 0;
-}
-
-static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
-{
- struct io_async_rw *iorw = req->async_data;
- struct iovec *iov;
- int ret;
-
- iorw->bytes_done = 0;
- iorw->free_iovec = NULL;
-
- /* submission path, ->uring_lock should already be taken */
- ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
- if (unlikely(ret < 0))
- return ret;
-
- if (iov) {
- iorw->free_iovec = iov;
- req->flags |= REQ_F_NEED_CLEANUP;
- }
-
- return 0;
-}
-
-int io_readv_prep_async(struct io_kiocb *req)
-{
- return io_rw_prep_async(req, ITER_DEST);
-}
-
-int io_writev_prep_async(struct io_kiocb *req)
-{
- return io_rw_prep_async(req, ITER_SOURCE);
-}
-
/*
* This is our waitqueue callback handler, registered through __folio_lock_async()
* when we initially tried to do the IO with the iocb armed our waitqueue.
@@ -763,54 +820,28 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
{
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
- struct io_rw_state __s, *s = &__s;
- struct iovec *iovec;
+ struct io_async_rw *io = req->async_data;
struct kiocb *kiocb = &rw->kiocb;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- struct io_async_rw *io;
- ssize_t ret, ret2;
+ ssize_t ret;
loff_t *ppos;
- if (!req_has_async_data(req)) {
- ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags);
+ if (io_do_buffer_select(req)) {
+ ret = io_import_iovec(ITER_DEST, req, io, issue_flags);
if (unlikely(ret < 0))
return ret;
- } else {
- io = req->async_data;
- s = &io->s;
-
- /*
- * Safe and required to re-import if we're using provided
- * buffers, as we dropped the selected one before retry.
- */
- if (io_do_buffer_select(req)) {
- ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags);
- if (unlikely(ret < 0))
- return ret;
- }
-
- /*
- * We come here from an earlier attempt, restore our state to
- * match in case it doesn't. It's cheap enough that we don't
- * need to make this conditional.
- */
- iov_iter_restore(&s->iter, &s->iter_state);
- iovec = NULL;
}
+
ret = io_rw_init_file(req, FMODE_READ);
- if (unlikely(ret)) {
- kfree(iovec);
+ if (unlikely(ret))
return ret;
- }
- req->cqe.res = iov_iter_count(&s->iter);
+ req->cqe.res = iov_iter_count(&io->iter);
if (force_nonblock) {
/* If the file doesn't support async, just async punt */
- if (unlikely(!io_file_supports_nowait(req))) {
- ret = io_setup_async_rw(req, iovec, s, true);
- return ret ?: -EAGAIN;
- }
+ if (unlikely(!io_file_supports_nowait(req)))
+ return -EAGAIN;
kiocb->ki_flags |= IOCB_NOWAIT;
} else {
/* Ensure we clear previously set non-block flag */
@@ -820,20 +851,15 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
ppos = io_kiocb_update_pos(req);
ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
- if (unlikely(ret)) {
- kfree(iovec);
+ if (unlikely(ret))
return ret;
- }
- ret = io_iter_do_read(rw, &s->iter);
+ ret = io_iter_do_read(rw, &io->iter);
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
- /*
- * If we can poll, just do that. For a vectored read, we'll
- * need to copy state first.
- */
- if (io_file_can_poll(req) && !io_issue_defs[req->opcode].vectored)
+ /* If we can poll, just do that. */
+ if (io_file_can_poll(req))
return -EAGAIN;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -843,8 +869,6 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
goto done;
ret = 0;
} else if (ret == -EIOCBQUEUED) {
- if (iovec)
- kfree(iovec);
return IOU_ISSUE_SKIP_COMPLETE;
} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
@@ -857,21 +881,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
* untouched in case of error. Restore it and we'll advance it
* manually if we need to.
*/
- iov_iter_restore(&s->iter, &s->iter_state);
-
- ret2 = io_setup_async_rw(req, iovec, s, true);
- iovec = NULL;
- if (ret2) {
- ret = ret > 0 ? ret : ret2;
- goto done;
- }
-
- io = req->async_data;
- s = &io->s;
- /*
- * Now use our persistent iterator and state, if we aren't already.
- * We've restored and mapped the iter to match.
- */
+ iov_iter_restore(&io->iter, &io->iter_state);
do {
/*
@@ -879,11 +889,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
* above or inside this loop. Advance the iter by the bytes
* that were consumed.
*/
- iov_iter_advance(&s->iter, ret);
- if (!iov_iter_count(&s->iter))
+ iov_iter_advance(&io->iter, ret);
+ if (!iov_iter_count(&io->iter))
break;
io->bytes_done += ret;
- iov_iter_save_state(&s->iter, &s->iter_state);
+ iov_iter_save_state(&io->iter, &io->iter_state);
/* if we can retry, do so with the callbacks armed */
if (!io_rw_should_retry(req)) {
@@ -891,24 +901,22 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
}
- req->cqe.res = iov_iter_count(&s->iter);
+ req->cqe.res = iov_iter_count(&io->iter);
/*
* Now retry read with the IOCB_WAITQ parts set in the iocb. If
* we get -EIOCBQUEUED, then we'll get a notification when the
* desired page gets unlocked. We can also get a partial read
* here, and if we do, then just retry at the new offset.
*/
- ret = io_iter_do_read(rw, &s->iter);
+ ret = io_iter_do_read(rw, &io->iter);
if (ret == -EIOCBQUEUED)
return IOU_ISSUE_SKIP_COMPLETE;
/* we got some bytes, but not all. retry. */
kiocb->ki_flags &= ~IOCB_WAITQ;
- iov_iter_restore(&s->iter, &s->iter_state);
+ iov_iter_restore(&io->iter, &io->iter_state);
} while (ret > 0);
done:
/* it's faster to check here then delegate to kfree */
- if (iovec)
- kfree(iovec);
return ret;
}
@@ -971,9 +979,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
cflags = io_put_kbuf(req, issue_flags);
rw->len = 0; /* similarly to above, reset len to 0 */
- if (io_fill_cqe_req_aux(req,
- issue_flags & IO_URING_F_COMPLETE_DEFER,
- ret, cflags | IORING_CQE_F_MORE)) {
+ if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
if (issue_flags & IO_URING_F_MULTISHOT) {
/*
* Force retry, as we might have more data to
@@ -992,6 +998,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
* multishot request, hitting overflow will terminate it.
*/
io_req_set_res(req, ret, cflags);
+ io_req_rw_cleanup(req, issue_flags);
if (issue_flags & IO_URING_F_MULTISHOT)
return IOU_STOP_MULTISHOT;
return IOU_OK;
@@ -999,42 +1006,28 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
- struct io_rw_state __s, *s = &__s;
- struct iovec *iovec;
+ struct io_async_rw *io = req->async_data;
struct kiocb *kiocb = &rw->kiocb;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ssize_t ret, ret2;
loff_t *ppos;
- if (!req_has_async_data(req)) {
- ret = io_import_iovec(ITER_SOURCE, req, &iovec, s, issue_flags);
- if (unlikely(ret < 0))
- return ret;
- } else {
- struct io_async_rw *io = req->async_data;
-
- s = &io->s;
- iov_iter_restore(&s->iter, &s->iter_state);
- iovec = NULL;
- }
ret = io_rw_init_file(req, FMODE_WRITE);
- if (unlikely(ret)) {
- kfree(iovec);
+ if (unlikely(ret))
return ret;
- }
- req->cqe.res = iov_iter_count(&s->iter);
+ req->cqe.res = iov_iter_count(&io->iter);
if (force_nonblock) {
/* If the file doesn't support async, just async punt */
if (unlikely(!io_file_supports_nowait(req)))
- goto copy_iov;
+ goto ret_eagain;
/* Check if we can support NOWAIT. */
if (!(kiocb->ki_flags & IOCB_DIRECT) &&
!(req->file->f_op->fop_flags & FOP_BUFFER_WASYNC) &&
(req->flags & REQ_F_ISREG))
- goto copy_iov;
+ goto ret_eagain;
kiocb->ki_flags |= IOCB_NOWAIT;
} else {
@@ -1045,19 +1038,17 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
ppos = io_kiocb_update_pos(req);
ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
- if (unlikely(ret)) {
- kfree(iovec);
+ if (unlikely(ret))
return ret;
- }
if (req->flags & REQ_F_ISREG)
kiocb_start_write(kiocb);
kiocb->ki_flags |= IOCB_WRITE;
if (likely(req->file->f_op->write_iter))
- ret2 = req->file->f_op->write_iter(kiocb, &s->iter);
+ ret2 = req->file->f_op->write_iter(kiocb, &io->iter);
else if (req->file->f_op->write)
- ret2 = loop_rw_iter(WRITE, rw, &s->iter);
+ ret2 = loop_rw_iter(WRITE, rw, &io->iter);
else
ret2 = -EINVAL;
@@ -1078,11 +1069,9 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
- goto copy_iov;
+ goto ret_eagain;
if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
- struct io_async_rw *io;
-
trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
req->cqe.res, ret2);
@@ -1091,34 +1080,22 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
* in the worker. Also update bytes_done to account for
* the bytes already written.
*/
- iov_iter_save_state(&s->iter, &s->iter_state);
- ret = io_setup_async_rw(req, iovec, s, true);
-
- io = req->async_data;
- if (io)
- io->bytes_done += ret2;
+ iov_iter_save_state(&io->iter, &io->iter_state);
+ io->bytes_done += ret2;
if (kiocb->ki_flags & IOCB_WRITE)
io_req_end_write(req);
- return ret ? ret : -EAGAIN;
+ return -EAGAIN;
}
done:
- ret = kiocb_done(req, ret2, issue_flags);
+ return kiocb_done(req, ret2, issue_flags);
} else {
-copy_iov:
- iov_iter_restore(&s->iter, &s->iter_state);
- ret = io_setup_async_rw(req, iovec, s, false);
- if (!ret) {
- if (kiocb->ki_flags & IOCB_WRITE)
- io_req_end_write(req);
- return -EAGAIN;
- }
- return ret;
+ret_eagain:
+ iov_iter_restore(&io->iter, &io->iter_state);
+ if (kiocb->ki_flags & IOCB_WRITE)
+ io_req_end_write(req);
+ return -EAGAIN;
}
- /* it's reportedly faster than delegating the null check to kfree() */
- if (iovec)
- kfree(iovec);
- return ret;
}
void io_rw_fail(struct io_kiocb *req)
@@ -1192,6 +1169,8 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
break;
nr_events++;
req->cqe.flags = io_put_kbuf(req, 0);
+ if (req->opcode != IORING_OP_URING_CMD)
+ io_req_rw_cleanup(req, 0);
}
if (unlikely(!nr_events))
return 0;
@@ -1205,3 +1184,15 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
__io_submit_flush_completions(ctx);
return nr_events;
}
+
+void io_rw_cache_free(const void *entry)
+{
+ struct io_async_rw *rw = (struct io_async_rw *) entry;
+
+ if (rw->free_iovec) {
+ kasan_mempool_unpoison_object(rw->free_iovec,
+ rw->free_iov_nr * sizeof(struct iovec));
+ io_rw_iovec_free(rw);
+ }
+ kfree(rw);
+}
diff --git a/io_uring/rw.h b/io_uring/rw.h
index f9e89b4fe4da91..3f432dc75441fe 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -2,28 +2,27 @@
#include <linux/pagemap.h>
-struct io_rw_state {
- struct iov_iter iter;
- struct iov_iter_state iter_state;
- struct iovec fast_iov[UIO_FASTIOV];
-};
-
struct io_async_rw {
- struct io_rw_state s;
- const struct iovec *free_iovec;
size_t bytes_done;
+ struct iov_iter iter;
+ struct iov_iter_state iter_state;
+ struct iovec fast_iov;
+ struct iovec *free_iovec;
+ int free_iov_nr;
struct wait_page_queue wpq;
};
-int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_read(struct io_kiocb *req, unsigned int issue_flags);
-int io_readv_prep_async(struct io_kiocb *req);
int io_write(struct io_kiocb *req, unsigned int issue_flags);
-int io_writev_prep_async(struct io_kiocb *req);
void io_readv_writev_cleanup(struct io_kiocb *req);
void io_rw_fail(struct io_kiocb *req);
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts);
int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags);
+void io_rw_cache_free(const void *entry);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 3983708cef5b43..554c7212aa4631 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -291,6 +291,14 @@ static int io_sq_thread(void *data)
sqd->sq_cpu = raw_smp_processor_id();
}
+ /*
+ * Force audit context to get setup, in case we do prep side async
+ * operations that would trigger an audit call before any issue side
+ * audit has been done.
+ */
+ audit_uring_entry(IORING_OP_NOP);
+ audit_uring_exit(true, 0);
+
mutex_lock(&sqd->lock);
while (1) {
bool cap_entries, sqt_spin = false;
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 7fd7dbb211d642..1c9bf07499b197 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -72,10 +72,7 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts)
struct io_ring_ctx *ctx = req->ctx;
if (!io_timeout_finish(timeout, data)) {
- bool filled;
- filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME,
- IORING_CQE_F_MORE);
- if (filled) {
+ if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) {
/* re-arm timer */
spin_lock_irq(&ctx->timeout_lock);
list_add(&timeout->list, ctx->timeout_list.prev);
@@ -301,7 +298,6 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts)
{
- unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
struct io_kiocb *prev = timeout->prev;
int ret = -ENOENT;
@@ -313,7 +309,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *t
.data = prev->cqe.user_data,
};
- ret = io_try_cancel(req->task->io_uring, &cd, issue_flags);
+ ret = io_try_cancel(req->task->io_uring, &cd, 0);
}
io_req_set_res(req, ret ?: -ETIME, 0);
io_req_task_complete(req, ts);
@@ -541,7 +537,6 @@ static int __io_timeout_prep(struct io_kiocb *req,
if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
return -EINVAL;
- INIT_LIST_HEAD(&timeout->list);
data->mode = io_translate_timeout_mode(flags);
hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 42f63adfa54a04..21ac5fb2d5f087 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -3,6 +3,7 @@
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/io_uring/cmd.h>
+#include <linux/io_uring/net.h>
#include <linux/security.h>
#include <linux/nospec.h>
#include <net/sock.h>
@@ -11,9 +12,71 @@
#include <asm/ioctls.h>
#include "io_uring.h"
+#include "alloc_cache.h"
#include "rsrc.h"
#include "uring_cmd.h"
+static struct uring_cache *io_uring_async_get(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct uring_cache *cache;
+
+ cache = io_alloc_cache_get(&ctx->uring_cache);
+ if (cache) {
+ req->flags |= REQ_F_ASYNC_DATA;
+ req->async_data = cache;
+ return cache;
+ }
+ if (!io_alloc_async_data(req))
+ return req->async_data;
+ return NULL;
+}
+
+static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+ struct uring_cache *cache = req->async_data;
+
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ return;
+ if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) {
+ ioucmd->sqe = NULL;
+ req->async_data = NULL;
+ req->flags &= ~REQ_F_ASYNC_DATA;
+ }
+}
+
+bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
+ struct task_struct *task, bool cancel_all)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ bool ret = false;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd,
+ hash_node) {
+ struct io_uring_cmd *cmd = io_kiocb_to_cmd(req,
+ struct io_uring_cmd);
+ struct file *file = req->file;
+
+ if (!cancel_all && req->task != task)
+ continue;
+
+ if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
+ /* ->sqe isn't available if no async data */
+ if (!req_has_async_data(req))
+ cmd->sqe = NULL;
+ file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL |
+ IO_URING_F_COMPLETE_DEFER);
+ ret = true;
+ }
+ }
+ io_submit_flush_completions(ctx);
+ return ret;
+}
+
static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
@@ -56,9 +119,9 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
- unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;
- ioucmd->task_work_cb(ioucmd, issue_flags);
+ /* task_work executor checks the deffered list completion */
+ ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER);
}
void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
@@ -97,23 +160,38 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2,
io_req_set_res(req, ret, 0);
if (req->ctx->flags & IORING_SETUP_CQE32)
io_req_set_cqe32_extra(req, res2, 0);
+ io_req_uring_cleanup(req, issue_flags);
if (req->ctx->flags & IORING_SETUP_IOPOLL) {
/* order with io_iopoll_req_issued() checking ->iopoll_complete */
smp_store_release(&req->iopoll_completed, 1);
+ } else if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
+ if (WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED))
+ return;
+ io_req_complete_defer(req);
} else {
- struct io_tw_state ts = {
- .locked = !(issue_flags & IO_URING_F_UNLOCKED),
- };
- io_req_task_complete(req, &ts);
+ req->io_task_work.func = io_req_task_complete;
+ io_req_task_work_add(req);
}
}
EXPORT_SYMBOL_GPL(io_uring_cmd_done);
-int io_uring_cmd_prep_async(struct io_kiocb *req)
+static int io_uring_cmd_prep_setup(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+ struct uring_cache *cache;
+
+ cache = io_uring_async_get(req);
+ if (unlikely(!cache))
+ return -ENOMEM;
+
+ if (!(req->flags & REQ_F_FORCE_ASYNC)) {
+ /* defer memcpy until we need it */
+ ioucmd->sqe = sqe;
+ return 0;
+ }
- memcpy(req->async_data, ioucmd->sqe, uring_sqe_size(req->ctx));
+ memcpy(req->async_data, sqe, uring_sqe_size(req->ctx));
ioucmd->sqe = req->async_data;
return 0;
}
@@ -140,9 +218,9 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->imu = ctx->user_bufs[index];
io_req_set_rsrc_node(req, ctx, 0);
}
- ioucmd->sqe = sqe;
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
- return 0;
+
+ return io_uring_cmd_prep_setup(req, sqe);
}
int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
@@ -174,22 +252,20 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
if (ret == -EAGAIN) {
- if (!req_has_async_data(req)) {
- if (io_alloc_async_data(req))
- return -ENOMEM;
- io_uring_cmd_prep_async(req);
- }
- return -EAGAIN;
- }
+ struct uring_cache *cache = req->async_data;
- if (ret != -EIOCBQUEUED) {
- if (ret < 0)
- req_set_fail(req);
- io_req_set_res(req, ret, 0);
- return ret;
+ if (ioucmd->sqe != (void *) cache)
+ memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx));
+ return -EAGAIN;
+ } else if (ret == -EIOCBQUEUED) {
+ return -EIOCBQUEUED;
}
- return IOU_ISSUE_SKIP_COMPLETE;
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_uring_cleanup(req, issue_flags);
+ io_req_set_res(req, ret, 0);
+ return ret;
}
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h
index 8117684ec3cac3..a361f98664d2de 100644
--- a/io_uring/uring_cmd.h
+++ b/io_uring/uring_cmd.h
@@ -1,5 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
+struct uring_cache {
+ struct io_uring_sqe sqes[2];
+};
+
int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags);
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-int io_uring_cmd_prep_async(struct io_kiocb *req);
+
+bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
+ struct task_struct *task, bool cancel_all);
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
index 77d340666cb95b..6362ec20abc0cf 100644
--- a/io_uring/waitid.c
+++ b/io_uring/waitid.c
@@ -118,7 +118,7 @@ static int io_waitid_finish(struct io_kiocb *req, int ret)
static void io_waitid_complete(struct io_kiocb *req, int ret)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
- struct io_tw_state ts = { .locked = true };
+ struct io_tw_state ts = {};
/* anyone completing better be holding a reference */
WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK));
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 92c6b1fd898938..1e453f825c05d3 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -494,18 +494,18 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
struct sbitmap_word *map = &sb->map[index];
unsigned long get_mask;
unsigned int map_depth = __map_depth(sb, index);
+ unsigned long val;
sbitmap_deferred_clear(map);
- if (map->word == (1UL << (map_depth - 1)) - 1)
+ val = READ_ONCE(map->word);
+ if (val == (1UL << (map_depth - 1)) - 1)
goto next;
- nr = find_first_zero_bit(&map->word, map_depth);
+ nr = find_first_zero_bit(&val, map_depth);
if (nr + nr_tags <= map_depth) {
atomic_long_t *ptr = (atomic_long_t *) &map->word;
- unsigned long val;
get_mask = ((1UL << nr_tags) - 1) << nr;
- val = READ_ONCE(map->word);
while (!atomic_long_try_cmpxchg(ptr, &val,
get_mask | val))
;
diff --git a/mm/nommu.c b/mm/nommu.c
index 331d2f778695c9..7296e775e04e29 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -334,6 +334,13 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_page);
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+ struct page **pages, unsigned long *num)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_insert_pages);
+
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
unsigned long num)
{
diff --git a/net/socket.c b/net/socket.c
index e5f3af49a8b621..01a71ae10c35dc 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -88,7 +88,7 @@
#include <linux/xattr.h>
#include <linux/nospec.h>
#include <linux/indirect_call_wrapper.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/net.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>