From: Jens Axboe

This reworks the core barrier support to be a lot nicer, so that all the
nasty code resides outside of drivers/ide.  Supporting it in a driver now
requires only minimal changes; I've added SCSI support as an example.  The
IDE code has been adapted to the new interface.  With this patch, we now
support full barriers on SATA.

Bart has acked the addition to -mm; I would like this to be submitted as
soon as 2.6.12 opens.

Signed-off-by: Jens Axboe
Signed-off-by: Andrew Morton
---

 25-akpm/drivers/block/elevator.c    |   16 ++
 25-akpm/drivers/block/ll_rw_blk.c   |  228 ++++++++++++++++++++++++++++++++----
 25-akpm/drivers/ide/ide-disk.c      |   73 ++++++++++-
 25-akpm/drivers/ide/ide-io.c        |  164 +------------------------
 25-akpm/drivers/scsi/ahci.c         |    1 
 25-akpm/drivers/scsi/ata_piix.c     |    1 
 25-akpm/drivers/scsi/hosts.c        |   10 +
 25-akpm/drivers/scsi/sata_nv.c      |    1 
 25-akpm/drivers/scsi/sata_promise.c |    1 
 25-akpm/drivers/scsi/sata_sil.c     |    1 
 25-akpm/drivers/scsi/sata_sis.c     |    1 
 25-akpm/drivers/scsi/sata_svw.c     |    1 
 25-akpm/drivers/scsi/sata_sx4.c     |    1 
 25-akpm/drivers/scsi/sata_uli.c     |    1 
 25-akpm/drivers/scsi/sata_via.c     |    1 
 25-akpm/drivers/scsi/sata_vsc.c     |    1 
 25-akpm/drivers/scsi/scsi_lib.c     |   46 +++++++
 25-akpm/drivers/scsi/sd.c           |   31 ++++
 25-akpm/include/linux/blkdev.h      |   24 +++
 25-akpm/include/linux/ide.h         |    2 
 25-akpm/include/scsi/scsi_driver.h  |    2 
 25-akpm/include/scsi/scsi_host.h    |   12 +
 22 files changed, 433 insertions(+), 186 deletions(-)
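The driver-side hookup mentioned above really is small.  A sketch of what a
hypothetical driver needs to do (the mydrv_* names are made up and are not
part of this patch; ide-disk and sd below are the real examples):

	static int mydrv_prepare_flush(request_queue_t *q, struct request *rq)
	{
		/*
		 * fill in a device-specific cache flush command; return 0
		 * if no flush is needed (e.g. write cache disabled)
		 */
		memset(rq->cmd, 0, sizeof(rq->cmd));
		rq->cmd[0] = SYNCHRONIZE_CACHE;
		rq->flags = REQ_BLOCK_PC | REQ_SOFTBARRIER;
		return 1;
	}

	static void mydrv_end_flush(request_queue_t *q, struct request *flush_rq)
	{
		/* the barrier write is stashed in ->end_io_data */
		struct request *rq = flush_rq->end_io_data;

		/* complete rq here, failing it if flush_rq->errors is set */
		mydrv_complete_rq(rq, flush_rq->errors ? 0 : 1);
	}

	/* at queue setup time */
	blk_queue_ordered(q, QUEUE_ORDERED_FLUSH);
	q->prepare_flush_fn = mydrv_prepare_flush;
	q->end_flush_fn = mydrv_end_flush;

(mydrv_complete_rq is hypothetical as well - it stands in for whatever
request completion path the driver already has.)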
diff -puN drivers/block/elevator.c~rework-core-barrier-support drivers/block/elevator.c
--- 25/drivers/block/elevator.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/block/elevator.c	2005-02-24 23:13:56.000000000 -0800
@@ -320,7 +320,21 @@ void elv_add_request(request_queue_t *q,
 
 static inline struct request *__elv_next_request(request_queue_t *q)
 {
-	return q->elevator->ops->elevator_next_req_fn(q);
+	struct request *rq = q->elevator->ops->elevator_next_req_fn(q);
+
+	/*
+	 * if this is a barrier write and the device has to issue a
+	 * flush sequence to support it, check how far we are
+	 */
+	if (rq && blk_fs_request(rq) && blk_barrier_rq(rq)) {
+		BUG_ON(q->ordered == QUEUE_ORDERED_NONE);
+
+		if (q->ordered == QUEUE_ORDERED_FLUSH &&
+		    !blk_barrier_preflush(rq))
+			rq = blk_start_pre_flush(q, rq);
+	}
+
+	return rq;
 }
 
 struct request *elv_next_request(request_queue_t *q)
diff -puN drivers/block/ll_rw_blk.c~rework-core-barrier-support drivers/block/ll_rw_blk.c
--- 25/drivers/block/ll_rw_blk.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/block/ll_rw_blk.c	2005-02-24 23:13:56.000000000 -0800
@@ -267,6 +267,25 @@ void blk_queue_make_request(request_queu
 
 EXPORT_SYMBOL(blk_queue_make_request);
 
+static inline void rq_init(request_queue_t *q, struct request *rq)
+{
+	INIT_LIST_HEAD(&rq->queuelist);
+
+	rq->errors = 0;
+	rq->rq_status = RQ_ACTIVE;
+	rq->bio = rq->biotail = NULL;
+	rq->buffer = NULL;
+	rq->ref_count = 1;
+	rq->q = q;
+	rq->waiting = NULL;
+	rq->special = NULL;
+	rq->data_len = 0;
+	rq->data = NULL;
+	rq->sense = NULL;
+	rq->end_io = NULL;
+	rq->end_io_data = NULL;
+}
+
 /**
  * blk_queue_ordered - does this queue support ordered writes
  * @q:  the request queue
@@ -281,10 +300,26 @@ EXPORT_SYMBOL(blk_queue_make_request);
  **/
 void blk_queue_ordered(request_queue_t *q, int flag)
 {
-	if (flag)
-		set_bit(QUEUE_FLAG_ORDERED, &q->queue_flags);
-	else
-		clear_bit(QUEUE_FLAG_ORDERED, &q->queue_flags);
+	switch (flag) {
+		case QUEUE_ORDERED_NONE:
+			if (q->flush_rq)
+				kmem_cache_free(request_cachep, q->flush_rq);
+			q->flush_rq = NULL;
+			q->ordered = flag;
+			break;
+		case QUEUE_ORDERED_TAG:
+			q->ordered = flag;
+			break;
+		case QUEUE_ORDERED_FLUSH:
+			q->ordered = flag;
+			if (!q->flush_rq)
+				q->flush_rq = kmem_cache_alloc(request_cachep,
								GFP_KERNEL);
+			break;
+		default:
+			printk("blk_queue_ordered: bad value %d\n", flag);
+			break;
+	}
 }
 
 EXPORT_SYMBOL(blk_queue_ordered);
@@ -306,6 +341,170 @@ void blk_queue_issue_flush_fn(request_qu
 
 EXPORT_SYMBOL(blk_queue_issue_flush_fn);
 
+/*
+ * Cache flushing for ordered writes handling
+ */
+static void blk_pre_flush_end_io(struct request *flush_rq)
+{
+	struct request *rq = flush_rq->end_io_data;
+	request_queue_t *q = rq->q;
+
+	rq->flags |= REQ_BAR_PREFLUSH;
+
+	if (!flush_rq->errors)
+		elv_requeue_request(q, rq);
+	else {
+		q->end_flush_fn(q, flush_rq);
+		clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
+	}
+}
+
+static void blk_post_flush_end_io(struct request *flush_rq)
+{
+	struct request *rq = flush_rq->end_io_data;
+	request_queue_t *q = rq->q;
+
+	rq->flags |= REQ_BAR_POSTFLUSH;
+
+	/*
+	 * called from end_that_request_last(), so we know that the queue
+	 * lock is held
+	 */
+	spin_unlock(q->queue_lock);
+	q->end_flush_fn(q, flush_rq);
+	spin_lock(q->queue_lock);
+
+	clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
+}
+
+struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq)
+{
+	struct request *flush_rq = q->flush_rq;
+
+	BUG_ON(!blk_barrier_rq(rq));
+
+	rq_init(q, flush_rq);
+	flush_rq->elevator_private = NULL;
+	flush_rq->flags = 0;
+	flush_rq->rq_disk = rq->rq_disk;
+	flush_rq->rl = NULL;
+
+	/*
+	 * prepare_flush returns 0 if no flush is needed, just mark both
+	 * pre and post flush as done in that case
+	 */
+	if (!q->prepare_flush_fn(q, flush_rq)) {
+		rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH;
+		return rq;
+	}
+
+	set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
+
+	/*
+	 * some drivers dequeue requests right away, some only after io
+	 * completion. make sure the request is dequeued.
+	 */
+	if (!list_empty(&rq->queuelist))
+		blkdev_dequeue_request(rq);
+
+	flush_rq->end_io_data = rq;
+	flush_rq->end_io = blk_pre_flush_end_io;
+
+	__elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
+	return flush_rq;
+}
+
+static void blk_start_post_flush(request_queue_t *q, struct request *rq)
+{
+	struct request *flush_rq = q->flush_rq;
+
+	BUG_ON(!blk_barrier_rq(rq));
+
+	rq_init(q, flush_rq);
+	flush_rq->elevator_private = NULL;
+	flush_rq->flags = 0;
+	flush_rq->rq_disk = rq->rq_disk;
+	flush_rq->rl = NULL;
+
+	if (q->prepare_flush_fn(q, flush_rq)) {
+		flush_rq->end_io_data = rq;
+		flush_rq->end_io = blk_post_flush_end_io;
+
+		__elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
+		q->request_fn(q);
+	}
+}
+
+static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq,
+					int sectors)
+{
+	if (sectors > rq->nr_sectors)
+		sectors = rq->nr_sectors;
+
+	rq->nr_sectors -= sectors;
+	return rq->nr_sectors;
+}
+
+static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq,
+				     int sectors, int queue_locked)
+{
+	if (q->ordered != QUEUE_ORDERED_FLUSH)
+		return 0;
+	if (!blk_fs_request(rq) || !blk_barrier_rq(rq))
+		return 0;
+	if (blk_barrier_postflush(rq))
+		return 0;
+
+	if (!blk_check_end_barrier(q, rq, sectors)) {
+		unsigned long flags = 0;
+
+		if (!queue_locked)
+			spin_lock_irqsave(q->queue_lock, flags);
+
+		blk_start_post_flush(q, rq);
+
+		if (!queue_locked)
+			spin_unlock_irqrestore(q->queue_lock, flags);
+	}
+
+	return 1;
+}
+
+/**
+ * blk_complete_barrier_rq - complete possible barrier request
+ * @q:  the request queue for the device
+ * @rq:  the request
+ * @sectors:  number of sectors to complete
+ *
+ * Description:
+ *   Used in driver end_io handling to determine whether to postpone
+ *   completion of a barrier request until a post flush has been done. This
+ *   is the unlocked variant, used if the caller doesn't already hold the
+ *   queue lock.
+ **/
+int blk_complete_barrier_rq(request_queue_t *q, struct request *rq, int sectors)
+{
+	return __blk_complete_barrier_rq(q, rq, sectors, 0);
+}
+EXPORT_SYMBOL(blk_complete_barrier_rq);
+
+/**
+ * blk_complete_barrier_rq_locked - complete possible barrier request
+ * @q:  the request queue for the device
+ * @rq:  the request
+ * @sectors:  number of sectors to complete
+ *
+ * Description:
+ *   See blk_complete_barrier_rq(). This variant must be used if the caller
+ *   holds the queue lock.
+ **/
+int blk_complete_barrier_rq_locked(request_queue_t *q, struct request *rq,
+				   int sectors)
+{
+	return __blk_complete_barrier_rq(q, rq, sectors, 1);
+}
+EXPORT_SYMBOL(blk_complete_barrier_rq_locked);
+
 /**
  * blk_queue_bounce_limit - set bounce buffer limit for queue
  * @q:  the request queue for the device
@@ -1428,6 +1627,8 @@ void blk_cleanup_queue(request_queue_t *
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
+	blk_queue_ordered(q, QUEUE_ORDERED_NONE);
+
 	kmem_cache_free(requestq_cachep, q);
 }
 
@@ -1739,23 +1940,8 @@ rq_starved:
 	if (ioc_batching(q, ioc))
 		ioc->nr_batch_requests--;
 	
-	INIT_LIST_HEAD(&rq->queuelist);
-
-	rq->errors = 0;
-	rq->rq_status = RQ_ACTIVE;
-	rq->bio = rq->biotail = NULL;
-	rq->buffer = NULL;
-	rq->ref_count = 1;
-	rq->q = q;
+	rq_init(q, rq);
 	rq->rl = rl;
-	rq->waiting = NULL;
-	rq->special = NULL;
-	rq->data_len = 0;
-	rq->data = NULL;
-	rq->sense = NULL;
-	rq->end_io = NULL;
-	rq->end_io_data = NULL;
-
 out:
 	put_io_context(ioc);
 	return rq;
@@ -2392,7 +2578,7 @@ static int __make_request(request_queue_
 	spin_lock_prefetch(q->queue_lock);
 
 	barrier = bio_barrier(bio);
-	if (barrier && !(q->queue_flags & (1 << QUEUE_FLAG_ORDERED))) {
+	if (barrier && (q->ordered == QUEUE_ORDERED_NONE)) {
 		err = -EOPNOTSUPP;
 		goto end_io;
 	}
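As the doc-comments above say, completion is driven from the driver's
end_io path: on each completion chunk the driver asks the block layer
whether this barrier write has to wait for the post flush.  Roughly like
this (a sketch with a hypothetical mydrv_end_io, mirroring what
scsi_io_completion() does further down; locking elided - use the _locked
variant if the queue lock is already held):

	static void mydrv_end_io(struct request *rq, int uptodate, int nr_sectors)
	{
		request_queue_t *q = rq->q;

		/*
		 * returns non-zero if this was a barrier write on a
		 * flush-ordered queue; final completion then happens
		 * from the post flush end_io instead
		 */
		if (blk_complete_barrier_rq(q, rq, nr_sectors))
			return;

		/* ordinary completion path */
		if (!end_that_request_first(rq, uptodate, nr_sectors)) {
			blkdev_dequeue_request(rq);
			end_that_request_last(rq);
		}
	}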
diff -puN drivers/ide/ide-disk.c~rework-core-barrier-support drivers/ide/ide-disk.c
--- 25/drivers/ide/ide-disk.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/ide/ide-disk.c	2005-02-24 23:13:56.000000000 -0800
@@ -683,18 +683,54 @@ static ide_proc_entry_t idedisk_proc[] =
 
 #endif	/* CONFIG_PROC_FS */
 
-static int idedisk_issue_flush(request_queue_t *q, struct gendisk *disk,
-			       sector_t *error_sector)
+static void idedisk_end_flush(request_queue_t *q, struct request *flush_rq)
+{
+	ide_drive_t *drive = q->queuedata;
+	struct request *rq = flush_rq->end_io_data;
+	int good_sectors = rq->hard_nr_sectors;
+	int bad_sectors;
+	sector_t sector;
+
+	if (flush_rq->errors & ABRT_ERR) {
+		printk(KERN_ERR "%s: barrier support doesn't work\n", drive->name);
+		blk_queue_ordered(drive->queue, QUEUE_ORDERED_NONE);
+		blk_queue_issue_flush_fn(drive->queue, NULL);
+		good_sectors = 0;
+	} else if (flush_rq->errors) {
+		sector = ide_get_error_location(drive, flush_rq->buffer);
+		if ((sector >= rq->hard_sector) &&
+		    (sector < rq->hard_sector + rq->hard_nr_sectors))
+			good_sectors = sector - rq->hard_sector;
+		else
+			good_sectors = 0;
+	}
+
+	if (flush_rq->errors)
+		printk(KERN_ERR "%s: failed barrier write: "
+				"sector=%Lx(good=%d/bad=%d)\n",
+				drive->name, (unsigned long long)rq->sector,
+				good_sectors,
+				(int) (rq->hard_nr_sectors-good_sectors));
+
+	bad_sectors = rq->hard_nr_sectors - good_sectors;
+
+	spin_lock(&ide_lock);
+
+	if (good_sectors)
+		__ide_end_request(drive, rq, 1, good_sectors);
+	if (bad_sectors)
+		__ide_end_request(drive, rq, 0, bad_sectors);
+
+	spin_unlock(&ide_lock);
+}
+
+static int idedisk_prepare_flush(request_queue_t *q, struct request *rq)
 {
 	ide_drive_t *drive = q->queuedata;
-	struct request *rq;
-	int ret;
 
 	if (!drive->wcache)
 		return 0;
 
-	rq = blk_get_request(q, WRITE, __GFP_WAIT);
-
 	memset(rq->cmd, 0, sizeof(rq->cmd));
 
 	if (ide_id_has_flush_cache_ext(drive->id) &&
@@ -706,6 +742,22 @@ static int idedisk_issue_flush(request_q
 	rq->flags |= REQ_DRIVE_TASK | REQ_SOFTBARRIER;
 	rq->buffer = rq->cmd;
 
+	return 1;
+}
+
+static int idedisk_issue_flush(request_queue_t *q, struct gendisk *disk,
+			       sector_t *error_sector)
+{
+	ide_drive_t *drive = q->queuedata;
+	struct request *rq;
+	int ret;
+
+	if (!drive->wcache)
+		return 0;
+
+	rq = blk_get_request(q, WRITE, __GFP_WAIT);
+
+	idedisk_prepare_flush(q, rq);
 
 	ret = blk_execute_rq(q, disk, rq);
 
@@ -963,10 +1015,15 @@ static void idedisk_setup (ide_drive_t *
 			barrier = 0;
 	}
 
-	printk(KERN_DEBUG "%s: cache flushes %ssupported\n",
+	if (!strncmp(drive->name, "hdc", 3))
+		barrier = 1;
+
+	printk(KERN_INFO "%s: cache flushes %ssupported\n",
 		drive->name, barrier ? "" : "not ");
 
 	if (barrier) {
-		blk_queue_ordered(drive->queue, 1);
+		blk_queue_ordered(drive->queue, QUEUE_ORDERED_FLUSH);
+		drive->queue->prepare_flush_fn = idedisk_prepare_flush;
+		drive->queue->end_flush_fn = idedisk_end_flush;
 		blk_queue_issue_flush_fn(drive->queue, idedisk_issue_flush);
 	}
 }
diff -puN drivers/ide/ide-io.c~rework-core-barrier-support drivers/ide/ide-io.c
--- 25/drivers/ide/ide-io.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/ide/ide-io.c	2005-02-24 23:13:56.000000000 -0800
@@ -55,62 +55,8 @@
 #include <asm/io.h>
 #include <asm/bitops.h>
 
-static void ide_fill_flush_cmd(ide_drive_t *drive, struct request *rq)
-{
-	char *buf = rq->cmd;
-
-	/*
-	 * reuse cdb space for ata command
-	 */
-	memset(buf, 0, sizeof(rq->cmd));
-
-	rq->flags |= REQ_DRIVE_TASK | REQ_STARTED;
-	rq->buffer = buf;
-	rq->buffer[0] = WIN_FLUSH_CACHE;
-
-	if (ide_id_has_flush_cache_ext(drive->id) &&
-	    (drive->capacity64 >= (1UL << 28)))
-		rq->buffer[0] = WIN_FLUSH_CACHE_EXT;
-}
-
-/*
- * preempt pending requests, and store this cache flush for immediate
- * execution
- */
-static struct request *ide_queue_flush_cmd(ide_drive_t *drive,
-					   struct request *rq, int post)
-{
-	struct request *flush_rq = &HWGROUP(drive)->wrq;
-
-	/*
-	 * write cache disabled, clear the barrier bit and treat it like
-	 * an ordinary write
-	 */
-	if (!drive->wcache) {
-		rq->flags |= REQ_BAR_PREFLUSH;
-		return rq;
-	}
-
-	ide_init_drive_cmd(flush_rq);
-	ide_fill_flush_cmd(drive, flush_rq);
-
-	flush_rq->special = rq;
-	flush_rq->nr_sectors = rq->nr_sectors;
-
-	if (!post) {
-		drive->doing_barrier = 1;
-		flush_rq->flags |= REQ_BAR_PREFLUSH;
-		blkdev_dequeue_request(rq);
-	} else
-		flush_rq->flags |= REQ_BAR_POSTFLUSH;
-
-	__elv_add_request(drive->queue, flush_rq, ELEVATOR_INSERT_FRONT, 0);
-	HWGROUP(drive)->rq = NULL;
-	return flush_rq;
-}
-
-static int __ide_end_request(ide_drive_t *drive, struct request *rq,
-			     int uptodate, int nr_sectors)
+int __ide_end_request(ide_drive_t *drive, struct request *rq, int uptodate,
+		      int nr_sectors)
 {
 	int ret = 1;
 
@@ -148,6 +94,7 @@ static int __ide_end_request(ide_drive_t
 
 	return ret;
 }
+EXPORT_SYMBOL(__ide_end_request);
 
 /**
  * ide_end_request - complete an IDE I/O
@@ -172,17 +119,10 @@ int ide_end_request (ide_drive_t *drive,
 	if (!nr_sectors)
 		nr_sectors = rq->hard_cur_sectors;
 
-	if (!blk_barrier_rq(rq) || !drive->wcache)
+	if (blk_complete_barrier_rq_locked(drive->queue, rq, nr_sectors))
+		ret = rq->nr_sectors != 0;
+	else
 		ret = __ide_end_request(drive, rq, uptodate, nr_sectors);
-	else {
-		struct request *flush_rq = &HWGROUP(drive)->wrq;
-
-		flush_rq->nr_sectors -= nr_sectors;
-		if (!flush_rq->nr_sectors) {
-			ide_queue_flush_cmd(drive, rq, 1);
-			ret = 0;
-		}
-	}
 
 	spin_unlock_irqrestore(&ide_lock, flags);
 	return ret;
@@ -347,79 +287,6 @@ u64 ide_get_error_location(ide_drive_t *
 }
 
 EXPORT_SYMBOL(ide_get_error_location);
 
-static void ide_complete_barrier(ide_drive_t *drive, struct request *rq,
-				 int error)
-{
-	struct request *real_rq = rq->special;
-	int good_sectors, bad_sectors;
-	sector_t sector;
-
-	if (!error) {
-		if (blk_barrier_postflush(rq)) {
-			/*
-			 * this completes the barrier write
-			 */
-			__ide_end_request(drive, real_rq, 1, real_rq->hard_nr_sectors);
-			drive->doing_barrier = 0;
-		} else {
-			/*
-			 * just indicate that we did the pre flush
-			 */
-			real_rq->flags |= REQ_BAR_PREFLUSH;
-			elv_requeue_request(drive->queue, real_rq);
-		}
-		/*
-		 * all is fine, return
-		 */
-		return;
-	}
-
-	/*
-	 * we need to end real_rq, but it's not on the queue currently.
-	 * put it back on the queue, so we don't have to special case
-	 * anything else for completing it
-	 */
-	if (!blk_barrier_postflush(rq))
-		elv_requeue_request(drive->queue, real_rq);
-
-	/*
-	 * drive aborted flush command, assume FLUSH_CACHE_* doesn't
-	 * work and disable barrier support
-	 */
-	if (error & ABRT_ERR) {
-		printk(KERN_ERR "%s: barrier support doesn't work\n", drive->name);
-		__ide_end_request(drive, real_rq, -EOPNOTSUPP, real_rq->hard_nr_sectors);
-		blk_queue_ordered(drive->queue, 0);
-		blk_queue_issue_flush_fn(drive->queue, NULL);
-	} else {
-		/*
-		 * find out what part of the request failed
-		 */
-		good_sectors = 0;
-		if (blk_barrier_postflush(rq)) {
-			sector = ide_get_error_location(drive, rq->buffer);
-
-			if ((sector >= real_rq->hard_sector) &&
-			    (sector < real_rq->hard_sector + real_rq->hard_nr_sectors))
-				good_sectors = sector - real_rq->hard_sector;
-		} else
-			sector = real_rq->hard_sector;
-
-		bad_sectors = real_rq->hard_nr_sectors - good_sectors;
-		if (good_sectors)
-			__ide_end_request(drive, real_rq, 1, good_sectors);
-		if (bad_sectors)
-			__ide_end_request(drive, real_rq, 0, bad_sectors);
-
-		printk(KERN_ERR "%s: failed barrier write: "
-				"sector=%Lx(good=%d/bad=%d)\n",
-				drive->name, (unsigned long long)sector,
-				good_sectors, bad_sectors);
-	}
-
-	drive->doing_barrier = 0;
-}
-
 /**
  * ide_end_drive_cmd - end an explicit drive command
  * @drive: command
@@ -511,11 +378,8 @@ void ide_end_drive_cmd (ide_drive_t *dri
 
 	spin_lock_irqsave(&ide_lock, flags);
 	blkdev_dequeue_request(rq);
-
-	if (blk_barrier_preflush(rq) || blk_barrier_postflush(rq))
-		ide_complete_barrier(drive, rq, err);
-
 	HWGROUP(drive)->rq = NULL;
+	rq->errors = err;
 	end_that_request_last(rq);
 	spin_unlock_irqrestore(&ide_lock, flags);
 }
@@ -1152,12 +1016,13 @@ void ide_pin_hwgroup(ide_drive_t *drive)
 	spin_lock_irq(&ide_lock);
 	do {
-		if (!hwgroup->busy && !drive->blocked && !drive->doing_barrier)
+		if (!hwgroup->busy && !drive->blocked &&
+		    !blk_queue_flushing(drive->queue))
 			break;
 
 		spin_unlock_irq(&ide_lock);
 		schedule_timeout(HZ/100);
 		spin_lock_irq(&ide_lock);
-	} while (hwgroup->busy || drive->blocked || drive->doing_barrier);
+	} while (hwgroup->busy || drive->blocked || blk_queue_flushing(drive->queue));
 
 	/*
 	 * we've now secured exclusive access to this hwgroup
@@ -1193,7 +1058,7 @@ repeat:
 	 * though that is 3 requests, it must be seen as a single transaction.
 	 * we must not preempt this drive until that is complete
 	 */
-	if (drive->doing_barrier) {
+	if (blk_queue_flushing(drive->queue)) {
 		/*
 		 * small race where queue could get replugged during
 		 * the 3-request flush cycle, just yank the plug since
@@ -1358,13 +1223,6 @@ static void ide_do_request (ide_hwgroup_
 	}
 
 	/*
-	 * if rq is a barrier write, issue pre cache flush if not
-	 * already done
-	 */
-	if (blk_barrier_rq(rq) && !blk_barrier_preflush(rq))
-		rq = ide_queue_flush_cmd(drive, rq, 0);
-
-	/*
 	 * Sanity: don't accept a request that isn't a PM request
 	 * if we are currently power managed. This is very important as
 	 * blk_stop_queue() doesn't prevent the elv_next_request()
diff -puN drivers/scsi/ahci.c~rework-core-barrier-support drivers/scsi/ahci.c
--- 25/drivers/scsi/ahci.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/scsi/ahci.c	2005-02-24 23:13:56.000000000 -0800
@@ -199,6 +199,7 @@ static Scsi_Host_Template ahci_sht = {
 	.dma_boundary		= AHCI_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
+	.ordered_flush		= 1,
 };
 
 static struct ata_port_operations ahci_ops = {
diff -puN drivers/scsi/ata_piix.c~rework-core-barrier-support drivers/scsi/ata_piix.c
--- 25/drivers/scsi/ata_piix.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/scsi/ata_piix.c	2005-02-24 23:13:56.000000000 -0800
@@ -121,6 +121,7 @@ static Scsi_Host_Template piix_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
+	.ordered_flush		= 1,
 };
 
 static struct ata_port_operations piix_pata_ops = {
diff -puN drivers/scsi/hosts.c~rework-core-barrier-support drivers/scsi/hosts.c
--- 25/drivers/scsi/hosts.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/scsi/hosts.c	2005-02-24 23:13:56.000000000 -0800
@@ -247,6 +247,16 @@ struct Scsi_Host *scsi_host_alloc(struct
 	shost->cmd_per_lun = sht->cmd_per_lun;
 	shost->unchecked_isa_dma = sht->unchecked_isa_dma;
 	shost->use_clustering = sht->use_clustering;
+	shost->ordered_flush = sht->ordered_flush;
+	shost->ordered_tag = sht->ordered_tag;
+
+	/*
+	 * hosts/devices that do queueing must support ordered tags
+	 */
+	if (shost->can_queue > 1 && shost->ordered_flush) {
+		printk(KERN_ERR "scsi: ordered flushes don't support queueing\n");
+		shost->ordered_flush = 0;
+	}
 
 	if (sht->max_host_blocked)
 		shost->max_host_blocked = sht->max_host_blocked;
diff -puN drivers/scsi/sata_nv.c~rework-core-barrier-support drivers/scsi/sata_nv.c
--- 25/drivers/scsi/sata_nv.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/scsi/sata_nv.c	2005-02-24 23:13:56.000000000 -0800
@@ -205,6 +205,7 @@ static Scsi_Host_Template nv_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
+	.ordered_flush		= 1,
 };
 
 static struct ata_port_operations nv_ops = {
diff -puN drivers/scsi/sata_promise.c~rework-core-barrier-support drivers/scsi/sata_promise.c
--- 25/drivers/scsi/sata_promise.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/scsi/sata_promise.c	2005-02-24 23:13:56.000000000 -0800
@@ -102,6 +102,7 @@ static Scsi_Host_Template pdc_ata_sht = 
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
+	.ordered_flush		= 1,
 };
 
 static struct ata_port_operations pdc_ata_ops = {
diff -puN drivers/scsi/sata_sil.c~rework-core-barrier-support drivers/scsi/sata_sil.c
--- 25/drivers/scsi/sata_sil.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/scsi/sata_sil.c	2005-02-24 23:13:56.000000000 -0800
@@ -125,6 +125,7 @@ static Scsi_Host_Template sil_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
+	.ordered_flush		= 1,
 };
 
 static struct ata_port_operations sil_ops = {
diff -puN drivers/scsi/sata_sis.c~rework-core-barrier-support drivers/scsi/sata_sis.c
--- 25/drivers/scsi/sata_sis.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/scsi/sata_sis.c	2005-02-24 23:13:56.000000000 -0800
@@ -90,6 +90,7 @@ static Scsi_Host_Template sis_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
+	.ordered_flush		= 1,
 };
 
 static struct ata_port_operations sis_ops = {
diff -puN drivers/scsi/sata_svw.c~rework-core-barrier-support drivers/scsi/sata_svw.c
--- 25/drivers/scsi/sata_svw.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/scsi/sata_svw.c	2005-02-24 23:13:56.000000000 -0800
@@ -288,6 +288,7 @@ static Scsi_Host_Template k2_sata_sht = 
 	.proc_info		= k2_sata_proc_info,
 #endif
 	.bios_param		= ata_std_bios_param,
+	.ordered_flush		= 1,
 };
 
 
diff -puN drivers/scsi/sata_sx4.c~rework-core-barrier-support drivers/scsi/sata_sx4.c
--- 25/drivers/scsi/sata_sx4.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/scsi/sata_sx4.c	2005-02-24 23:13:56.000000000 -0800
@@ -188,6 +188,7 @@ static Scsi_Host_Template pdc_sata_sht = 
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
+	.ordered_flush		= 1,
 };
 
 static struct ata_port_operations pdc_20621_ops = {
diff -puN drivers/scsi/sata_uli.c~rework-core-barrier-support drivers/scsi/sata_uli.c
--- 25/drivers/scsi/sata_uli.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/scsi/sata_uli.c	2005-02-24 23:13:56.000000000 -0800
@@ -82,6 +82,7 @@ static Scsi_Host_Template uli_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
+	.ordered_flush		= 1,
 };
 
 static struct ata_port_operations uli_ops = {
diff -puN drivers/scsi/sata_via.c~rework-core-barrier-support drivers/scsi/sata_via.c
--- 25/drivers/scsi/sata_via.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/scsi/sata_via.c	2005-02-24 23:13:56.000000000 -0800
@@ -102,6 +102,7 @@ static Scsi_Host_Template svia_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
+	.ordered_flush		= 1,
 };
 
 static struct ata_port_operations svia_sata_ops = {
diff -puN drivers/scsi/sata_vsc.c~rework-core-barrier-support drivers/scsi/sata_vsc.c
--- 25/drivers/scsi/sata_vsc.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/scsi/sata_vsc.c	2005-02-24 23:13:56.000000000 -0800
@@ -204,6 +204,7 @@ static Scsi_Host_Template vsc_sata_sht = 
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
+	.ordered_flush		= 1,
 };
 
 
diff -puN drivers/scsi/scsi_lib.c~rework-core-barrier-support drivers/scsi/scsi_lib.c
--- 25/drivers/scsi/scsi_lib.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/scsi/scsi_lib.c	2005-02-24 23:13:56.000000000 -0800
@@ -697,6 +697,9 @@ void scsi_io_completion(struct scsi_cmnd
 	int sense_valid = 0;
 	int sense_deferred = 0;
 
+	if (blk_complete_barrier_rq(q, req, good_bytes >> 9))
+		return;
+
 	/*
 	 * Free up any indirection buffers we allocated for DMA purposes. 
 	 * For the case of a READ, we need to copy the data out of the
@@ -962,6 +965,38 @@ static int scsi_init_io(struct scsi_cmnd
 	return BLKPREP_KILL;
 }
 
+static int scsi_prepare_flush_fn(request_queue_t *q, struct request *rq)
+{
+	struct scsi_device *sdev = q->queuedata;
+	struct scsi_driver *drv;
+
+	if (sdev->sdev_state == SDEV_RUNNING) {
+		drv = *(struct scsi_driver **) rq->rq_disk->private_data;
+
+		if (drv->prepare_flush)
+			return drv->prepare_flush(q, rq);
+	}
+
+	return 0;
+}
+
+static void scsi_end_flush_fn(request_queue_t *q, struct request *rq)
+{
+	struct scsi_device *sdev = q->queuedata;
+	struct request *flush_rq = rq->end_io_data;
+	struct scsi_driver *drv;
+
+	if (flush_rq->errors) {
+		printk("scsi: barrier error, disabling flush support\n");
+		blk_queue_ordered(q, QUEUE_ORDERED_NONE);
+	}
+
+	if (sdev->sdev_state == SDEV_RUNNING) {
+		drv = *(struct scsi_driver **) rq->rq_disk->private_data;
+		drv->end_flush(q, rq);
+	}
+}
+
 static int scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
 			       sector_t *error_sector)
 {
@@ -1366,6 +1401,17 @@ struct request_queue *scsi_alloc_queue(s
 	blk_queue_segment_boundary(q, shost->dma_boundary);
 	blk_queue_issue_flush_fn(q, scsi_issue_flush_fn);
 
+	/*
+	 * ordered tags are superior to flush ordering
+	 */
+	if (shost->ordered_tag)
+		blk_queue_ordered(q, QUEUE_ORDERED_TAG);
+	else if (shost->ordered_flush) {
+		blk_queue_ordered(q, QUEUE_ORDERED_FLUSH);
+		q->prepare_flush_fn = scsi_prepare_flush_fn;
+		q->end_flush_fn = scsi_end_flush_fn;
+	}
+
 	if (!shost->use_clustering)
 		clear_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 	return q;
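Note the precedence in scsi_alloc_queue() above: a host that does real
ordered tags never sees the flush machinery at all.  For such hardware the
whole setup collapses to setting .ordered_tag = 1 in the host template, or
for a driver managing its own queue, a single call (sketch):

	/*
	 * tag ordering: the device keeps barrier ordering itself, so no
	 * pre/post flush requests are ever generated for this queue
	 */
	blk_queue_ordered(q, QUEUE_ORDERED_TAG);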
diff -puN drivers/scsi/sd.c~rework-core-barrier-support drivers/scsi/sd.c
--- 25/drivers/scsi/sd.c~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/drivers/scsi/sd.c	2005-02-24 23:13:56.000000000 -0800
@@ -122,6 +122,8 @@ static void sd_shutdown(struct device *d
 static void sd_rescan(struct device *);
 static int sd_init_command(struct scsi_cmnd *);
 static int sd_issue_flush(struct device *, sector_t *);
+static void sd_end_flush(request_queue_t *, struct request *);
+static int sd_prepare_flush(request_queue_t *, struct request *);
 static void sd_read_capacity(struct scsi_disk *sdkp, char *diskname,
 		struct scsi_request *SRpnt, unsigned char *buffer);
 
@@ -136,6 +138,8 @@ static struct scsi_driver sd_template = 
 	.rescan			= sd_rescan,
 	.init_command		= sd_init_command,
 	.issue_flush		= sd_issue_flush,
+	.prepare_flush		= sd_prepare_flush,
+	.end_flush		= sd_end_flush,
 };
 
 /*
@@ -735,6 +739,33 @@ static int sd_issue_flush(struct device 
 	return sd_sync_cache(sdp);
 }
 
+static void sd_end_flush(request_queue_t *q, struct request *flush_rq)
+{
+	struct request *rq = flush_rq->end_io_data;
+	struct scsi_cmnd *cmd = rq->special;
+	unsigned int bytes = rq->hard_nr_sectors << 9;
+
+	if (!flush_rq->errors)
+		scsi_io_completion(cmd, bytes, 0);
+	else
+		scsi_io_completion(cmd, 0, bytes);
+}
+
+static int sd_prepare_flush(request_queue_t *q, struct request *rq)
+{
+	struct scsi_device *sdev = q->queuedata;
+	struct scsi_disk *sdkp = dev_get_drvdata(&sdev->sdev_gendev);
+
+	if (sdkp->WCE) {
+		memset(rq->cmd, 0, sizeof(rq->cmd));
+		rq->flags = REQ_BLOCK_PC | REQ_SOFTBARRIER;
+		rq->cmd[0] = SYNCHRONIZE_CACHE;
+		return 1;
+	}
+
+	return 0;
+}
+
 static void sd_rescan(struct device *dev)
 {
 	struct scsi_disk *sdkp = dev_get_drvdata(dev);
diff -puN include/linux/blkdev.h~rework-core-barrier-support include/linux/blkdev.h
--- 25/include/linux/blkdev.h~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/include/linux/blkdev.h	2005-02-24 23:13:56.000000000 -0800
@@ -275,6 +275,8 @@ struct bio_vec;
 typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
 typedef void (activity_fn) (void *data, int rw);
 typedef int (issue_flush_fn) (request_queue_t *, struct gendisk *, sector_t *);
+typedef int (prepare_flush_fn) (request_queue_t *, struct request *);
+typedef void (end_flush_fn) (request_queue_t *, struct request *);
 
 enum blk_queue_state {
 	Queue_down,
@@ -318,6 +320,8 @@ struct request_queue
 	merge_bvec_fn		*merge_bvec_fn;
 	activity_fn		*activity_fn;
 	issue_flush_fn		*issue_flush_fn;
+	prepare_flush_fn	*prepare_flush_fn;
+	end_flush_fn		*end_flush_fn;
 
 	/*
 	 * Auto-unplugging state
@@ -389,6 +393,18 @@ struct request_queue
 	unsigned int		sg_reserved_size;
 
 	struct list_head	drain_list;
+
+	/*
+	 * reserved for flush operations
+	 */
+	struct request		*flush_rq;
+	unsigned char		ordered;
+};
+
+enum {
+	QUEUE_ORDERED_NONE,
+	QUEUE_ORDERED_TAG,
+	QUEUE_ORDERED_FLUSH,
 };
 
 #define RQ_INACTIVE		(-1)
@@ -405,12 +421,13 @@ struct request_queue
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
 #define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
-#define QUEUE_FLAG_ORDERED	8	/* supports ordered writes */
-#define QUEUE_FLAG_DRAIN	9	/* draining queue for sched switch */
+#define QUEUE_FLAG_DRAIN	8	/* draining queue for sched switch */
+#define QUEUE_FLAG_FLUSH	9	/* doing barrier flush sequence */
 
 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
+#define blk_queue_flushing(q)	test_bit(QUEUE_FLAG_FLUSH, &(q)->queue_flags)
 
 #define blk_fs_request(rq)	((rq)->flags & REQ_CMD)
 #define blk_pc_request(rq)	((rq)->flags & REQ_BLOCK_PC)
@@ -611,6 +628,9 @@ extern struct backing_dev_info *blk_get_
 extern void blk_queue_ordered(request_queue_t *, int);
 extern void blk_queue_issue_flush_fn(request_queue_t *, issue_flush_fn *);
 extern int blkdev_scsi_issue_flush_fn(request_queue_t *, struct gendisk *, sector_t *);
+extern struct request *blk_start_pre_flush(request_queue_t *,struct request *);
+extern int blk_complete_barrier_rq(request_queue_t *, struct request *, int);
+extern int blk_complete_barrier_rq_locked(request_queue_t *, struct request *, int);
 
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
diff -puN include/linux/ide.h~rework-core-barrier-support include/linux/ide.h
--- 25/include/linux/ide.h~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/include/linux/ide.h	2005-02-24 23:13:56.000000000 -0800
@@ -741,7 +741,6 @@ typedef struct ide_drive_s {
 	u8	sect;		/* "real" sectors per track */
 	u8	bios_head;	/* BIOS/fdisk/LILO number of heads */
 	u8	bios_sect;	/* BIOS/fdisk/LILO sectors per track */
-	u8	doing_barrier;	/* state, 1=currently doing flush */
 
 	unsigned int	bios_cyl;	/* BIOS/fdisk/LILO number of cyls */
 	unsigned int	cyl;	/* "real" number of cyls */
@@ -1125,6 +1124,7 @@ extern ide_hwif_t ide_hwifs[];	/* maste
 extern int noautodma;
 
 extern int ide_end_request (ide_drive_t *drive, int uptodate, int nrsecs);
+extern int __ide_end_request (ide_drive_t *drive, struct request *rq, int uptodate, int nrsecs);
 
 /*
 * This is used on exit from the driver to designate the next irq handler
diff -puN include/scsi/scsi_driver.h~rework-core-barrier-support include/scsi/scsi_driver.h
--- 25/include/scsi/scsi_driver.h~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/include/scsi/scsi_driver.h	2005-02-24 23:13:56.000000000 -0800
@@ -14,6 +14,8 @@ struct scsi_driver {
 	int (*init_command)(struct scsi_cmnd *);
 	void (*rescan)(struct device *);
 	int (*issue_flush)(struct device *, sector_t *);
+	int (*prepare_flush)(struct request_queue *, struct request *);
+	void (*end_flush)(struct request_queue *, struct request *);
 };
 #define to_scsi_driver(drv) \
 	container_of((drv), struct scsi_driver, gendrv)
diff -puN include/scsi/scsi_host.h~rework-core-barrier-support include/scsi/scsi_host.h
--- 25/include/scsi/scsi_host.h~rework-core-barrier-support	2005-02-24 23:13:56.000000000 -0800
+++ 25-akpm/include/scsi/scsi_host.h	2005-02-24 23:13:56.000000000 -0800
@@ -363,6 +363,12 @@ struct scsi_host_template {
 	unsigned skip_settle_delay:1;
 
 	/*
+	 * ordered write support
+	 */
+	unsigned ordered_flush:1;
+	unsigned ordered_tag:1;
+
+	/*
 	 * Countdown for host blocking with no commands outstanding
 	 */
 	unsigned int max_host_blocked;
@@ -502,6 +508,12 @@ struct Scsi_Host {
 	unsigned reverse_ordering:1;
 
 	/*
+	 * ordered write support
+	 */
+	unsigned ordered_flush:1;
+	unsigned ordered_tag:1;
+
+	/*
 	 * Host has rejected a command because it was busy.
 	 */
 	unsigned int host_blocked;
_