X-Git-Url: http://pilppa.org/gitweb/gitweb.cgi?a=blobdiff_plain;f=block%2Fblk-core.c;h=4afb39c823396ccec50c3ded2673d22ed0b07cf3;hb=b57d426445c98789265de6a9338cdb06462d15fb;hp=937f9d0b9bd59111bd2937e6958f4b1cc9dd0733;hpb=8324aa91d1e11a1fc25f209687a0b2e6c2ed47d0;p=linux-2.6-omap-h63xx.git diff --git a/block/blk-core.c b/block/blk-core.c index 937f9d0b9bd..4afb39c8233 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3,7 +3,8 @@ * Copyright (C) 1994, Karl Keyte: Added support for disk statistics * Elevator latency, (C) 2000 Andrea Arcangeli SuSE * Queue request tables / lock, selectable elevator, Jens Axboe - * kernel-doc documentation started by NeilBrown - July2000 + * kernel-doc documentation started by NeilBrown + * - July2000 * bio rewrite, highmem i/o, etc, Jens Axboe - may 2001 */ @@ -20,7 +21,6 @@ #include #include #include -#include /* for max_pfn/max_low_pfn */ #include #include #include @@ -30,24 +30,10 @@ #include #include #include -#include #include "blk.h" -/* - * for max sense size - */ -#include - -static void blk_unplug_work(struct work_struct *work); -static void blk_unplug_timeout(unsigned long data); -static void drive_stat_acct(struct request *rq, int new_io); -static void init_request_from_bio(struct request *req, struct bio *bio); static int __make_request(struct request_queue *q, struct bio *bio); -static struct io_context *current_io_context(gfp_t gfp_flags, int node); -static void blk_recalc_rq_segments(struct request *rq); -static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, - struct bio *bio); /* * For the allocated request tables @@ -57,30 +43,29 @@ struct kmem_cache *request_cachep; /* * For queue allocation */ -struct kmem_cache *blk_requestq_cachep = NULL; - -/* - * For io context allocations - */ -static struct kmem_cache *iocontext_cachep; +struct kmem_cache *blk_requestq_cachep; /* * Controlling structure to kblockd */ static struct workqueue_struct *kblockd_workqueue; -unsigned long blk_max_low_pfn, blk_max_pfn; - -EXPORT_SYMBOL(blk_max_low_pfn); -EXPORT_SYMBOL(blk_max_pfn); - static DEFINE_PER_CPU(struct list_head, blk_cpu_done); -/* Amount of time in which a process may batch requests */ -#define BLK_BATCH_TIME (HZ/50UL) +static void drive_stat_acct(struct request *rq, int new_io) +{ + int rw = rq_data_dir(rq); -/* Number of requests a "batching" process may submit */ -#define BLK_BATCH_REQ 32 + if (!blk_fs_request(rq) || !rq->rq_disk) + return; + + if (!new_io) { + __disk_stat_inc(rq->rq_disk, merges[rw]); + } else { + disk_round_stats(rq->rq_disk); + rq->rq_disk->in_flight++; + } +} void blk_queue_congestion_threshold(struct request_queue *q) { @@ -117,113 +102,7 @@ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) } EXPORT_SYMBOL(blk_get_backing_dev_info); -/** - * blk_queue_prep_rq - set a prepare_request function for queue - * @q: queue - * @pfn: prepare_request function - * - * It's possible for a queue to register a prepare_request callback which - * is invoked before the request is handed to the request_fn. The goal of - * the function is to prepare a request for I/O, it can be used to build a - * cdb from the request data for instance. - * - */ -void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) -{ - q->prep_rq_fn = pfn; -} - -EXPORT_SYMBOL(blk_queue_prep_rq); - -/** - * blk_queue_merge_bvec - set a merge_bvec function for queue - * @q: queue - * @mbfn: merge_bvec_fn - * - * Usually queues have static limitations on the max sectors or segments that - * we can put in a request. Stacking drivers may have some settings that - * are dynamic, and thus we have to query the queue whether it is ok to - * add a new bio_vec to a bio at a given offset or not. If the block device - * has such limitations, it needs to register a merge_bvec_fn to control - * the size of bio's sent to it. Note that a block device *must* allow a - * single page to be added to an empty bio. The block device driver may want - * to use the bio_split() function to deal with these bio's. By default - * no merge_bvec_fn is defined for a queue, and only the fixed limits are - * honored. - */ -void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) -{ - q->merge_bvec_fn = mbfn; -} - -EXPORT_SYMBOL(blk_queue_merge_bvec); - -void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) -{ - q->softirq_done_fn = fn; -} - -EXPORT_SYMBOL(blk_queue_softirq_done); - -/** - * blk_queue_make_request - define an alternate make_request function for a device - * @q: the request queue for the device to be affected - * @mfn: the alternate make_request function - * - * Description: - * The normal way for &struct bios to be passed to a device - * driver is for them to be collected into requests on a request - * queue, and then to allow the device driver to select requests - * off that queue when it is ready. This works well for many block - * devices. However some block devices (typically virtual devices - * such as md or lvm) do not benefit from the processing on the - * request queue, and are served best by having the requests passed - * directly to them. This can be achieved by providing a function - * to blk_queue_make_request(). - * - * Caveat: - * The driver that does this *must* be able to deal appropriately - * with buffers in "highmemory". This can be accomplished by either calling - * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling - * blk_queue_bounce() to create a buffer in normal memory. - **/ -void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn) -{ - /* - * set defaults - */ - q->nr_requests = BLKDEV_MAX_RQ; - blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); - blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); - q->make_request_fn = mfn; - q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - q->backing_dev_info.state = 0; - q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; - blk_queue_max_sectors(q, SAFE_MAX_SECTORS); - blk_queue_hardsect_size(q, 512); - blk_queue_dma_alignment(q, 511); - blk_queue_congestion_threshold(q); - q->nr_batching = BLK_BATCH_REQ; - - q->unplug_thresh = 4; /* hmm */ - q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ - if (q->unplug_delay == 0) - q->unplug_delay = 1; - - INIT_WORK(&q->unplug_work, blk_unplug_work); - - q->unplug_timer.function = blk_unplug_timeout; - q->unplug_timer.data = (unsigned long)q; - - /* - * by default assume old behaviour and bounce for any highmem page - */ - blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); -} - -EXPORT_SYMBOL(blk_queue_make_request); - -static void rq_init(struct request_queue *q, struct request *rq) +void rq_init(struct request_queue *q, struct request *rq) { INIT_LIST_HEAD(&rq->queuelist); INIT_LIST_HEAD(&rq->donelist); @@ -244,960 +123,66 @@ static void rq_init(struct request_queue *q, struct request *rq) rq->end_io = NULL; rq->end_io_data = NULL; rq->completion_data = NULL; - rq->next_rq = NULL; -} - -/** - * blk_queue_ordered - does this queue support ordered writes - * @q: the request queue - * @ordered: one of QUEUE_ORDERED_* - * @prepare_flush_fn: rq setup helper for cache flush ordered writes - * - * Description: - * For journalled file systems, doing ordered writes on a commit - * block instead of explicitly doing wait_on_buffer (which is bad - * for performance) can be a big win. Block drivers supporting this - * feature should call this function and indicate so. - * - **/ -int blk_queue_ordered(struct request_queue *q, unsigned ordered, - prepare_flush_fn *prepare_flush_fn) -{ - if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) && - prepare_flush_fn == NULL) { - printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n"); - return -EINVAL; - } - - if (ordered != QUEUE_ORDERED_NONE && - ordered != QUEUE_ORDERED_DRAIN && - ordered != QUEUE_ORDERED_DRAIN_FLUSH && - ordered != QUEUE_ORDERED_DRAIN_FUA && - ordered != QUEUE_ORDERED_TAG && - ordered != QUEUE_ORDERED_TAG_FLUSH && - ordered != QUEUE_ORDERED_TAG_FUA) { - printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); - return -EINVAL; - } - - q->ordered = ordered; - q->next_ordered = ordered; - q->prepare_flush_fn = prepare_flush_fn; - - return 0; -} - -EXPORT_SYMBOL(blk_queue_ordered); - -/* - * Cache flushing for ordered writes handling - */ -inline unsigned blk_ordered_cur_seq(struct request_queue *q) -{ - if (!q->ordseq) - return 0; - return 1 << ffz(q->ordseq); -} - -unsigned blk_ordered_req_seq(struct request *rq) -{ - struct request_queue *q = rq->q; - - BUG_ON(q->ordseq == 0); - - if (rq == &q->pre_flush_rq) - return QUEUE_ORDSEQ_PREFLUSH; - if (rq == &q->bar_rq) - return QUEUE_ORDSEQ_BAR; - if (rq == &q->post_flush_rq) - return QUEUE_ORDSEQ_POSTFLUSH; - - /* - * !fs requests don't need to follow barrier ordering. Always - * put them at the front. This fixes the following deadlock. - * - * http://thread.gmane.org/gmane.linux.kernel/537473 - */ - if (!blk_fs_request(rq)) - return QUEUE_ORDSEQ_DRAIN; - - if ((rq->cmd_flags & REQ_ORDERED_COLOR) == - (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) - return QUEUE_ORDSEQ_DRAIN; - else - return QUEUE_ORDSEQ_DONE; -} - -void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) -{ - struct request *rq; - - if (error && !q->orderr) - q->orderr = error; - - BUG_ON(q->ordseq & seq); - q->ordseq |= seq; - - if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) - return; - - /* - * Okay, sequence complete. - */ - q->ordseq = 0; - rq = q->orig_bar_rq; - - if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq))) - BUG(); -} - -static void pre_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); -} - -static void bar_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); -} - -static void post_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); -} - -static void queue_flush(struct request_queue *q, unsigned which) -{ - struct request *rq; - rq_end_io_fn *end_io; - - if (which == QUEUE_ORDERED_PREFLUSH) { - rq = &q->pre_flush_rq; - end_io = pre_flush_end_io; - } else { - rq = &q->post_flush_rq; - end_io = post_flush_end_io; - } - - rq->cmd_flags = REQ_HARDBARRIER; - rq_init(q, rq); - rq->elevator_private = NULL; - rq->elevator_private2 = NULL; - rq->rq_disk = q->bar_rq.rq_disk; - rq->end_io = end_io; - q->prepare_flush_fn(q, rq); - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); -} - -static inline struct request *start_ordered(struct request_queue *q, - struct request *rq) -{ - q->orderr = 0; - q->ordered = q->next_ordered; - q->ordseq |= QUEUE_ORDSEQ_STARTED; - - /* - * Prep proxy barrier request. - */ - blkdev_dequeue_request(rq); - q->orig_bar_rq = rq; - rq = &q->bar_rq; - rq->cmd_flags = 0; - rq_init(q, rq); - if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) - rq->cmd_flags |= REQ_RW; - if (q->ordered & QUEUE_ORDERED_FUA) - rq->cmd_flags |= REQ_FUA; - rq->elevator_private = NULL; - rq->elevator_private2 = NULL; - init_request_from_bio(rq, q->orig_bar_rq->bio); - rq->end_io = bar_end_io; - - /* - * Queue ordered sequence. As we stack them at the head, we - * need to queue in reverse order. Note that we rely on that - * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs - * request gets inbetween ordered sequence. If this request is - * an empty barrier, we don't need to do a postflush ever since - * there will be no data written between the pre and post flush. - * Hence a single flush will suffice. - */ - if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq)) - queue_flush(q, QUEUE_ORDERED_POSTFLUSH); - else - q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); - - if (q->ordered & QUEUE_ORDERED_PREFLUSH) { - queue_flush(q, QUEUE_ORDERED_PREFLUSH); - rq = &q->pre_flush_rq; - } else - q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; - - if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0) - q->ordseq |= QUEUE_ORDSEQ_DRAIN; - else - rq = NULL; - - return rq; -} - -int blk_do_ordered(struct request_queue *q, struct request **rqp) -{ - struct request *rq = *rqp; - const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); - - if (!q->ordseq) { - if (!is_barrier) - return 1; - - if (q->next_ordered != QUEUE_ORDERED_NONE) { - *rqp = start_ordered(q, rq); - return 1; - } else { - /* - * This can happen when the queue switches to - * ORDERED_NONE while this request is on it. - */ - blkdev_dequeue_request(rq); - if (__blk_end_request(rq, -EOPNOTSUPP, - blk_rq_bytes(rq))) - BUG(); - *rqp = NULL; - return 0; - } - } - - /* - * Ordered sequence in progress - */ - - /* Special requests are not subject to ordering rules. */ - if (!blk_fs_request(rq) && - rq != &q->pre_flush_rq && rq != &q->post_flush_rq) - return 1; - - if (q->ordered & QUEUE_ORDERED_TAG) { - /* Ordered by tag. Blocking the next barrier is enough. */ - if (is_barrier && rq != &q->bar_rq) - *rqp = NULL; - } else { - /* Ordered by draining. Wait for turn. */ - WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); - if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) - *rqp = NULL; - } - - return 1; -} - -static void req_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, int error) -{ - struct request_queue *q = rq->q; - - if (&q->bar_rq != rq) { - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; - - if (unlikely(nbytes > bio->bi_size)) { - printk("%s: want %u bytes done, only %u left\n", - __FUNCTION__, nbytes, bio->bi_size); - nbytes = bio->bi_size; - } - - bio->bi_size -= nbytes; - bio->bi_sector += (nbytes >> 9); - if (bio->bi_size == 0) - bio_endio(bio, error); - } else { - - /* - * Okay, this is the barrier request in progress, just - * record the error; - */ - if (error && !q->orderr) - q->orderr = error; - } -} - -/** - * blk_queue_bounce_limit - set bounce buffer limit for queue - * @q: the request queue for the device - * @dma_addr: bus address limit - * - * Description: - * Different hardware can have different requirements as to what pages - * it can do I/O directly to. A low level driver can call - * blk_queue_bounce_limit to have lower memory pages allocated as bounce - * buffers for doing I/O to pages residing above @page. - **/ -void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) -{ - unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; - int dma = 0; - - q->bounce_gfp = GFP_NOIO; -#if BITS_PER_LONG == 64 - /* Assume anything <= 4GB can be handled by IOMMU. - Actually some IOMMUs can handle everything, but I don't - know of a way to test this here. */ - if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) - dma = 1; - q->bounce_pfn = max_low_pfn; -#else - if (bounce_pfn < blk_max_low_pfn) - dma = 1; - q->bounce_pfn = bounce_pfn; -#endif - if (dma) { - init_emergency_isa_pool(); - q->bounce_gfp = GFP_NOIO | GFP_DMA; - q->bounce_pfn = bounce_pfn; - } -} - -EXPORT_SYMBOL(blk_queue_bounce_limit); - -/** - * blk_queue_max_sectors - set max sectors for a request for this queue - * @q: the request queue for the device - * @max_sectors: max sectors in the usual 512b unit - * - * Description: - * Enables a low level driver to set an upper limit on the size of - * received requests. - **/ -void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) -{ - if ((max_sectors << 9) < PAGE_CACHE_SIZE) { - max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); - printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); - } - - if (BLK_DEF_MAX_SECTORS > max_sectors) - q->max_hw_sectors = q->max_sectors = max_sectors; - else { - q->max_sectors = BLK_DEF_MAX_SECTORS; - q->max_hw_sectors = max_sectors; - } -} - -EXPORT_SYMBOL(blk_queue_max_sectors); - -/** - * blk_queue_max_phys_segments - set max phys segments for a request for this queue - * @q: the request queue for the device - * @max_segments: max number of segments - * - * Description: - * Enables a low level driver to set an upper limit on the number of - * physical data segments in a request. This would be the largest sized - * scatter list the driver could handle. - **/ -void blk_queue_max_phys_segments(struct request_queue *q, - unsigned short max_segments) -{ - if (!max_segments) { - max_segments = 1; - printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); - } - - q->max_phys_segments = max_segments; -} - -EXPORT_SYMBOL(blk_queue_max_phys_segments); - -/** - * blk_queue_max_hw_segments - set max hw segments for a request for this queue - * @q: the request queue for the device - * @max_segments: max number of segments - * - * Description: - * Enables a low level driver to set an upper limit on the number of - * hw data segments in a request. This would be the largest number of - * address/length pairs the host adapter can actually give as once - * to the device. - **/ -void blk_queue_max_hw_segments(struct request_queue *q, - unsigned short max_segments) -{ - if (!max_segments) { - max_segments = 1; - printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); - } - - q->max_hw_segments = max_segments; -} - -EXPORT_SYMBOL(blk_queue_max_hw_segments); - -/** - * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg - * @q: the request queue for the device - * @max_size: max size of segment in bytes - * - * Description: - * Enables a low level driver to set an upper limit on the size of a - * coalesced segment - **/ -void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) -{ - if (max_size < PAGE_CACHE_SIZE) { - max_size = PAGE_CACHE_SIZE; - printk("%s: set to minimum %d\n", __FUNCTION__, max_size); - } - - q->max_segment_size = max_size; -} - -EXPORT_SYMBOL(blk_queue_max_segment_size); - -/** - * blk_queue_hardsect_size - set hardware sector size for the queue - * @q: the request queue for the device - * @size: the hardware sector size, in bytes - * - * Description: - * This should typically be set to the lowest possible sector size - * that the hardware can operate on (possible without reverting to - * even internal read-modify-write operations). Usually the default - * of 512 covers most hardware. - **/ -void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) -{ - q->hardsect_size = size; -} - -EXPORT_SYMBOL(blk_queue_hardsect_size); - -/* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - -/** - * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers - * @t: the stacking driver (top) - * @b: the underlying device (bottom) - **/ -void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) -{ - /* zero is "infinity" */ - t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors); - t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors); - - t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); - t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); - t->max_segment_size = min(t->max_segment_size,b->max_segment_size); - t->hardsect_size = max(t->hardsect_size,b->hardsect_size); - if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) - clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); -} - -EXPORT_SYMBOL(blk_queue_stack_limits); - -/** - * blk_queue_dma_drain - Set up a drain buffer for excess dma. - * - * @q: the request queue for the device - * @buf: physically contiguous buffer - * @size: size of the buffer in bytes - * - * Some devices have excess DMA problems and can't simply discard (or - * zero fill) the unwanted piece of the transfer. They have to have a - * real area of memory to transfer it into. The use case for this is - * ATAPI devices in DMA mode. If the packet command causes a transfer - * bigger than the transfer size some HBAs will lock up if there - * aren't DMA elements to contain the excess transfer. What this API - * does is adjust the queue so that the buf is always appended - * silently to the scatterlist. - * - * Note: This routine adjusts max_hw_segments to make room for - * appending the drain buffer. If you call - * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after - * calling this routine, you must set the limit to one fewer than your - * device can support otherwise there won't be room for the drain - * buffer. - */ -int blk_queue_dma_drain(struct request_queue *q, void *buf, - unsigned int size) -{ - if (q->max_hw_segments < 2 || q->max_phys_segments < 2) - return -EINVAL; - /* make room for appending the drain */ - --q->max_hw_segments; - --q->max_phys_segments; - q->dma_drain_buffer = buf; - q->dma_drain_size = size; - - return 0; -} - -EXPORT_SYMBOL_GPL(blk_queue_dma_drain); - -/** - * blk_queue_segment_boundary - set boundary rules for segment merging - * @q: the request queue for the device - * @mask: the memory boundary mask - **/ -void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) -{ - if (mask < PAGE_CACHE_SIZE - 1) { - mask = PAGE_CACHE_SIZE - 1; - printk("%s: set to minimum %lx\n", __FUNCTION__, mask); - } - - q->seg_boundary_mask = mask; -} - -EXPORT_SYMBOL(blk_queue_segment_boundary); - -/** - * blk_queue_dma_alignment - set dma length and memory alignment - * @q: the request queue for the device - * @mask: alignment mask - * - * description: - * set required memory and length aligment for direct dma transactions. - * this is used when buiding direct io requests for the queue. - * - **/ -void blk_queue_dma_alignment(struct request_queue *q, int mask) -{ - q->dma_alignment = mask; -} - -EXPORT_SYMBOL(blk_queue_dma_alignment); - -/** - * blk_queue_update_dma_alignment - update dma length and memory alignment - * @q: the request queue for the device - * @mask: alignment mask - * - * description: - * update required memory and length aligment for direct dma transactions. - * If the requested alignment is larger than the current alignment, then - * the current queue alignment is updated to the new value, otherwise it - * is left alone. The design of this is to allow multiple objects - * (driver, device, transport etc) to set their respective - * alignments without having them interfere. - * - **/ -void blk_queue_update_dma_alignment(struct request_queue *q, int mask) -{ - BUG_ON(mask > PAGE_SIZE); - - if (mask > q->dma_alignment) - q->dma_alignment = mask; -} - -EXPORT_SYMBOL(blk_queue_update_dma_alignment); - -void blk_dump_rq_flags(struct request *rq, char *msg) -{ - int bit; - - printk("%s: dev %s: type=%x, flags=%x\n", msg, - rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, - rq->cmd_flags); - - printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector, - rq->nr_sectors, - rq->current_nr_sectors); - printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len); - - if (blk_pc_request(rq)) { - printk("cdb: "); - for (bit = 0; bit < sizeof(rq->cmd); bit++) - printk("%02x ", rq->cmd[bit]); - printk("\n"); - } -} - -EXPORT_SYMBOL(blk_dump_rq_flags); - -void blk_recount_segments(struct request_queue *q, struct bio *bio) -{ - struct request rq; - struct bio *nxt = bio->bi_next; - rq.q = q; - rq.bio = rq.biotail = bio; - bio->bi_next = NULL; - blk_recalc_rq_segments(&rq); - bio->bi_next = nxt; - bio->bi_phys_segments = rq.nr_phys_segments; - bio->bi_hw_segments = rq.nr_hw_segments; - bio->bi_flags |= (1 << BIO_SEG_VALID); -} -EXPORT_SYMBOL(blk_recount_segments); - -static void blk_recalc_rq_segments(struct request *rq) -{ - int nr_phys_segs; - int nr_hw_segs; - unsigned int phys_size; - unsigned int hw_size; - struct bio_vec *bv, *bvprv = NULL; - int seg_size; - int hw_seg_size; - int cluster; - struct req_iterator iter; - int high, highprv = 1; - struct request_queue *q = rq->q; - - if (!rq->bio) - return; - - cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); - hw_seg_size = seg_size = 0; - phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; - rq_for_each_segment(bv, rq, iter) { - /* - * the trick here is making sure that a high page is never - * considered part of another segment, since that might - * change with the bounce page. - */ - high = page_to_pfn(bv->bv_page) > q->bounce_pfn; - if (high || highprv) - goto new_hw_segment; - if (cluster) { - if (seg_size + bv->bv_len > q->max_segment_size) - goto new_segment; - if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) - goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) - goto new_segment; - if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) - goto new_hw_segment; - - seg_size += bv->bv_len; - hw_seg_size += bv->bv_len; - bvprv = bv; - continue; - } -new_segment: - if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) && - !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) - hw_seg_size += bv->bv_len; - else { -new_hw_segment: - if (nr_hw_segs == 1 && - hw_seg_size > rq->bio->bi_hw_front_size) - rq->bio->bi_hw_front_size = hw_seg_size; - hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len; - nr_hw_segs++; - } - - nr_phys_segs++; - bvprv = bv; - seg_size = bv->bv_len; - highprv = high; - } - - if (nr_hw_segs == 1 && - hw_seg_size > rq->bio->bi_hw_front_size) - rq->bio->bi_hw_front_size = hw_seg_size; - if (hw_seg_size > rq->biotail->bi_hw_back_size) - rq->biotail->bi_hw_back_size = hw_seg_size; - rq->nr_phys_segments = nr_phys_segs; - rq->nr_hw_segments = nr_hw_segs; -} - -static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, - struct bio *nxt) -{ - if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER))) - return 0; - - if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt))) - return 0; - if (bio->bi_size + nxt->bi_size > q->max_segment_size) - return 0; - - /* - * bio and nxt are contigous in memory, check if the queue allows - * these two to be merged into one - */ - if (BIO_SEG_BOUNDARY(q, bio, nxt)) - return 1; - - return 0; -} - -static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio, - struct bio *nxt) -{ - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) - blk_recount_segments(q, bio); - if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID))) - blk_recount_segments(q, nxt); - if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) || - BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size)) - return 0; - if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size) - return 0; - - return 1; -} - -/* - * map a request to scatterlist, return number of sg entries setup. Caller - * must make sure sg can hold rq->nr_phys_segments entries - */ -int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist) -{ - struct bio_vec *bvec, *bvprv; - struct req_iterator iter; - struct scatterlist *sg; - int nsegs, cluster; - - nsegs = 0; - cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); - - /* - * for each bio in rq - */ - bvprv = NULL; - sg = NULL; - rq_for_each_segment(bvec, rq, iter) { - int nbytes = bvec->bv_len; - - if (bvprv && cluster) { - if (sg->length + nbytes > q->max_segment_size) - goto new_segment; - - if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) - goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) - goto new_segment; - - sg->length += nbytes; - } else { -new_segment: - if (!sg) - sg = sglist; - else { - /* - * If the driver previously mapped a shorter - * list, we could see a termination bit - * prematurely unless it fully inits the sg - * table on each mapping. We KNOW that there - * must be more entries here or the driver - * would be buggy, so force clear the - * termination bit to avoid doing a full - * sg_init_table() in drivers for each command. - */ - sg->page_link &= ~0x02; - sg = sg_next(sg); - } - - sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset); - nsegs++; - } - bvprv = bvec; - } /* segments in rq */ - - if (q->dma_drain_size) { - sg->page_link &= ~0x02; - sg = sg_next(sg); - sg_set_page(sg, virt_to_page(q->dma_drain_buffer), - q->dma_drain_size, - ((unsigned long)q->dma_drain_buffer) & - (PAGE_SIZE - 1)); - nsegs++; - } - - if (sg) - sg_mark_end(sg); - - return nsegs; -} - -EXPORT_SYMBOL(blk_rq_map_sg); - -/* - * the standard queue merge functions, can be overridden with device - * specific ones if so desired - */ - -static inline int ll_new_mergeable(struct request_queue *q, - struct request *req, - struct bio *bio) -{ - int nr_phys_segs = bio_phys_segments(q, bio); - - if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; - return 0; - } - - /* - * A hw segment is just getting larger, bump just the phys - * counter. - */ - req->nr_phys_segments += nr_phys_segs; - return 1; -} - -static inline int ll_new_hw_segment(struct request_queue *q, - struct request *req, - struct bio *bio) -{ - int nr_hw_segs = bio_hw_segments(q, bio); - int nr_phys_segs = bio_phys_segments(q, bio); - - if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments - || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; - return 0; - } - - /* - * This will form the start of a new hw segment. Bump both - * counters. - */ - req->nr_hw_segments += nr_hw_segs; - req->nr_phys_segments += nr_phys_segs; - return 1; + rq->next_rq = NULL; } -static int ll_back_merge_fn(struct request_queue *q, struct request *req, - struct bio *bio) +static void req_bio_endio(struct request *rq, struct bio *bio, + unsigned int nbytes, int error) { - unsigned short max_sectors; - int len; + struct request_queue *q = rq->q; - if (unlikely(blk_pc_request(req))) - max_sectors = q->max_hw_sectors; - else - max_sectors = q->max_sectors; + if (&q->bar_rq != rq) { + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; - if (req->nr_sectors + bio_sectors(bio) > max_sectors) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; - return 0; - } - if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID))) - blk_recount_segments(q, req->biotail); - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) - blk_recount_segments(q, bio); - len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size; - if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) && - !BIOVEC_VIRT_OVERSIZE(len)) { - int mergeable = ll_new_mergeable(q, req, bio); - - if (mergeable) { - if (req->nr_hw_segments == 1) - req->bio->bi_hw_front_size = len; - if (bio->bi_hw_segments == 1) - bio->bi_hw_back_size = len; + if (unlikely(nbytes > bio->bi_size)) { + printk(KERN_ERR "%s: want %u bytes done, %u left\n", + __FUNCTION__, nbytes, bio->bi_size); + nbytes = bio->bi_size; } - return mergeable; - } - - return ll_new_hw_segment(q, req, bio); -} - -static int ll_front_merge_fn(struct request_queue *q, struct request *req, - struct bio *bio) -{ - unsigned short max_sectors; - int len; - - if (unlikely(blk_pc_request(req))) - max_sectors = q->max_hw_sectors; - else - max_sectors = q->max_sectors; + bio->bi_size -= nbytes; + bio->bi_sector += (nbytes >> 9); + if (bio->bi_size == 0) + bio_endio(bio, error); + } else { - if (req->nr_sectors + bio_sectors(bio) > max_sectors) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; - return 0; - } - len = bio->bi_hw_back_size + req->bio->bi_hw_front_size; - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) - blk_recount_segments(q, bio); - if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID))) - blk_recount_segments(q, req->bio); - if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) && - !BIOVEC_VIRT_OVERSIZE(len)) { - int mergeable = ll_new_mergeable(q, req, bio); - - if (mergeable) { - if (bio->bi_hw_segments == 1) - bio->bi_hw_front_size = len; - if (req->nr_hw_segments == 1) - req->biotail->bi_hw_back_size = len; - } - return mergeable; + /* + * Okay, this is the barrier request in progress, just + * record the error; + */ + if (error && !q->orderr) + q->orderr = error; } - - return ll_new_hw_segment(q, req, bio); } -static int ll_merge_requests_fn(struct request_queue *q, struct request *req, - struct request *next) +void blk_dump_rq_flags(struct request *rq, char *msg) { - int total_phys_segments; - int total_hw_segments; - - /* - * First check if the either of the requests are re-queued - * requests. Can't merge them if they are. - */ - if (req->special || next->special) - return 0; - - /* - * Will it become too large? - */ - if ((req->nr_sectors + next->nr_sectors) > q->max_sectors) - return 0; + int bit; - total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; - if (blk_phys_contig_segment(q, req->biotail, next->bio)) - total_phys_segments--; + printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, + rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, + rq->cmd_flags); - if (total_phys_segments > q->max_phys_segments) - return 0; + printk(KERN_INFO " sector %llu, nr/cnr %lu/%u\n", + (unsigned long long)rq->sector, + rq->nr_sectors, + rq->current_nr_sectors); + printk(KERN_INFO " bio %p, biotail %p, buffer %p, data %p, len %u\n", + rq->bio, rq->biotail, + rq->buffer, rq->data, + rq->data_len); - total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; - if (blk_hw_contig_segment(q, req->biotail, next->bio)) { - int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size; - /* - * propagate the combined length to the end of the requests - */ - if (req->nr_hw_segments == 1) - req->bio->bi_hw_front_size = len; - if (next->nr_hw_segments == 1) - next->biotail->bi_hw_back_size = len; - total_hw_segments--; + if (blk_pc_request(rq)) { + printk(KERN_INFO " cdb: "); + for (bit = 0; bit < sizeof(rq->cmd); bit++) + printk("%02x ", rq->cmd[bit]); + printk("\n"); } - - if (total_hw_segments > q->max_hw_segments) - return 0; - - /* Merge is OK... */ - req->nr_phys_segments = total_phys_segments; - req->nr_hw_segments = total_hw_segments; - return 1; } +EXPORT_SYMBOL(blk_dump_rq_flags); /* * "plug" the device if there are no outstanding requests: this will @@ -1223,7 +208,6 @@ void blk_plug_device(struct request_queue *q) blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); } } - EXPORT_SYMBOL(blk_plug_device); /* @@ -1240,7 +224,6 @@ int blk_remove_plug(struct request_queue *q) del_timer(&q->unplug_timer); return 1; } - EXPORT_SYMBOL(blk_remove_plug); /* @@ -1285,7 +268,7 @@ static void blk_backing_dev_unplug(struct backing_dev_info *bdi, blk_unplug(q); } -static void blk_unplug_work(struct work_struct *work) +void blk_unplug_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, unplug_work); @@ -1296,7 +279,7 @@ static void blk_unplug_work(struct work_struct *work) q->unplug_fn(q); } -static void blk_unplug_timeout(unsigned long data) +void blk_unplug_timeout(unsigned long data) { struct request_queue *q = (struct request_queue *)data; @@ -1347,7 +330,6 @@ void blk_start_queue(struct request_queue *q) kblockd_schedule_work(&q->unplug_work); } } - EXPORT_SYMBOL(blk_start_queue); /** @@ -1427,7 +409,7 @@ void blk_put_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_put_queue); -void blk_cleanup_queue(struct request_queue * q) +void blk_cleanup_queue(struct request_queue *q) { mutex_lock(&q->sysfs_lock); set_bit(QUEUE_FLAG_DEAD, &q->queue_flags); @@ -1438,7 +420,6 @@ void blk_cleanup_queue(struct request_queue * q) blk_put_queue(q); } - EXPORT_SYMBOL(blk_cleanup_queue); static int blk_init_free_list(struct request_queue *q) @@ -1594,7 +575,6 @@ int blk_get_queue(struct request_queue *q) return 1; } - EXPORT_SYMBOL(blk_get_queue); static inline void blk_free_request(struct request_queue *q, struct request *rq) @@ -1793,7 +773,7 @@ rq_starved: */ if (ioc_batching(q, ioc)) ioc->nr_batch_requests--; - + rq_init(q, rq); blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); @@ -1871,504 +851,100 @@ EXPORT_SYMBOL(blk_get_request); /** * blk_start_queueing - initiate dispatch of requests to device - * @q: request queue to kick into gear - * - * This is basically a helper to remove the need to know whether a queue - * is plugged or not if someone just wants to initiate dispatch of requests - * for this queue. - * - * The queue lock must be held with interrupts disabled. - */ -void blk_start_queueing(struct request_queue *q) -{ - if (!blk_queue_plugged(q)) - q->request_fn(q); - else - __generic_unplug_device(q); -} -EXPORT_SYMBOL(blk_start_queueing); - -/** - * blk_requeue_request - put a request back on queue - * @q: request queue where request should be inserted - * @rq: request to be inserted - * - * Description: - * Drivers often keep queueing requests until the hardware cannot accept - * more, when that condition happens we need to put the request back - * on the queue. Must be called with queue lock held. - */ -void blk_requeue_request(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); - - if (blk_rq_tagged(rq)) - blk_queue_end_tag(q, rq); - - elv_requeue_request(q, rq); -} - -EXPORT_SYMBOL(blk_requeue_request); - -/** - * blk_insert_request - insert a special request in to a request queue - * @q: request queue where request should be inserted - * @rq: request to be inserted - * @at_head: insert request at head or tail of queue - * @data: private data - * - * Description: - * Many block devices need to execute commands asynchronously, so they don't - * block the whole kernel from preemption during request execution. This is - * accomplished normally by inserting aritficial requests tagged as - * REQ_SPECIAL in to the corresponding request queue, and letting them be - * scheduled for actual execution by the request queue. - * - * We have the option of inserting the head or the tail of the queue. - * Typically we use the tail for new ioctls and so forth. We use the head - * of the queue for things like a QUEUE_FULL message from a device, or a - * host that is unable to accept a particular command. - */ -void blk_insert_request(struct request_queue *q, struct request *rq, - int at_head, void *data) -{ - int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - unsigned long flags; - - /* - * tell I/O scheduler that this isn't a regular read/write (ie it - * must not attempt merges on this) and that it acts as a soft - * barrier - */ - rq->cmd_type = REQ_TYPE_SPECIAL; - rq->cmd_flags |= REQ_SOFTBARRIER; - - rq->special = data; - - spin_lock_irqsave(q->queue_lock, flags); - - /* - * If command is tagged, release the tag - */ - if (blk_rq_tagged(rq)) - blk_queue_end_tag(q, rq); - - drive_stat_acct(rq, 1); - __elv_add_request(q, rq, where, 0); - blk_start_queueing(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} - -EXPORT_SYMBOL(blk_insert_request); - -static int __blk_rq_unmap_user(struct bio *bio) -{ - int ret = 0; - - if (bio) { - if (bio_flagged(bio, BIO_USER_MAPPED)) - bio_unmap_user(bio); - else - ret = bio_uncopy_user(bio); - } - - return ret; -} - -int blk_rq_append_bio(struct request_queue *q, struct request *rq, - struct bio *bio) -{ - if (!rq->bio) - blk_rq_bio_prep(q, rq, bio); - else if (!ll_back_merge_fn(q, rq, bio)) - return -EINVAL; - else { - rq->biotail->bi_next = bio; - rq->biotail = bio; - - rq->data_len += bio->bi_size; - } - return 0; -} -EXPORT_SYMBOL(blk_rq_append_bio); - -static int __blk_rq_map_user(struct request_queue *q, struct request *rq, - void __user *ubuf, unsigned int len) -{ - unsigned long uaddr; - struct bio *bio, *orig_bio; - int reading, ret; - - reading = rq_data_dir(rq) == READ; - - /* - * if alignment requirement is satisfied, map in user pages for - * direct dma. else, set up kernel bounce buffers - */ - uaddr = (unsigned long) ubuf; - if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) - bio = bio_map_user(q, NULL, uaddr, len, reading); - else - bio = bio_copy_user(q, uaddr, len, reading); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - orig_bio = bio; - blk_queue_bounce(q, &bio); - - /* - * We link the bounce buffer in and could have to traverse it - * later so we have to get a ref to prevent it from being freed - */ - bio_get(bio); - - ret = blk_rq_append_bio(q, rq, bio); - if (!ret) - return bio->bi_size; - - /* if it was boucned we must call the end io function */ - bio_endio(bio, 0); - __blk_rq_unmap_user(orig_bio); - bio_put(bio); - return ret; -} - -/** - * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage - * @q: request queue where request should be inserted - * @rq: request structure to fill - * @ubuf: the user buffer - * @len: length of user data - * - * Description: - * Data will be mapped directly for zero copy io, if possible. Otherwise - * a kernel bounce buffer is used. - * - * A matching blk_rq_unmap_user() must be issued at the end of io, while - * still in process context. - * - * Note: The mapped bio may need to be bounced through blk_queue_bounce() - * before being submitted to the device, as pages mapped may be out of - * reach. It's the callers responsibility to make sure this happens. The - * original bio must be passed back in to blk_rq_unmap_user() for proper - * unmapping. - */ -int blk_rq_map_user(struct request_queue *q, struct request *rq, - void __user *ubuf, unsigned long len) -{ - unsigned long bytes_read = 0; - struct bio *bio = NULL; - int ret; - - if (len > (q->max_hw_sectors << 9)) - return -EINVAL; - if (!len || !ubuf) - return -EINVAL; - - while (bytes_read != len) { - unsigned long map_len, end, start; - - map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); - end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) - >> PAGE_SHIFT; - start = (unsigned long)ubuf >> PAGE_SHIFT; - - /* - * A bad offset could cause us to require BIO_MAX_PAGES + 1 - * pages. If this happens we just lower the requested - * mapping len by a page so that we can fit - */ - if (end - start > BIO_MAX_PAGES) - map_len -= PAGE_SIZE; - - ret = __blk_rq_map_user(q, rq, ubuf, map_len); - if (ret < 0) - goto unmap_rq; - if (!bio) - bio = rq->bio; - bytes_read += ret; - ubuf += ret; - } - - rq->buffer = rq->data = NULL; - return 0; -unmap_rq: - blk_rq_unmap_user(bio); - return ret; -} - -EXPORT_SYMBOL(blk_rq_map_user); - -/** - * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage - * @q: request queue where request should be inserted - * @rq: request to map data to - * @iov: pointer to the iovec - * @iov_count: number of elements in the iovec - * @len: I/O byte count - * - * Description: - * Data will be mapped directly for zero copy io, if possible. Otherwise - * a kernel bounce buffer is used. - * - * A matching blk_rq_unmap_user() must be issued at the end of io, while - * still in process context. - * - * Note: The mapped bio may need to be bounced through blk_queue_bounce() - * before being submitted to the device, as pages mapped may be out of - * reach. It's the callers responsibility to make sure this happens. The - * original bio must be passed back in to blk_rq_unmap_user() for proper - * unmapping. - */ -int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, - struct sg_iovec *iov, int iov_count, unsigned int len) -{ - struct bio *bio; - - if (!iov || iov_count <= 0) - return -EINVAL; - - /* we don't allow misaligned data like bio_map_user() does. If the - * user is using sg, they're expected to know the alignment constraints - * and respect them accordingly */ - bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (bio->bi_size != len) { - bio_endio(bio, 0); - bio_unmap_user(bio); - return -EINVAL; - } - - bio_get(bio); - blk_rq_bio_prep(q, rq, bio); - rq->buffer = rq->data = NULL; - return 0; -} - -EXPORT_SYMBOL(blk_rq_map_user_iov); - -/** - * blk_rq_unmap_user - unmap a request with user data - * @bio: start of bio list - * - * Description: - * Unmap a rq previously mapped by blk_rq_map_user(). The caller must - * supply the original rq->bio from the blk_rq_map_user() return, since - * the io completion may have changed rq->bio. - */ -int blk_rq_unmap_user(struct bio *bio) -{ - struct bio *mapped_bio; - int ret = 0, ret2; - - while (bio) { - mapped_bio = bio; - if (unlikely(bio_flagged(bio, BIO_BOUNCED))) - mapped_bio = bio->bi_private; - - ret2 = __blk_rq_unmap_user(mapped_bio); - if (ret2 && !ret) - ret = ret2; - - mapped_bio = bio; - bio = bio->bi_next; - bio_put(mapped_bio); - } - - return ret; -} - -EXPORT_SYMBOL(blk_rq_unmap_user); - -/** - * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage - * @q: request queue where request should be inserted - * @rq: request to fill - * @kbuf: the kernel buffer - * @len: length of user data - * @gfp_mask: memory allocation flags - */ -int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, - unsigned int len, gfp_t gfp_mask) -{ - struct bio *bio; - - if (len > (q->max_hw_sectors << 9)) - return -EINVAL; - if (!len || !kbuf) - return -EINVAL; - - bio = bio_map_kern(q, kbuf, len, gfp_mask); - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (rq_data_dir(rq) == WRITE) - bio->bi_rw |= (1 << BIO_RW); - - blk_rq_bio_prep(q, rq, bio); - blk_queue_bounce(q, &rq->bio); - rq->buffer = rq->data = NULL; - return 0; -} - -EXPORT_SYMBOL(blk_rq_map_kern); - -/** - * blk_execute_rq_nowait - insert a request into queue for execution - * @q: queue to insert the request in - * @bd_disk: matching gendisk - * @rq: request to insert - * @at_head: insert request at head or tail of queue - * @done: I/O completion handler + * @q: request queue to kick into gear * - * Description: - * Insert a fully prepared request at the back of the io scheduler queue - * for execution. Don't wait for completion. + * This is basically a helper to remove the need to know whether a queue + * is plugged or not if someone just wants to initiate dispatch of requests + * for this queue. + * + * The queue lock must be held with interrupts disabled. */ -void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, - struct request *rq, int at_head, - rq_end_io_fn *done) +void blk_start_queueing(struct request_queue *q) { - int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - - rq->rq_disk = bd_disk; - rq->cmd_flags |= REQ_NOMERGE; - rq->end_io = done; - WARN_ON(irqs_disabled()); - spin_lock_irq(q->queue_lock); - __elv_add_request(q, rq, where, 1); - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); + if (!blk_queue_plugged(q)) + q->request_fn(q); + else + __generic_unplug_device(q); } -EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); +EXPORT_SYMBOL(blk_start_queueing); /** - * blk_execute_rq - insert a request into queue for execution - * @q: queue to insert the request in - * @bd_disk: matching gendisk - * @rq: request to insert - * @at_head: insert request at head or tail of queue + * blk_requeue_request - put a request back on queue + * @q: request queue where request should be inserted + * @rq: request to be inserted * * Description: - * Insert a fully prepared request at the back of the io scheduler queue - * for execution and wait for completion. + * Drivers often keep queueing requests until the hardware cannot accept + * more, when that condition happens we need to put the request back + * on the queue. Must be called with queue lock held. */ -int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, - struct request *rq, int at_head) +void blk_requeue_request(struct request_queue *q, struct request *rq) { - DECLARE_COMPLETION_ONSTACK(wait); - char sense[SCSI_SENSE_BUFFERSIZE]; - int err = 0; - - /* - * we need an extra reference to the request, so we can look at - * it after io completion - */ - rq->ref_count++; - - if (!rq->sense) { - memset(sense, 0, sizeof(sense)); - rq->sense = sense; - rq->sense_len = 0; - } - - rq->end_io_data = &wait; - blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); - wait_for_completion(&wait); - - if (rq->errors) - err = -EIO; - - return err; -} - -EXPORT_SYMBOL(blk_execute_rq); + blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); -static void bio_end_empty_barrier(struct bio *bio, int err) -{ - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); + if (blk_rq_tagged(rq)) + blk_queue_end_tag(q, rq); - complete(bio->bi_private); + elv_requeue_request(q, rq); } +EXPORT_SYMBOL(blk_requeue_request); /** - * blkdev_issue_flush - queue a flush - * @bdev: blockdev to issue flush for - * @error_sector: error sector + * blk_insert_request - insert a special request in to a request queue + * @q: request queue where request should be inserted + * @rq: request to be inserted + * @at_head: insert request at head or tail of queue + * @data: private data * * Description: - * Issue a flush for the block device in question. Caller can supply - * room for storing the error offset in case of a flush error, if they - * wish to. Caller must run wait_for_completion() on its own. + * Many block devices need to execute commands asynchronously, so they don't + * block the whole kernel from preemption during request execution. This is + * accomplished normally by inserting aritficial requests tagged as + * REQ_SPECIAL in to the corresponding request queue, and letting them be + * scheduled for actual execution by the request queue. + * + * We have the option of inserting the head or the tail of the queue. + * Typically we use the tail for new ioctls and so forth. We use the head + * of the queue for things like a QUEUE_FULL message from a device, or a + * host that is unable to accept a particular command. */ -int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) +void blk_insert_request(struct request_queue *q, struct request *rq, + int at_head, void *data) { - DECLARE_COMPLETION_ONSTACK(wait); - struct request_queue *q; - struct bio *bio; - int ret; - - if (bdev->bd_disk == NULL) - return -ENXIO; - - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - bio = bio_alloc(GFP_KERNEL, 0); - if (!bio) - return -ENOMEM; - - bio->bi_end_io = bio_end_empty_barrier; - bio->bi_private = &wait; - bio->bi_bdev = bdev; - submit_bio(1 << BIO_RW_BARRIER, bio); - - wait_for_completion(&wait); + int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; + unsigned long flags; /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be copied - * from rq->sector. + * tell I/O scheduler that this isn't a regular read/write (ie it + * must not attempt merges on this) and that it acts as a soft + * barrier */ - if (error_sector) - *error_sector = bio->bi_sector; - - ret = 0; - if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - - bio_put(bio); - return ret; -} + rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_flags |= REQ_SOFTBARRIER; -EXPORT_SYMBOL(blkdev_issue_flush); + rq->special = data; -static void drive_stat_acct(struct request *rq, int new_io) -{ - int rw = rq_data_dir(rq); + spin_lock_irqsave(q->queue_lock, flags); - if (!blk_fs_request(rq) || !rq->rq_disk) - return; + /* + * If command is tagged, release the tag + */ + if (blk_rq_tagged(rq)) + blk_queue_end_tag(q, rq); - if (!new_io) { - __disk_stat_inc(rq->rq_disk, merges[rw]); - } else { - disk_round_stats(rq->rq_disk); - rq->rq_disk->in_flight++; - } + drive_stat_acct(rq, 1); + __elv_add_request(q, rq, where, 0); + blk_start_queueing(q); + spin_unlock_irqrestore(q->queue_lock, flags); } +EXPORT_SYMBOL(blk_insert_request); /* * add-request adds a request to the linked list. * queue lock is held and interrupts disabled, as we muck with the * request queue list. */ -static inline void add_request(struct request_queue * q, struct request * req) +static inline void add_request(struct request_queue *q, struct request *req) { drive_stat_acct(req, 1); @@ -2378,7 +954,7 @@ static inline void add_request(struct request_queue * q, struct request * req) */ __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); } - + /* * disk_round_stats() - Round off the performance stats on a struct * disk_stats. @@ -2408,7 +984,6 @@ void disk_round_stats(struct gendisk *disk) } disk->stamp = now; } - EXPORT_SYMBOL_GPL(disk_round_stats); /* @@ -2438,7 +1013,6 @@ void __blk_put_request(struct request_queue *q, struct request *req) freed_request(q, rw, priv); } } - EXPORT_SYMBOL_GPL(__blk_put_request); void blk_put_request(struct request *req) @@ -2456,108 +1030,9 @@ void blk_put_request(struct request *req) spin_unlock_irqrestore(q->queue_lock, flags); } } - EXPORT_SYMBOL(blk_put_request); -/** - * blk_end_sync_rq - executes a completion event on a request - * @rq: request to complete - * @error: end io status of the request - */ -void blk_end_sync_rq(struct request *rq, int error) -{ - struct completion *waiting = rq->end_io_data; - - rq->end_io_data = NULL; - __blk_put_request(rq->q, rq); - - /* - * complete last, if this is a stack request the process (and thus - * the rq pointer) could be invalid right after this complete() - */ - complete(waiting); -} -EXPORT_SYMBOL(blk_end_sync_rq); - -/* - * Has to be called with the request spinlock acquired - */ -static int attempt_merge(struct request_queue *q, struct request *req, - struct request *next) -{ - if (!rq_mergeable(req) || !rq_mergeable(next)) - return 0; - - /* - * not contiguous - */ - if (req->sector + req->nr_sectors != next->sector) - return 0; - - if (rq_data_dir(req) != rq_data_dir(next) - || req->rq_disk != next->rq_disk - || next->special) - return 0; - - /* - * If we are allowed to merge, then append bio list - * from next to rq and release next. merge_requests_fn - * will have updated segment counts, update sector - * counts here. - */ - if (!ll_merge_requests_fn(q, req, next)) - return 0; - - /* - * At this point we have either done a back merge - * or front merge. We need the smaller start_time of - * the merged requests to be the current request - * for accounting purposes. - */ - if (time_after(req->start_time, next->start_time)) - req->start_time = next->start_time; - - req->biotail->bi_next = next->bio; - req->biotail = next->biotail; - - req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; - - elv_merge_requests(q, req, next); - - if (req->rq_disk) { - disk_round_stats(req->rq_disk); - req->rq_disk->in_flight--; - } - - req->ioprio = ioprio_best(req->ioprio, next->ioprio); - - __blk_put_request(q, next); - return 1; -} - -static inline int attempt_back_merge(struct request_queue *q, - struct request *rq) -{ - struct request *next = elv_latter_request(q, rq); - - if (next) - return attempt_merge(q, rq, next); - - return 0; -} - -static inline int attempt_front_merge(struct request_queue *q, - struct request *rq) -{ - struct request *prev = elv_former_request(q, rq); - - if (prev) - return attempt_merge(q, prev, rq); - - return 0; -} - -static void init_request_from_bio(struct request *req, struct bio *bio) +void init_request_from_bio(struct request *req, struct bio *bio) { req->cmd_type = REQ_TYPE_FS; @@ -2615,53 +1090,53 @@ static int __make_request(struct request_queue *q, struct bio *bio) el_ret = elv_merge(q, &req, bio); switch (el_ret) { - case ELEVATOR_BACK_MERGE: - BUG_ON(!rq_mergeable(req)); + case ELEVATOR_BACK_MERGE: + BUG_ON(!rq_mergeable(req)); - if (!ll_back_merge_fn(q, req, bio)) - break; + if (!ll_back_merge_fn(q, req, bio)) + break; - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); - req->biotail->bi_next = bio; - req->biotail = bio; - req->nr_sectors = req->hard_nr_sectors += nr_sectors; - req->ioprio = ioprio_best(req->ioprio, prio); - drive_stat_acct(req, 0); - if (!attempt_back_merge(q, req)) - elv_merged_request(q, req, el_ret); - goto out; + req->biotail->bi_next = bio; + req->biotail = bio; + req->nr_sectors = req->hard_nr_sectors += nr_sectors; + req->ioprio = ioprio_best(req->ioprio, prio); + drive_stat_acct(req, 0); + if (!attempt_back_merge(q, req)) + elv_merged_request(q, req, el_ret); + goto out; - case ELEVATOR_FRONT_MERGE: - BUG_ON(!rq_mergeable(req)); + case ELEVATOR_FRONT_MERGE: + BUG_ON(!rq_mergeable(req)); - if (!ll_front_merge_fn(q, req, bio)) - break; + if (!ll_front_merge_fn(q, req, bio)) + break; - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); - bio->bi_next = req->bio; - req->bio = bio; + bio->bi_next = req->bio; + req->bio = bio; - /* - * may not be valid. if the low level driver said - * it didn't need a bounce buffer then it better - * not touch req->buffer either... - */ - req->buffer = bio_data(bio); - req->current_nr_sectors = bio_cur_sectors(bio); - req->hard_cur_sectors = req->current_nr_sectors; - req->sector = req->hard_sector = bio->bi_sector; - req->nr_sectors = req->hard_nr_sectors += nr_sectors; - req->ioprio = ioprio_best(req->ioprio, prio); - drive_stat_acct(req, 0); - if (!attempt_front_merge(q, req)) - elv_merged_request(q, req, el_ret); - goto out; - - /* ELV_NO_MERGE: elevator says don't/can't merge. */ - default: - ; + /* + * may not be valid. if the low level driver said + * it didn't need a bounce buffer then it better + * not touch req->buffer either... + */ + req->buffer = bio_data(bio); + req->current_nr_sectors = bio_cur_sectors(bio); + req->hard_cur_sectors = req->current_nr_sectors; + req->sector = req->hard_sector = bio->bi_sector; + req->nr_sectors = req->hard_nr_sectors += nr_sectors; + req->ioprio = ioprio_best(req->ioprio, prio); + drive_stat_acct(req, 0); + if (!attempt_front_merge(q, req)) + elv_merged_request(q, req, el_ret); + goto out; + + /* ELV_NO_MERGE: elevator says don't/can't merge. */ + default: + ; } get_rq: @@ -2869,7 +1344,7 @@ end_io: } if (unlikely(nr_sectors > q->max_hw_sectors)) { - printk("bio too big device %s (%u > %u)\n", + printk(KERN_ERR "bio too big device %s (%u > %u)\n", bdevname(bio->bi_bdev, b), bio_sectors(bio), q->max_hw_sectors); @@ -2958,7 +1433,6 @@ void generic_make_request(struct bio *bio) } while (bio); current->bio_tail = NULL; /* deactivate */ } - EXPORT_SYMBOL(generic_make_request); /** @@ -2999,44 +1473,14 @@ void submit_bio(int rw, struct bio *bio) current->comm, task_pid_nr(current), (rw & WRITE) ? "WRITE" : "READ", (unsigned long long)bio->bi_sector, - bdevname(bio->bi_bdev,b)); + bdevname(bio->bi_bdev, b)); } } generic_make_request(bio); } - EXPORT_SYMBOL(submit_bio); -static void blk_recalc_rq_sectors(struct request *rq, int nsect) -{ - if (blk_fs_request(rq)) { - rq->hard_sector += nsect; - rq->hard_nr_sectors -= nsect; - - /* - * Move the I/O submission pointers ahead if required. - */ - if ((rq->nr_sectors >= rq->hard_nr_sectors) && - (rq->sector <= rq->hard_sector)) { - rq->sector = rq->hard_sector; - rq->nr_sectors = rq->hard_nr_sectors; - rq->hard_cur_sectors = bio_cur_sectors(rq->bio); - rq->current_nr_sectors = rq->hard_cur_sectors; - rq->buffer = bio_data(rq->bio); - } - - /* - * if total number of sectors is less than the first segment - * size, something has gone terribly wrong - */ - if (rq->nr_sectors < rq->current_nr_sectors) { - printk("blk: request botched\n"); - rq->nr_sectors = rq->current_nr_sectors; - } - } -} - /** * __end_that_request_first - end I/O on a request * @req: the request being processed @@ -3066,9 +1510,8 @@ static int __end_that_request_first(struct request *req, int error, if (!blk_pc_request(req)) req->errors = 0; - if (error) { - if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET)) - printk("end_request: I/O error, dev %s, sector %llu\n", + if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) { + printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", req->rq_disk ? req->rq_disk->disk_name : "?", (unsigned long long)req->sector); } @@ -3102,9 +1545,9 @@ static int __end_that_request_first(struct request *req, int error, if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { blk_dump_rq_flags(req, "__end_that"); - printk("%s: bio idx %d >= vcnt %d\n", - __FUNCTION__, - bio->bi_idx, bio->bi_vcnt); + printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n", + __FUNCTION__, bio->bi_idx, + bio->bi_vcnt); break; } @@ -3130,7 +1573,8 @@ static int __end_that_request_first(struct request *req, int error, total_bytes += nbytes; nr_bytes -= nbytes; - if ((bio = req->bio)) { + bio = req->bio; + if (bio) { /* * end more in this run, or just return 'not-done' */ @@ -3174,15 +1618,16 @@ static void blk_done_softirq(struct softirq_action *h) local_irq_enable(); while (!list_empty(&local_list)) { - struct request *rq = list_entry(local_list.next, struct request, donelist); + struct request *rq; + rq = list_entry(local_list.next, struct request, donelist); list_del_init(&rq->donelist); rq->q->softirq_done_fn(rq); } } -static int __cpuinit blk_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) +static int __cpuinit blk_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { /* * If a CPU goes away, splice its entries to the current CPU @@ -3224,7 +1669,7 @@ void blk_complete_request(struct request *req) unsigned long flags; BUG_ON(!req->q->softirq_done_fn); - + local_irq_save(flags); cpu_list = &__get_cpu_var(blk_cpu_done); @@ -3233,9 +1678,8 @@ void blk_complete_request(struct request *req) local_irq_restore(flags); } - EXPORT_SYMBOL(blk_complete_request); - + /* * queue lock must be held */ @@ -3394,8 +1838,9 @@ EXPORT_SYMBOL(end_request); * 0 - we are done with this request * 1 - this request is not freed yet, it still has pending buffers. **/ -static int blk_end_io(struct request *rq, int error, int nr_bytes, - int bidi_bytes, int (drv_callback)(struct request *)) +static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, + unsigned int bidi_bytes, + int (drv_callback)(struct request *)) { struct request_queue *q = rq->q; unsigned long flags = 0UL; @@ -3437,7 +1882,7 @@ static int blk_end_io(struct request *rq, int error, int nr_bytes, * 0 - we are done with this request * 1 - still buffers pending for this request **/ -int blk_end_request(struct request *rq, int error, int nr_bytes) +int blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { return blk_end_io(rq, error, nr_bytes, 0, NULL); } @@ -3456,7 +1901,7 @@ EXPORT_SYMBOL_GPL(blk_end_request); * 0 - we are done with this request * 1 - still buffers pending for this request **/ -int __blk_end_request(struct request *rq, int error, int nr_bytes) +int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { if (blk_fs_request(rq) || blk_pc_request(rq)) { if (__end_that_request_first(rq, error, nr_bytes)) @@ -3485,8 +1930,8 @@ EXPORT_SYMBOL_GPL(__blk_end_request); * 0 - we are done with this request * 1 - still buffers pending for this request **/ -int blk_end_bidi_request(struct request *rq, int error, int nr_bytes, - int bidi_bytes) +int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, + unsigned int bidi_bytes) { return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL); } @@ -3517,15 +1962,16 @@ EXPORT_SYMBOL_GPL(blk_end_bidi_request); * this request still has pending buffers or * the driver doesn't want to finish this request yet. **/ -int blk_end_request_callback(struct request *rq, int error, int nr_bytes, +int blk_end_request_callback(struct request *rq, int error, + unsigned int nr_bytes, int (drv_callback)(struct request *)) { return blk_end_io(rq, error, nr_bytes, 0, drv_callback); } EXPORT_SYMBOL_GPL(blk_end_request_callback); -static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, - struct bio *bio) +void blk_rq_bio_prep(struct request_queue *q, struct request *rq, + struct bio *bio) { /* first two bits are identical in rq->cmd_flags and bio->bi_rw */ rq->cmd_flags |= (bio->bi_rw & 3); @@ -3548,7 +1994,6 @@ int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); } - EXPORT_SYMBOL(kblockd_schedule_work); void kblockd_flush_work(struct work_struct *work) @@ -3571,188 +2016,12 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("blkdev_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); - iocontext_cachep = kmem_cache_create("blkdev_ioc", - sizeof(struct io_context), 0, SLAB_PANIC, NULL); - for_each_possible_cpu(i) INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); register_hotcpu_notifier(&blk_cpu_notifier); - blk_max_low_pfn = max_low_pfn - 1; - blk_max_pfn = max_pfn - 1; - - return 0; -} - -static void cfq_dtor(struct io_context *ioc) -{ - struct cfq_io_context *cic[1]; - int r; - - /* - * We don't have a specific key to lookup with, so use the gang - * lookup to just retrieve the first item stored. The cfq exit - * function will iterate the full tree, so any member will do. - */ - r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1); - if (r > 0) - cic[0]->dtor(ioc); -} - -/* - * IO Context helper functions. put_io_context() returns 1 if there are no - * more users of this io context, 0 otherwise. - */ -int put_io_context(struct io_context *ioc) -{ - if (ioc == NULL) - return 1; - - BUG_ON(atomic_read(&ioc->refcount) == 0); - - if (atomic_dec_and_test(&ioc->refcount)) { - rcu_read_lock(); - if (ioc->aic && ioc->aic->dtor) - ioc->aic->dtor(ioc->aic); - rcu_read_unlock(); - cfq_dtor(ioc); - - kmem_cache_free(iocontext_cachep, ioc); - return 1; - } return 0; } -EXPORT_SYMBOL(put_io_context); - -static void cfq_exit(struct io_context *ioc) -{ - struct cfq_io_context *cic[1]; - int r; - - rcu_read_lock(); - /* - * See comment for cfq_dtor() - */ - r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1); - rcu_read_unlock(); - - if (r > 0) - cic[0]->exit(ioc); -} - -/* Called by the exitting task */ -void exit_io_context(void) -{ - struct io_context *ioc; - - task_lock(current); - ioc = current->io_context; - current->io_context = NULL; - task_unlock(current); - - if (atomic_dec_and_test(&ioc->nr_tasks)) { - if (ioc->aic && ioc->aic->exit) - ioc->aic->exit(ioc->aic); - cfq_exit(ioc); - - put_io_context(ioc); - } -} - -struct io_context *alloc_io_context(gfp_t gfp_flags, int node) -{ - struct io_context *ret; - - ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); - if (ret) { - atomic_set(&ret->refcount, 1); - atomic_set(&ret->nr_tasks, 1); - spin_lock_init(&ret->lock); - ret->ioprio_changed = 0; - ret->ioprio = 0; - ret->last_waited = jiffies; /* doesn't matter... */ - ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; - INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); - ret->ioc_data = NULL; - } - - return ret; -} - -/* - * If the current task has no IO context then create one and initialise it. - * Otherwise, return its existing IO context. - * - * This returned IO context doesn't have a specifically elevated refcount, - * but since the current task itself holds a reference, the context can be - * used in general code, so long as it stays within `current` context. - */ -static struct io_context *current_io_context(gfp_t gfp_flags, int node) -{ - struct task_struct *tsk = current; - struct io_context *ret; - - ret = tsk->io_context; - if (likely(ret)) - return ret; - - ret = alloc_io_context(gfp_flags, node); - if (ret) { - /* make sure set_task_ioprio() sees the settings above */ - smp_wmb(); - tsk->io_context = ret; - } - - return ret; -} - -/* - * If the current task has no IO context then create one and initialise it. - * If it does have a context, take a ref on it. - * - * This is always called in the context of the task which submitted the I/O. - */ -struct io_context *get_io_context(gfp_t gfp_flags, int node) -{ - struct io_context *ret = NULL; - - /* - * Check for unlikely race with exiting task. ioc ref count is - * zero when ioc is being detached. - */ - do { - ret = current_io_context(gfp_flags, node); - if (unlikely(!ret)) - break; - } while (!atomic_inc_not_zero(&ret->refcount)); - - return ret; -} -EXPORT_SYMBOL(get_io_context); - -void copy_io_context(struct io_context **pdst, struct io_context **psrc) -{ - struct io_context *src = *psrc; - struct io_context *dst = *pdst; - - if (src) { - BUG_ON(atomic_read(&src->refcount) == 0); - atomic_inc(&src->refcount); - put_io_context(dst); - *pdst = src; - } -} -EXPORT_SYMBOL(copy_io_context); - -void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) -{ - struct io_context *temp; - temp = *ioc1; - *ioc1 = *ioc2; - *ioc2 = temp; -} -EXPORT_SYMBOL(swap_io_context);