Merge master.kernel.org:/pub/scm/linux/kernel/git/davej/cpufreq

[linux-2.6-omap-h63xx.git] / drivers / block / lguest_blk.c
diff --git a/drivers/block/lguest_blk.c b/drivers/block/lguest_blk.c

index 1634c2dd25ec5780d5bcd7b8adbf050987997d06..fa8e42341b87ba9f938b1b67f9b2a5b301c9f54c 100644 (file)
--- a/drivers/block/lguest_blk.c
+++ b/drivers/block/lguest_blk.c
@@ -1,6 +1,12 @@
-/* A simple block driver for lguest.
+/*D:400
+ * The Guest block driver
   *
- * Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
+ * The mechanism is simple: we place the information about the request in the
+ * device page, then use SEND_DMA (containing the data for a write, or an empty
+ * "ping" DMA for a read).
+ :*/
+/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -25,27 +31,50 @@
  
  static char next_block_index = 'a';
  
+/*D:420 Here is the structure which holds all the information we need about
+ * each Guest block device.
+ *
+ * I'm sure at this stage, you're wondering "hey, where was the adventure I was
+ * promised?" and thinking "Rusty sucks, I shall say nasty things about him on
+ * my blog".  I think Real adventures have boring bits, too, and you're in the
+ * middle of one.  But it gets better.  Just not quite yet. */
  struct blockdev
  {
+       /* The block queue infrastructure wants a spinlock: it is held while it
+        * calls our block request function.  We grab it in our interrupt
+        * handler so the responses don't mess with new requests. */
         spinlock_t lock;
  
-       /* The disk structure for the kernel. */
+       /* The disk structure registered with the kernel. */
         struct gendisk *disk;
  
-       /* The major number for this disk. */
+       /* The major device number for this disk, and the interrupt.  We only
+        * really keep them here for completeness; we'd need them if we
+        * supported device unplugging. */
         int major;
         int irq;
  
+       /* The physical address of this device's memory page */
         unsigned long phys_addr;
-       /* The mapped block page. */
+       /* The mapped memory page for convenient access. */
         struct lguest_block_page *lb_page;
  
-       /* We only have a single request outstanding at a time. */
+       /* We only have a single request outstanding at a time: this is it. */
         struct lguest_dma dma;
         struct request *req;
  };
  
-/* Jens gave me this nice helper to end all chunks of a request. */
+/*D:495 We originally used end_request() throughout the driver, but it turns
+ * out that end_request() is deprecated, and doesn't actually end the request
+ * (which seems like a good reason to deprecate it!).  It simply ends the first
+ * bio.  So if we had 3 bios in a "struct request" we would do all 3,
+ * end_request(), do 2, end_request(), do 1 and end_request(): twice as much
+ * work as we needed to do.
+ *
+ * This reinforced to me that I do not understand the block layer.
+ *
+ * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
+ * request.  This improved disk speed by 130%. */
  static void end_entire_request(struct request *req, int uptodate)
  {
         if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
@@ -55,67 +84,115 @@ static void end_entire_request(struct request *req, int uptodate)
         end_that_request_last(req, uptodate);
  }
  
+/* I'm told there are only two stories in the world worth telling: love and
+ * hate.  So there used to be a love scene here like this:
+ *
+ *  Launcher:  We could make beautiful I/O together, you and I.
+ *  Guest:     My, that's a big disk!
+ *
+ * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */
+
+/*D:490 This is the interrupt handler, called when a block read or write has
+ * been completed for us. */
  static irqreturn_t lgb_irq(int irq, void *_bd)
  {
+       /* We handed our "struct blockdev" as the argument to request_irq(), so
+        * it is passed through to us here.  This tells us which device we're
+        * dealing with in case we have more than one. */
         struct blockdev *bd = _bd;
         unsigned long flags;
  
+       /* We weren't doing anything?  Strange, but could happen if we shared
+        * interrupts (we don't!). */
         if (!bd->req) {
                 pr_debug("No work!\n");
                 return IRQ_NONE;
         }
  
+       /* Not done yet?  That's equally strange. */
         if (!bd->lb_page->result) {
                 pr_debug("No result!\n");
                 return IRQ_NONE;
         }
  
+       /* We have to grab the lock before ending the request. */
         spin_lock_irqsave(&bd->lock, flags);
+       /* "result" is 1 for success, 2 for failure: end_entire_request() wants
+        * to know whether this succeeded or not. */
         end_entire_request(bd->req, bd->lb_page->result == 1);
+       /* Clear out request, it's done. */
         bd->req = NULL;
+       /* Reset incoming DMA for next time. */
         bd->dma.used_len = 0;
+       /* Ready for more reads or writes */
         blk_start_queue(bd->disk->queue);
         spin_unlock_irqrestore(&bd->lock, flags);
+
+       /* The interrupt was for us, we dealt with it. */
         return IRQ_HANDLED;
  }
  
+/*D:480 The block layer's "struct request" contains a number of "struct bio"s,
+ * each of which contains "struct bio_vec"s, each of which contains a page, an
+ * offset and a length.
+ *
+ * Fortunately there are iterators to help us walk through the "struct
+ * request".  Even more fortunately, there were plenty of places to steal the
+ * code from.  We pack the "struct request" into our "struct lguest_dma" and
+ * return the total length. */
  static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
  {
-       unsigned int i = 0, idx, len = 0;
-       struct bio *bio;
-
-       rq_for_each_bio(bio, req) {
-               struct bio_vec *bvec;
-               bio_for_each_segment(bvec, bio, idx) {
-                       BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
-                       BUG_ON(!bvec->bv_len);
-                       dma->addr[i] = page_to_phys(bvec->bv_page)
-                               + bvec->bv_offset;
-                       dma->len[i] = bvec->bv_len;
-                       len += bvec->bv_len;
-                       i++;
-               }
+       unsigned int i = 0, len = 0;
+       struct req_iterator iter;
+       struct bio_vec *bvec;
+
+       rq_for_each_segment(bvec, req, iter) {
+               /* We told the block layer not to give us too many. */
+               BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
+               /* If we had a zero-length segment, it would look like
+                * the end of the data referred to by the "struct
+                * lguest_dma", so make sure that doesn't happen. */
+               BUG_ON(!bvec->bv_len);
+               /* Convert page & offset to a physical address */
+               dma->addr[i] = page_to_phys(bvec->bv_page)
+                       + bvec->bv_offset;
+               dma->len[i] = bvec->bv_len;
+               len += bvec->bv_len;
+               i++;
         }
+       /* If the array isn't full, we mark the end with a 0 length */
         if (i < LGUEST_MAX_DMA_SECTIONS)
                 dma->len[i] = 0;
         return len;
  }
  
+/* This creates an empty DMA, useful for prodding the Host without sending data
+ * (ie. when we want to do a read) */
  static void empty_dma(struct lguest_dma *dma)
  {
         dma->len[0] = 0;
  }
  
+/*D:470 Setting up a request is fairly easy: */
  static void setup_req(struct blockdev *bd,
                       int type, struct request *req, struct lguest_dma *dma)
  {
+       /* The type is 1 (write) or 0 (read). */
         bd->lb_page->type = type;
+       /* The sector on disk where the read or write starts. */
         bd->lb_page->sector = req->sector;
+       /* The result is initialized to 0 (unfinished). */
         bd->lb_page->result = 0;
+       /* The current request (so we can end it in the interrupt handler). */
         bd->req = req;
+       /* The number of bytes: returned as a side-effect of req_to_dma(),
+        * which packs the block layer's "struct request" into our "struct
+        * lguest_dma" */
         bd->lb_page->bytes = req_to_dma(req, dma);
  }
  
+/*D:450 Write is pretty straightforward: we pack the request into a "struct
+ * lguest_dma", then use SEND_DMA to send the request. */
  static void do_write(struct blockdev *bd, struct request *req)
  {
         struct lguest_dma send;
@@ -126,6 +203,9 @@ static void do_write(struct blockdev *bd, struct request *req)
         lguest_send_dma(bd->phys_addr, &send);
  }
  
+/* Read is similar to write, except we pack the request into our receive
+ * "struct lguest_dma" and send through an empty DMA just to tell the Host that
+ * there's a request pending. */
  static void do_read(struct blockdev *bd, struct request *req)
  {
         struct lguest_dma ping;
@@ -137,21 +217,30 @@ static void do_read(struct blockdev *bd, struct request *req)
         lguest_send_dma(bd->phys_addr, &ping);
  }
  
-static void do_lgb_request(request_queue_t *q)
+/*D:440 This is where requests come in: we get handed the request queue and are
+ * expected to pull a "struct request" off it until we've finished them or
+ * we're waiting for a reply: */
+static void do_lgb_request(struct request_queue *q)
  {
         struct blockdev *bd;
         struct request *req;
  
  again:
+       /* This sometimes returns NULL even on the very first time around.  I
+        * wonder if it's something to do with letting elves handle the request
+        * queue... */
         req = elv_next_request(q);
         if (!req)
                 return;
  
+       /* We attached the struct blockdev to the disk: get it back */
         bd = req->rq_disk->private_data;
-       /* Sometimes we get repeated requests after blk_stop_queue. */
+       /* Sometimes we get repeated requests after blk_stop_queue(), but we
+        * can only handle one at a time. */
         if (bd->req)
                 return;
  
+       /* We only do reads and writes: no tricky business! */
         if (!blk_fs_request(req)) {
                 pr_debug("Got non-command 0x%08x\n", req->cmd_type);
                 req->errors++;
@@ -164,20 +253,31 @@ again:
         else
                 do_read(bd, req);
  
-       /* Wait for interrupt to tell us it's done. */
+       /* We've put out the request, so stop any more coming in until we get
+        * an interrupt, which takes us to lgb_irq() to re-enable the queue. */
         blk_stop_queue(q);
  }
  
+/*D:430 This is the "struct block_device_operations" we attach to the disk at
+ * the end of lguestblk_probe().  It doesn't seem to want much. */
  static struct block_device_operations lguestblk_fops = {
         .owner = THIS_MODULE,
  };
  
+/*D:425 Setting up a disk device seems to involve a lot of code.  I'm not sure
+ * quite why.  I do know that the IDE code sent two or three of the maintainers
+ * insane, perhaps this is the fringe of the same disease?
+ *
+ * As in the console code, the probe function gets handed the generic
+ * lguest_device from lguest_bus.c: */
  static int lguestblk_probe(struct lguest_device *lgdev)
  {
         struct blockdev *bd;
         int err;
         int irqflags = IRQF_SHARED;
  
+       /* First we allocate our own "struct blockdev" and initialize the easy
+        * fields. */
         bd = kmalloc(sizeof(*bd), GFP_KERNEL);
         if (!bd)
                 return -ENOMEM;
@@ -187,59 +287,103 @@ static int lguestblk_probe(struct lguest_device *lgdev)
         bd->req = NULL;
         bd->dma.used_len = 0;
         bd->dma.len[0] = 0;
+       /* The descriptor in the lguest_devices array provided by the Host
+        * gives the Guest the physical page number of the device's page. */
         bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);
  
+       /* We use lguest_map() to get a pointer to the device page */
         bd->lb_page = lguest_map(bd->phys_addr, 1);
         if (!bd->lb_page) {
                 err = -ENOMEM;
                 goto out_free_bd;
         }
  
+       /* We need a major device number: 0 means "assign one dynamically". */
         bd->major = register_blkdev(0, "lguestblk");
         if (bd->major < 0) {
                 err = bd->major;
                 goto out_unmap;
         }
  
-       bd->disk = alloc_disk(1);
+       /* This allocates a "struct gendisk" where we pack all the information
+        * about the disk which the rest of Linux sees.  The argument is the
+        * number of minor devices desired: we need one minor for the main
+        * disk, and one for each partition.  Of course, we can't possibly know
+        * how many partitions are on the disk (add_disk does that).
+        */
+       bd->disk = alloc_disk(16);
         if (!bd->disk) {
                 err = -ENOMEM;
                 goto out_unregister_blkdev;
         }
  
+       /* Every disk needs a queue for requests to come in: we set up the
+        * queue with a callback function (the core of our driver) and the lock
+        * to use. */
         bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
         if (!bd->disk->queue) {
                 err = -ENOMEM;
                 goto out_put_disk;
         }
  
-       /* We can only handle a certain number of sg entries */
+       /* We can only handle a certain number of pointers in our SEND_DMA
+        * call, so we set that with blk_queue_max_hw_segments().  This is not
+        * to be confused with blk_queue_max_phys_segments() of course!  I
+        * know, who could possibly confuse the two?
+        *
+        * Well, it's simple to tell them apart: this one seems to work and the
+        * other one didn't. */
         blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);
-       /* Buffers must not cross page boundaries */
+
+       /* Due to technical limitations of our Host (and simple coding) we
+        * can't have a single buffer which crosses a page boundary.  Tell it
+        * here.  This means that our maximum request size is 16
+        * (LGUEST_MAX_DMA_SECTIONS) pages. */
         blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);
  
+       /* We name our disk: this becomes the device name when udev does its
+        * magic thing and creates the device node, such as /dev/lgba.
+        * next_block_index is a global which starts at 'a'.  Unfortunately
+        * this simple increment logic means that the 27th disk will be called
+        * "/dev/lgb{".  In that case, I recommend having at least 29 disks, so
+        * your /dev directory will be balanced. */
         sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);
+
+       /* We look to the device descriptor again to see if this device's
+        * interrupts are expected to be random.  If they are, we tell the irq
+        * subsystem.  At the moment this bit is always set. */
         if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
                 irqflags |= IRQF_SAMPLE_RANDOM;
+
+       /* Now we have the name and irqflags, we can request the interrupt; we
+        * give it the "struct blockdev" we have set up to pass to lgb_irq()
+        * when there is an interrupt. */
         err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
         if (err)
                 goto out_cleanup_queue;
  
+       /* We bind our one-entry DMA pool to the key for this block device so
+        * the Host can reply to our requests.  The key is equal to the
+        * physical address of the device's page, which is conveniently
+        * unique. */
         err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq);
         if (err)
                 goto out_free_irq;
  
+       /* We finish our disk initialization and add the disk to the system. */
         bd->disk->major = bd->major;
         bd->disk->first_minor = 0;
         bd->disk->private_data = bd;
         bd->disk->fops = &lguestblk_fops;
-       /* This is initialized to the disk size by the other end. */
+       /* This is initialized to the disk size by the Launcher. */
         set_capacity(bd->disk, bd->lb_page->num_sectors);
         add_disk(bd->disk);
  
         printk(KERN_INFO "%s: device %i at major %d\n",
                bd->disk->disk_name, lgdev->index, bd->major);
  
+       /* We don't need to keep the "struct blockdev" around, but if we ever
+        * implemented device removal, we'd need this. */
         lgdev->private = bd;
         return 0;
  
@@ -258,6 +402,8 @@ out_free_bd:
         return err;
  }
  
+/*D:410 The boilerplate code for registering the lguest block driver is just
+ * like the console: */
  static struct lguest_driver lguestblk_drv = {
         .name = "lguestblk",
         .owner = THIS_MODULE,