X-Git-Url: http://pilppa.org/gitweb/gitweb.cgi?a=blobdiff_plain;f=Documentation%2Flguest%2Flguest.c;h=0f23d67f958ff5b6ee96a0248fdd2c9b9ab65586;hb=c5974932c1e8514d3478573bb52beebeb2c786dd;hp=f2668390e8f773276357917e79625790d4e98b5a;hpb=4bc3e17cce5662092703b02ee7b030047b1c99b3;p=linux-2.6-omap-h63xx.git diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index f2668390e8f..0f23d67f958 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -34,6 +34,8 @@ #include #include #include +#include +#include #include "linux/lguest_launcher.h" #include "linux/virtio_config.h" #include "linux/virtio_net.h" @@ -62,8 +64,8 @@ typedef uint8_t u8; #endif /* We can have up to 256 pages for devices. */ #define DEVICE_PAGES 256 -/* This fits nicely in a single 4096-byte page. */ -#define VIRTQUEUE_NUM 127 +/* This will occupy 2 pages: it must be a power of 2. */ +#define VIRTQUEUE_NUM 128 /*L:120 verbose is both a global flag and a macro. The C preprocessor allows * this, and although I wouldn't recommend it, it works quite nicely here. */ @@ -79,6 +81,9 @@ static void *guest_base; /* The maximum guest physical address allowed, and maximum possible. */ static unsigned long guest_limit, guest_max; +/* a per-cpu variable indicating whose vcpu is currently running */ +static unsigned int __thread cpu_id; + /* This is our list of devices. */ struct device_list { @@ -96,13 +101,11 @@ struct device_list /* The descriptor page for the devices. */ u8 *descpage; - /* The tail of the last descriptor. */ - unsigned int desc_used; - /* A single linked list of devices. */ struct device *dev; - /* ... And an end pointer so we can easily append new devices */ - struct device **lastdev; + /* And a pointer to the last device for easy append and also for + * configuration appending. */ + struct device *lastdev; }; /* The list of Guest devices, based on command line arguments. */ @@ -153,6 +156,9 @@ struct virtqueue void (*handle_output)(int fd, struct virtqueue *me); }; +/* Remember the arguments to the program so we can "reboot" */ +static char **main_args; + /* Since guest is UP and we don't run at the same time, we don't need barriers. * But I include them in the code in case others copy it. */ #define wmb() @@ -185,7 +191,14 @@ static void *_convert(struct iovec *iov, size_t size, size_t align, #define cpu_to_le64(v64) (v64) #define le16_to_cpu(v16) (v16) #define le32_to_cpu(v32) (v32) -#define le64_to_cpu(v32) (v64) +#define le64_to_cpu(v64) (v64) + +/* The device virtqueue descriptors are followed by feature bitmasks. */ +static u8 *get_feature_bits(struct device *dev) +{ + return (u8 *)(dev->desc + 1) + + dev->desc->num_vq * sizeof(struct lguest_vqconfig); +} /*L:100 The Launcher code itself takes us out into userspace, that scary place * where pointers run wild and free! Unfortunately, like most userspace @@ -554,7 +567,7 @@ static void wake_parent(int pipefd, int lguest_fd) else FD_CLR(-fd - 1, &devices.infds); } else /* Send LHREQ_BREAK command. */ - write(lguest_fd, args, sizeof(args)); + pwrite(lguest_fd, args, sizeof(args), cpu_id); } } @@ -908,21 +921,58 @@ static void enable_fd(int fd, struct virtqueue *vq) write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); } +/* Resetting a device is fairly easy. */ +static void reset_device(struct device *dev) +{ + struct virtqueue *vq; + + verbose("Resetting device %s\n", dev->name); + /* Clear the status. */ + dev->desc->status = 0; + + /* Clear any features they've acked. */ + memset(get_feature_bits(dev) + dev->desc->feature_len, 0, + dev->desc->feature_len); + + /* Zero out the virtqueues. */ + for (vq = dev->vq; vq; vq = vq->next) { + memset(vq->vring.desc, 0, + vring_size(vq->config.num, getpagesize())); + vq->last_avail_idx = 0; + } +} + /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ static void handle_output(int fd, unsigned long addr) { struct device *i; struct virtqueue *vq; - /* Check each virtqueue. */ + /* Check each device and virtqueue. */ for (i = devices.dev; i; i = i->next) { + /* Notifications to device descriptors reset the device. */ + if (from_guest_phys(addr) == i->desc) { + reset_device(i); + return; + } + + /* Notifications to virtqueues mean output has occurred. */ for (vq = i->vq; vq; vq = vq->next) { - if (vq->config.pfn == addr/getpagesize() - && vq->handle_output) { - verbose("Output to %s\n", vq->dev->name); - vq->handle_output(fd, vq); + if (vq->config.pfn != addr/getpagesize()) + continue; + + /* Guest should acknowledge (and set features!) before + * using the device. */ + if (i->desc->status == 0) { + warnx("%s gave early output", i->name); return; } + + if (strcmp(vq->dev->name, "console") != 0) + verbose("Output to %s\n", vq->dev->name); + if (vq->handle_output) + vq->handle_output(fd, vq); + return; } } @@ -980,54 +1030,44 @@ static void handle_input(int fd) * * All devices need a descriptor so the Guest knows it exists, and a "struct * device" so the Launcher can keep track of it. We have common helper - * routines to allocate them. - * - * This routine allocates a new "struct lguest_device_desc" from descriptor - * table just above the Guest's normal memory. It returns a pointer to that - * descriptor. */ -static struct lguest_device_desc *new_dev_desc(u16 type) -{ - struct lguest_device_desc *d; - - /* We only have one page for all the descriptors. */ - if (devices.desc_used + sizeof(*d) > getpagesize()) - errx(1, "Too many devices"); + * routines to allocate and manage them. */ - /* We don't need to set config_len or status: page is 0 already. */ - d = (void *)devices.descpage + devices.desc_used; - d->type = type; - devices.desc_used += sizeof(*d); - - return d; +/* The layout of the device page is a "struct lguest_device_desc" followed by a + * number of virtqueue descriptors, then two sets of feature bits, then an + * array of configuration bytes. This routine returns the configuration + * pointer. */ +static u8 *device_config(const struct device *dev) +{ + return (void *)(dev->desc + 1) + + dev->desc->num_vq * sizeof(struct lguest_vqconfig) + + dev->desc->feature_len * 2; } -/* Each device descriptor is followed by some configuration information. - * Each configuration field looks like: u8 type, u8 len, [... len bytes...]. - * - * This routine adds a new field to an existing device's descriptor. It only - * works for the last device, but that's OK because that's how we use it. */ -static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c) +/* This routine allocates a new "struct lguest_device_desc" from descriptor + * table page just above the Guest's normal memory. It returns a pointer to + * that descriptor. */ +static struct lguest_device_desc *new_dev_desc(u16 type) { - /* This is the last descriptor, right? */ - assert(devices.descpage + devices.desc_used - == (u8 *)(dev->desc + 1) + dev->desc->config_len); + struct lguest_device_desc d = { .type = type }; + void *p; - /* We only have one page of device descriptions. */ - if (devices.desc_used + 2 + len > getpagesize()) - errx(1, "Too many devices"); + /* Figure out where the next device config is, based on the last one. */ + if (devices.lastdev) + p = device_config(devices.lastdev) + + devices.lastdev->desc->config_len; + else + p = devices.descpage; - /* Copy in the new config header: type then length. */ - devices.descpage[devices.desc_used++] = type; - devices.descpage[devices.desc_used++] = len; - memcpy(devices.descpage + devices.desc_used, c, len); - devices.desc_used += len; + /* We only have one page for all the descriptors. */ + if (p + sizeof(d) > (void *)devices.descpage + getpagesize()) + errx(1, "Too many devices"); - /* Update the device descriptor length: two byte head then data. */ - dev->desc->config_len += 2 + len; + /* p might not be aligned, so we memcpy in. */ + return memcpy(p, &d, sizeof(d)); } -/* This routine adds a virtqueue to a device. We specify how many descriptors - * the virtqueue is to have. */ +/* Each device descriptor is followed by the description of its virtqueues. We + * specify how many descriptors the virtqueue is to have. */ static void add_virtqueue(struct device *dev, unsigned int num_descs, void (*handle_output)(int fd, struct virtqueue *me)) { @@ -1036,38 +1076,77 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, void *p; /* First we need some pages for this virtqueue. */ - pages = (vring_size(num_descs) + getpagesize() - 1) / getpagesize(); + pages = (vring_size(num_descs, getpagesize()) + getpagesize() - 1) + / getpagesize(); p = get_pages(pages); + /* Initialize the virtqueue */ + vq->next = NULL; + vq->last_avail_idx = 0; + vq->dev = dev; + /* Initialize the configuration. */ vq->config.num = num_descs; vq->config.irq = devices.next_irq++; vq->config.pfn = to_guest_phys(p) / getpagesize(); /* Initialize the vring. */ - vring_init(&vq->vring, num_descs, p); + vring_init(&vq->vring, num_descs, p, getpagesize()); + + /* Append virtqueue to this device's descriptor. We use + * device_config() to get the end of the device's current virtqueues; + * we check that we haven't added any config or feature information + * yet, otherwise we'd be overwriting them. */ + assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); + memcpy(device_config(dev), &vq->config, sizeof(vq->config)); + dev->desc->num_vq++; - /* Add the configuration information to this device's descriptor. */ - add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE, - sizeof(vq->config), &vq->config); + verbose("Virtqueue page %#lx\n", to_guest_phys(p)); /* Add to tail of list, so dev->vq is first vq, dev->vq->next is * second. */ for (i = &dev->vq; *i; i = &(*i)->next); *i = vq; - /* Link virtqueue back to device. */ - vq->dev = dev; - /* Set the routine to call when the Guest does something to this * virtqueue. */ vq->handle_output = handle_output; - /* Set the "Don't Notify Me" flag if we don't have a handler */ + /* As an optimization, set the advisory "Don't Notify Me" flag if we + * don't have a handler */ if (!handle_output) vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; } +/* The first half of the feature bitmask is for us to advertise features. The + * second half if for the Guest to accept features. */ +static void add_feature(struct device *dev, unsigned bit) +{ + u8 *features = get_feature_bits(dev); + + /* We can't extend the feature bits once we've added config bytes */ + if (dev->desc->feature_len <= bit / CHAR_BIT) { + assert(dev->desc->config_len == 0); + dev->desc->feature_len = (bit / CHAR_BIT) + 1; + } + + features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); +} + +/* This routine sets the configuration fields for an existing device's + * descriptor. It only works for the last device, but that's OK because that's + * how we use it. */ +static void set_config(struct device *dev, unsigned len, const void *conf) +{ + /* Check we haven't overflowed our single page. */ + if (device_config(dev) + len > devices.descpage + getpagesize()) + errx(1, "Too many devices"); + + /* Copy in the config information, and store the length. */ + memcpy(device_config(dev), conf, len); + dev->desc->config_len = len; +} + /* This routine does all the creation and setup of a new device, including * calling new_dev_desc() to allocate the descriptor and device memory. */ static struct device *new_device(const char *name, u16 type, int fd, @@ -1075,14 +1154,6 @@ static struct device *new_device(const char *name, u16 type, int fd, { struct device *dev = malloc(sizeof(*dev)); - /* Append to device list. Prepending to a single-linked list is - * easier, but the user expects the devices to be arranged on the bus - * in command-line order. The first network device on the command line - * is eth0, the first block device /dev/vda, etc. */ - *devices.lastdev = dev; - dev->next = NULL; - devices.lastdev = &dev->next; - /* Now we populate the fields one at a time. */ dev->fd = fd; /* If we have an input handler for this file descriptor, then we add it @@ -1092,6 +1163,18 @@ static struct device *new_device(const char *name, u16 type, int fd, dev->desc = new_dev_desc(type); dev->handle_input = handle_input; dev->name = name; + dev->vq = NULL; + + /* Append to device list. Prepending to a single-linked list is + * easier, but the user expects the devices to be arranged on the bus + * in command-line order. The first network device on the command line + * is eth0, the first block device /dev/vda, etc. */ + if (devices.lastdev) + devices.lastdev->next = dev; + else + devices.dev = dev; + devices.lastdev = dev; + return dev; } @@ -1216,7 +1299,7 @@ static void setup_tun_net(const char *arg) int netfd, ipfd; u32 ip; const char *br_name = NULL; - u8 hwaddr[6]; + struct virtio_net_config conf; /* We open the /dev/net/tun device and tell it we want a tap device. A * tap device is like a tun device, only somehow different. To tell @@ -1255,12 +1338,13 @@ static void setup_tun_net(const char *arg) ip = str2ip(arg); /* Set up the tun device, and get the mac address for the interface. */ - configure_device(ipfd, ifr.ifr_name, ip, hwaddr); + configure_device(ipfd, ifr.ifr_name, ip, conf.mac); /* Tell Guest what MAC address to use. */ - add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr); + add_feature(dev, VIRTIO_NET_F_MAC); + set_config(dev, sizeof(conf), &conf); - /* We don't seed the socket any more; setup is done. */ + /* We don't need the socket any more; setup is done. */ close(ipfd); verbose("device %u: tun net %u.%u.%u.%u\n", @@ -1342,7 +1426,7 @@ static bool service_io(struct device *dev) if (out->type & VIRTIO_BLK_T_SCSI_CMD) { fprintf(stderr, "Scsi commands unsupported\n"); in->status = VIRTIO_BLK_S_UNSUPP; - wlen = sizeof(in); + wlen = sizeof(*in); } else if (out->type & VIRTIO_BLK_T_OUT) { /* Write */ @@ -1363,7 +1447,7 @@ static bool service_io(struct device *dev) /* Die, bad Guest, die. */ errx(1, "Write past end %llu+%u", off, ret); } - wlen = sizeof(in); + wlen = sizeof(*in); in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); } else { /* Read */ @@ -1376,10 +1460,10 @@ static bool service_io(struct device *dev) ret = readv(vblk->fd, iov+1, in_num-1); verbose("READ from sector %llu: %i\n", out->sector, ret); if (ret >= 0) { - wlen = sizeof(in) + ret; + wlen = sizeof(*in) + ret; in->status = VIRTIO_BLK_S_OK; } else { - wlen = sizeof(in); + wlen = sizeof(*in); in->status = VIRTIO_BLK_S_IOERR; } } @@ -1448,8 +1532,7 @@ static void setup_block_file(const char *filename) struct device *dev; struct vblk_info *vblk; void *stack; - u64 cap; - unsigned int val; + struct virtio_blk_config conf; /* This is the pipe the I/O thread will use to tell us I/O is done. */ pipe(p); @@ -1467,14 +1550,18 @@ static void setup_block_file(const char *filename) vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); vblk->len = lseek64(vblk->fd, 0, SEEK_END); + /* We support barriers. */ + add_feature(dev, VIRTIO_BLK_F_BARRIER); + /* Tell Guest how many sectors this device has. */ - cap = cpu_to_le64(vblk->len / 512); - add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap); + conf.capacity = cpu_to_le64(vblk->len / 512); /* Tell Guest not to put in too many descriptors at once: two are used * for the in and out elements. */ - val = cpu_to_le32(VIRTQUEUE_NUM - 2); - add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val); + add_feature(dev, VIRTIO_BLK_F_SEG_MAX); + conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); + + set_config(dev, sizeof(conf), &conf); /* The I/O thread writes to this end of the pipe when done. */ vblk->done_fd = p[1]; @@ -1485,7 +1572,9 @@ static void setup_block_file(const char *filename) /* Create stack for thread and run it */ stack = malloc(32768); - if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1) + /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from + * becoming a zombie. */ + if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) err(1, "Creating clone"); /* We don't need to keep the I/O thread's end of the pipes open. */ @@ -1493,9 +1582,23 @@ static void setup_block_file(const char *filename) close(vblk->workpipe[0]); verbose("device %u: virtblock %llu sectors\n", - devices.device_num, cap); + devices.device_num, le64_to_cpu(conf.capacity)); +} +/* That's the end of device setup. :*/ + +/* Reboot */ +static void __attribute__((noreturn)) restart_guest(void) +{ + unsigned int i; + + /* Closing pipes causes the waker thread and io_threads to die, and + * closing /dev/lguest cleans up the Guest. Since we don't track all + * open fds, we simply close everything beyond stderr. */ + for (i = 3; i < FD_SETSIZE; i++) + close(i); + execv(main_args[0], main_args); + err(1, "Could not exec %s", main_args[0]); } -/* That's the end of device setup. */ /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves * its input and output, and finally, lays it to rest. */ @@ -1507,7 +1610,8 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) int readval; /* We read from the /dev/lguest device to run the Guest. */ - readval = read(lguest_fd, ¬ify_addr, sizeof(notify_addr)); + readval = pread(lguest_fd, ¬ify_addr, + sizeof(notify_addr), cpu_id); /* One unsigned long means the Guest did HCALL_NOTIFY */ if (readval == sizeof(notify_addr)) { @@ -1517,16 +1621,23 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) /* ENOENT means the Guest died. Reading tells us why. */ } else if (errno == ENOENT) { char reason[1024] = { 0 }; - read(lguest_fd, reason, sizeof(reason)-1); + pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); errx(1, "%s", reason); + /* ERESTART means that we need to reboot the guest */ + } else if (errno == ERESTART) { + restart_guest(); /* EAGAIN means the Waker wanted us to look at some input. * Anything else means a bug or incompatible change. */ } else if (errno != EAGAIN) err(1, "Running guest failed"); + /* Only service input on thread for CPU 0. */ + if (cpu_id != 0) + continue; + /* Service input, then unset the BREAK to release the Waker. */ handle_input(lguest_fd); - if (write(lguest_fd, args, sizeof(args)) < 0) + if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) err(1, "Resetting break"); } } @@ -1567,17 +1678,24 @@ int main(int argc, char *argv[]) /* If they specify an initrd file to load. */ const char *initrd_name = NULL; + /* Save the args: we "reboot" by execing ourselves again. */ + main_args = argv; + /* We don't "wait" for the children, so prevent them from becoming + * zombies. */ + signal(SIGCHLD, SIG_IGN); + /* First we initialize the device list. Since console and network * device receive input from a file descriptor, we keep an fdset * (infds) and the maximum fd number (max_infd) with the head of the - * list. We also keep a pointer to the last device, for easy appending - * to the list. Finally, we keep the next interrupt number to hand out - * (1: remember that 0 is used by the timer). */ + * list. We also keep a pointer to the last device. Finally, we keep + * the next interrupt number to hand out (1: remember that 0 is used by + * the timer). */ FD_ZERO(&devices.infds); devices.max_infd = -1; - devices.lastdev = &devices.dev; + devices.lastdev = NULL; devices.next_irq = 1; + cpu_id = 0; /* We need to know how much memory so we can set up the device * descriptor and memory pages for the devices as we parse the command * line. So we quickly look through the arguments to find the amount