From e0377e25206328998d036cafddcd00a7c3252e3e Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Sun, 26 Jun 2011 19:36:46 +0300 Subject: lguest: Do not exit on non-fatal errors Do not exit on some non-fatal errors: - writev() fails in net_output(). The result is a lost packet or packets. - writev() fails in console_output(). The result is partially lost console output. - readv() fails in net_input(). The result is a lost packet or packets. Rather than bringing the guest down, this patch ignores e.g. an allocation failure on the host side. Example: lguest: page allocation failure. order:4, mode:0x4d0 Pid: 4045, comm: lguest Tainted: G W 2.6.36 #1 Call Trace: [] ? printk+0x18/0x1c [] __alloc_pages_nodemask+0x4d2/0x570 [] cache_alloc_refill+0x2a4/0x4d0 [] ? __netif_receive_skb+0x189/0x270 [] __kmalloc+0xda/0xf0 [] __alloc_skb+0x55/0x100 [] ? net_rx_action+0x79/0x100 [] sock_alloc_send_pskb+0x18d/0x280 [] ? _copy_from_user+0x35/0x130 [] ? memcpy_fromiovecend+0x56/0x80 [] tun_chr_aio_write+0x1cc/0x500 [] do_sync_readv_writev+0x95/0xd0 [] ? _copy_from_user+0x35/0x130 [] ? rw_copy_check_uvector+0x58/0x100 [] do_readv_writev+0x9c/0x1d0 [] ? tun_chr_aio_write+0x0/0x500 [] vfs_writev+0x4a/0x60 [] sys_writev+0x41/0x80 [] syscall_call+0x7/0xb Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 Normal per-cpu: CPU 0: hi: 186, btch: 31 usd: 0 HighMem per-cpu: CPU 0: hi: 186, btch: 31 usd: 0 active_anon:134651 inactive_anon:50543 isolated_anon:0 active_file:96881 inactive_file:132007 isolated_file:0 unevictable:0 dirty:3 writeback:0 unstable:0 free:91374 slab_reclaimable:6300 slab_unreclaimable:2802 mapped:2281 shmem:9 pagetables:330 bounce:0 DMA free:3524kB min:64kB low:80kB high:96kB active_anon:0kB inactive_anon:8kB active_file:8760kB inactive_file:2760kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15868kB mlocked:0kB dirty:0kB writeback:0kB mapped:16kB shmem:0kB slab_reclaimable:88kB slab_unreclaimable:148kB kernel_stack:40kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 865 2016 2016 Normal free:150100kB min:3728kB low:4660kB high:5592kB active_anon:6224kB inactive_anon:15772kB active_file:324084kB inactive_file:325944kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:885944kB mlocked:0kB dirty:12kB writeback:0kB mapped:1520kB shmem:0kB slab_reclaimable:25112kB slab_unreclaimable:11060kB kernel_stack:1888kB pagetables:1320kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 9207 9207 HighMem free:211872kB min:512kB low:1752kB high:2992kB active_anon:532380kB inactive_anon:186392kB active_file:54680kB inactive_file:199324kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:1178504kB mlocked:0kB dirty:0kB writeback:0kB mapped:7588kB shmem:36kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 DMA: 3*4kB 65*8kB 35*16kB 18*32kB 11*64kB 9*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 3524kB Normal: 35981*4kB 344*8kB 158*16kB 28*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 150100kB HighMem: 5732*4kB 5462*8kB 2826*16kB 1598*32kB 84*64kB 10*128kB 7*256kB 1*512kB 1*1024kB 1*2048kB 9*4096kB = 211872kB 231237 total pagecache pages 2340 pages in swap cache Swap cache stats: add 160060, delete 157720, find 189017/194106 Free swap = 4179840kB Total swap = 4194300kB 524271 pages RAM 296946 pages HighMem 5668 pages reserved 867664 pages shared 82155 pages non-shared Signed-off-by: Sakari Ailus Signed-off-by: Rusty Russell --- Documentation/virtual/lguest/lguest.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c index cd9d6af61d07..e3b9bb7a644a 100644 --- a/Documentation/virtual/lguest/lguest.c +++ b/Documentation/virtual/lguest/lguest.c @@ -861,8 +861,10 @@ static void console_output(struct virtqueue *vq) /* writev can return a partial write, so we loop here. */ while (!iov_empty(iov, out)) { int len = writev(STDOUT_FILENO, iov, out); - if (len <= 0) - err(1, "Write to stdout gave %i", len); + if (len <= 0) { + warn("Write to stdout gave %i (%d)", len, errno); + break; + } iov_consume(iov, out, len); } @@ -898,7 +900,7 @@ static void net_output(struct virtqueue *vq) * same format: what a coincidence! */ if (writev(net_info->tunfd, iov, out) < 0) - errx(1, "Write to tun failed?"); + warnx("Write to tun failed (%d)?", errno); /* * Done with that one; wait_for_vq_desc() will send the interrupt if @@ -955,7 +957,7 @@ static void net_input(struct virtqueue *vq) */ len = readv(net_info->tunfd, iov, in); if (len <= 0) - err(1, "Failed to read from tun."); + warn("Failed to read from tun (%d).", errno); /* * Mark that packet buffer as used, but don't interrupt here. We want -- cgit v1.2.3 From 3c3ed482dc077a67903a58c9e1aedba1bb18c18a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 22 Jul 2011 14:39:49 +0930 Subject: lguest: Simplify device initialization. We used to notify the Host every time we updated a device's status. However, it only really needs to know when we're resetting the device, or failed to initialize it, or when we've finished our feature negotiation. In particular, we used to wait for VIRTIO_CONFIG_S_DRIVER_OK in the status byte before starting the device service threads. But this corresponds to the successful finish of device initialization, which might (like virtio_blk's partition scanning) use the device. So we had a hack, if they used the device before we expected we started the threads anyway. Now we hook into the finalize_features hook in the Guest: at that point we tell the Launcher that it can rely on the features we have acked. On the Launcher side, we look at the status at that point, and start servicing the device. Signed-off-by: Rusty Russell --- Documentation/virtual/lguest/lguest.c | 25 ++++++----------------- drivers/lguest/lguest_device.c | 37 +++++++++++++++++++++-------------- 2 files changed, 28 insertions(+), 34 deletions(-) (limited to 'Documentation') diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c index e3b9bb7a644a..80261d34da3d 100644 --- a/Documentation/virtual/lguest/lguest.c +++ b/Documentation/virtual/lguest/lguest.c @@ -1095,9 +1095,10 @@ static void update_device_status(struct device *dev) warnx("Device %s configuration FAILED", dev->name); if (dev->running) reset_device(dev); - } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { - if (!dev->running) - start_device(dev); + } else { + if (dev->running) + err(1, "Device %s features finalized twice", dev->name); + start_device(dev); } } @@ -1122,25 +1123,11 @@ static void handle_output(unsigned long addr) return; } - /* - * Devices *can* be used before status is set to DRIVER_OK. - * The original plan was that they would never do this: they - * would always finish setting up their status bits before - * actually touching the virtqueues. In practice, we allowed - * them to, and they do (eg. the disk probes for partition - * tables as part of initialization). - * - * If we see this, we start the device: once it's running, we - * expect the device to catch all the notifications. - */ + /* Devices should not be used before features are finalized. */ for (vq = i->vq; vq; vq = vq->next) { if (addr != vq->config.pfn*getpagesize()) continue; - if (i->running) - errx(1, "Notification on running %s", i->name); - /* This just calls create_thread() for each virtqueue */ - start_device(i); - return; + errx(1, "Notification on %s before setup!", i->name); } } diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 69c84a1d88ea..5289ffa2e500 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -108,6 +108,17 @@ static u32 lg_get_features(struct virtio_device *vdev) return features; } +/* + * To notify on reset or feature finalization, we (ab)use the NOTIFY + * hypercall, with the descriptor address of the device. + */ +static void status_notify(struct virtio_device *vdev) +{ + unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; + + hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0); +} + /* * The virtio core takes the features the Host offers, and copies the ones * supported by the driver into the vdev->features array. Once that's all @@ -135,6 +146,9 @@ static void lg_finalize_features(struct virtio_device *vdev) if (test_bit(i, vdev->features)) out_features[i / 8] |= (1 << (i % 8)); } + + /* Tell Host we've finished with this device's feature negotiation */ + status_notify(vdev); } /* Once they've found a field, getting a copy of it is easy. */ @@ -168,28 +182,21 @@ static u8 lg_get_status(struct virtio_device *vdev) return to_lgdev(vdev)->desc->status; } -/* - * To notify on status updates, we (ab)use the NOTIFY hypercall, with the - * descriptor address of the device. A zero status means "reset". - */ -static void set_status(struct virtio_device *vdev, u8 status) -{ - unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; - - /* We set the status. */ - to_lgdev(vdev)->desc->status = status; - hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0); -} - static void lg_set_status(struct virtio_device *vdev, u8 status) { BUG_ON(!status); - set_status(vdev, status); + to_lgdev(vdev)->desc->status = status; + + /* Tell Host immediately if we failed. */ + if (status & VIRTIO_CONFIG_S_FAILED) + status_notify(vdev); } static void lg_reset(struct virtio_device *vdev) { - set_status(vdev, 0); + /* 0 status means "reset" */ + to_lgdev(vdev)->desc->status = 0; + status_notify(vdev); } /* -- cgit v1.2.3 From 9f54288def3f92b7805eb6d4b1ddcd73ecf6e889 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 22 Jul 2011 14:39:50 +0930 Subject: lguest: update comments Also removes a long-unused #define and an extraneous semicolon. Signed-off-by: Rusty Russell --- Documentation/virtual/lguest/lguest.c | 12 ++++-------- arch/x86/include/asm/lguest_hcall.h | 1 + arch/x86/lguest/boot.c | 21 +++++++++++++++------ arch/x86/lguest/i386_head.S | 18 +++++++++++------- drivers/lguest/core.c | 2 +- drivers/lguest/interrupts_and_traps.c | 4 ++-- drivers/lguest/lguest_user.c | 17 ++++++++++------- drivers/lguest/page_tables.c | 4 ++-- drivers/lguest/x86/core.c | 10 ++++------ 9 files changed, 50 insertions(+), 39 deletions(-) (limited to 'Documentation') diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c index 80261d34da3d..043bd7df3139 100644 --- a/Documentation/virtual/lguest/lguest.c +++ b/Documentation/virtual/lguest/lguest.c @@ -51,7 +51,7 @@ #include #include "../../../include/linux/lguest_launcher.h" /*L:110 - * We can ignore the 42 include files we need for this program, but I do want + * We can ignore the 43 include files we need for this program, but I do want * to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I @@ -65,7 +65,6 @@ typedef uint16_t u16; typedef uint8_t u8; /*:*/ -#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ #define BRIDGE_PFX "bridge:" #ifndef SIOCBRADDIF #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ @@ -1359,7 +1358,7 @@ static void setup_console(void) * --sharenet= option which opens or creates a named pipe. This can be * used to send packets to another guest in a 1:1 manner. * - * More sopisticated is to use one of the tools developed for project like UML + * More sophisticated is to use one of the tools developed for project like UML * to do networking. * * Faster is to do virtio bonding in kernel. Doing this 1:1 would be @@ -1369,7 +1368,7 @@ static void setup_console(void) * multiple inter-guest channels behind one interface, although it would * require some manner of hotplugging new virtio channels. * - * Finally, we could implement a virtio network switch in the kernel. + * Finally, we could use a virtio network switch in the kernel, ie. vhost. :*/ static u32 str2ip(const char *ipaddr) @@ -2006,10 +2005,7 @@ int main(int argc, char *argv[]) /* Tell the entry path not to try to reload segment registers. */ boot->hdr.loadflags |= KEEP_SEGMENTS; - /* - * We tell the kernel to initialize the Guest: this returns the open - * /dev/lguest file descriptor. - */ + /* We tell the kernel to initialize the Guest. */ tell_kernel(start); /* Ensure that we terminate if a device-servicing child dies. */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index b60f2924c413..879fd7d33877 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -61,6 +61,7 @@ hcall(unsigned long call, : "memory"); return call; } +/*:*/ /* Can't use our min() macro here: needs to be a constant */ #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 719a32c60516..74279907bc1a 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -71,7 +71,8 @@ #include #include /* for struct machine_ops */ -/*G:010 Welcome to the Guest! +/*G:010 + * Welcome to the Guest! * * The Guest in our tale is a simple creature: identical to the Host but * behaving in simplified but equivalent ways. In particular, the Guest is the @@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call, #endif /*G:036 - * When lazy mode is turned off reset the per-cpu lazy mode variable and then - * issue the do-nothing hypercall to flush any stored calls. -:*/ + * When lazy mode is turned off, we issue the do-nothing hypercall to + * flush any stored calls, and call the generic helper to reset the + * per-cpu lazy mode variable. + */ static void lguest_leave_lazy_mmu_mode(void) { hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); paravirt_leave_lazy_mmu(); } +/* + * We also catch the end of context switch; we enter lazy mode for much of + * that too, so again we need to flush here. + * + * (Technically, this is lazy CPU mode, and normally we're in lazy MMU + * mode, but unlike Xen, lguest doesn't care about the difference). + */ static void lguest_end_context_switch(struct task_struct *next) { hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); @@ -640,7 +649,7 @@ static void lguest_write_cr4(unsigned long val) /* * The Guest calls this after it has set a second-level entry (pte), ie. to map - * a page into a process' address space. Wetell the Host the toplevel and + * a page into a process' address space. We tell the Host the toplevel and * address this corresponds to. The Guest uses one pagetable per process, so * we need to tell the Host which one we're changing (mm->pgd). */ @@ -1139,7 +1148,7 @@ static struct notifier_block paniced = { static __init char *lguest_memory_setup(void) { /* - *The Linux bootloader header contains an "e820" memory map: the + * The Linux bootloader header contains an "e820" memory map: the * Launcher populated the first entry with our memory limit. */ e820_add_region(boot_params.e820_map[0].addr, diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index c8c95e575c1e..cfa23e37ec5c 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -6,18 +6,22 @@ #include /*G:020 - * Our story starts with the kernel booting into startup_32 in - * arch/x86/kernel/head_32.S. It expects a boot header, which is created by - * the bootloader (the Launcher in our case). + + * Our story starts with the bzImage: booting starts at startup_32 in + * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real + * kernel in place and then jumps into it: startup_32 in + * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi + * register, which is created by the bootloader (the Launcher in our case). * * The startup_32 function does very little: it clears the uninitialized global * C variables which we expect to be zero (ie. BSS) and then copies the boot - * header and kernel command line somewhere safe. Finally it checks the - * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen: - * if it's set to '1' (lguest's assigned number), then it calls us here. + * header and kernel command line somewhere safe, and populates some initial + * page tables. Finally it checks the 'hardware_subarch' field. This was + * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's + * assigned number), then it calls us here. * * WARNING: be very careful here! We're running at addresses equal to physical - * addesses (around 0), not above PAGE_OFFSET as most code expectes + * addesses (around 0), not above PAGE_OFFSET as most code expects * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any * data without remembering to subtract __PAGE_OFFSET! * diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index efa202499e37..2535933c49f8 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -117,7 +117,7 @@ static __init int map_switcher(void) /* * Now the Switcher is mapped at the right address, we can't fail! - * Copy in the compiled-in Switcher code (from _switcher.S). + * Copy in the compiled-in Switcher code (from x86/switcher_32.S). */ memcpy(switcher_vma->addr, start_switcher_text, end_switcher_text - start_switcher_text); diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index f0c171506371..28433a155d67 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -427,8 +427,8 @@ void pin_stack_pages(struct lg_cpu *cpu) /* * Direct traps also mean that we need to know whenever the Guest wants to use - * a different kernel stack, so we can change the IDT entries to use that - * stack. The IDT entries expect a virtual address, so unlike most addresses + * a different kernel stack, so we can change the guest TSS to use that + * stack. The TSS entries expect a virtual address, so unlike most addresses * the Guest gives us, the "esp" (stack pointer) value here is virtual, not * physical. * diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 948c547b8e9e..f97e625241ad 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,8 +1,10 @@ -/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher - * controls and communicates with the Guest. For example, the first write will - * tell us the Guest's memory layout and entry point. A read will run the - * Guest until something happens, such as a signal or the Guest doing a NOTIFY - * out to the Launcher. +/*P:200 This contains all the /dev/lguest code, whereby the userspace + * launcher controls and communicates with the Guest. For example, + * the first write will tell us the Guest's memory layout and entry + * point. A read will run the Guest until something happens, such as + * a signal or the Guest doing a NOTIFY out to the Launcher. There is + * also a way for the Launcher to attach eventfds to particular NOTIFY + * values instead of returning from the read() call. :*/ #include #include @@ -357,8 +359,8 @@ static int initialize(struct file *file, const unsigned long __user *input) goto free_eventfds; /* - * Initialize the Guest's shadow page tables, using the toplevel - * address the Launcher gave us. This allocates memory, so can fail. + * Initialize the Guest's shadow page tables. This allocates + * memory, so can fail. */ err = init_guest_pagetable(lg); if (err) @@ -516,6 +518,7 @@ static const struct file_operations lguest_fops = { .read = read, .llseek = default_llseek, }; +/*:*/ /* * This is a textbook example of a "misc" character device. Populate a "struct diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 00026222bde8..3b62be160a6e 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -155,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) } /* - * These functions are just like the above two, except they access the Guest + * These functions are just like the above, except they access the Guest * page tables. Hence they return a Guest address. */ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) @@ -195,7 +195,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, #endif /*:*/ -/*M:014 +/*M:007 * get_pfn is slow: we could probably try to grab batches of pages here as * an optimization (ie. pre-faulting). :*/ diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index ec0cdfc04e78..3b9b810cbf28 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -272,7 +272,7 @@ static int emulate_insn(struct lg_cpu *cpu) unsigned int insnlen = 0, in = 0, shift = 0; /* * The eip contains the *virtual* address of the Guest's instruction: - * guest_pa just subtracts the Guest's page_offset. + * walk the Guest's page tables to find the "physical" address. */ unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); @@ -409,7 +409,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) * These values mean a real interrupt occurred, in which case * the Host handler has already been run. We just do a * friendly check if another process should now be run, then - * return to run the Guest again + * return to run the Guest again. */ cond_resched(); return; @@ -459,7 +459,7 @@ void __init lguest_arch_host_init(void) int i; /* - * Most of the i386/switcher.S doesn't care that it's been moved; on + * Most of the x86/switcher_32.S doesn't care that it's been moved; on * Intel, jumps are relative, and it doesn't access any references to * external code or data. * @@ -587,7 +587,7 @@ void __init lguest_arch_host_init(void) clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); } put_online_cpus(); -}; +} /*:*/ void __exit lguest_arch_host_fini(void) @@ -670,8 +670,6 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) /*:*/ /*L:030 - * lguest_arch_setup_regs() - * * Most of the Guest's registers are left alone: we used get_zeroed_page() to * allocate the structure, so they will be 0. */ -- cgit v1.2.3