diff -Nurp pristine-linux-2.6.10-rc3/drivers/Makefile tmp-linux-2.6.10-rc3-xen.patch/drivers/Makefile --- pristine-linux-2.6.10-rc3/drivers/Makefile 2004-12-03 21:55:13.000000000 +0000 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/Makefile 2004-12-08 00:52:40.000000000 +0000 @@ -30,6 +30,7 @@ obj-y += base/ block/ misc/ net/ medi obj-$(CONFIG_NUBUS) += nubus/ obj-$(CONFIG_ATM) += atm/ obj-$(CONFIG_PPC_PMAC) += macintosh/ +obj-$(CONFIG_ARCH_XEN) += xen/ obj-$(CONFIG_IDE) += ide/ obj-$(CONFIG_FC4) += fc4/ obj-$(CONFIG_SCSI) += scsi/ diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/Makefile tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/Makefile --- pristine-linux-2.6.10-rc3/drivers/xen/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/Makefile 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,12 @@ + + +obj-y += console/ +obj-y += evtchn/ +obj-y += balloon/ + +obj-$(CONFIG_XEN_PRIVILEGED_GUEST) += privcmd/ +obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/ +obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/ +obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += blkfront/ +obj-$(CONFIG_XEN_NETDEV_FRONTEND) += netfront/ + diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/balloon/Makefile tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/balloon/Makefile --- pristine-linux-2.6.10-rc3/drivers/xen/balloon/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/balloon/Makefile 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,2 @@ + +obj-y += balloon.o diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/balloon/balloon.c tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/balloon/balloon.c --- pristine-linux-2.6.10-rc3/drivers/xen/balloon/balloon.c 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/balloon/balloon.c 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,441 @@ +/****************************************************************************** + * balloon.c + * + * Xen balloon driver - enables returning/claiming memory 
to/from Xen. + * + * Copyright (c) 2003, B Dragovic + * Copyright (c) 2003-2004, M Williamson, K Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct proc_dir_entry *balloon_pde; + +static DECLARE_MUTEX(balloon_mutex); +spinlock_t balloon_lock = SPIN_LOCK_UNLOCKED; + +/* We aim for 'current allocation' == 'target allocation'. */ +static unsigned long current_pages; +static unsigned long target_pages; + +/* We may hit the hard limit in Xen. If we do then we remember it. 
*/
+static unsigned long hard_limit;
+
+/*
+ * Drivers may alter the memory reservation independently, but they must
+ * inform the balloon driver so that we can avoid hitting the hard limit.
+ */
+static unsigned long driver_pages;
+
+/* List of ballooned pages, threaded through the mem_map array. */
+static LIST_HEAD(ballooned_pages);
+static unsigned long balloon_low, balloon_high; /* # of lowmem / highmem pages on ballooned_pages */
+
+/* Main work function, always executed in process context. */
+static void balloon_process(void *unused);
+static DECLARE_WORK(balloon_worker, balloon_process, NULL);
+static struct timer_list balloon_timer; /* re-armed by balloon_process() while the target is not met */
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+/* Use the private and mapping fields of struct page as a list. NOTE: macro arguments are not parenthesised; pass plain identifiers only. */
+#define PAGE_TO_LIST(p) ( (struct list_head *)&p->private )
+#define LIST_TO_PAGE(l) ( list_entry( ((unsigned long *)l), \
+                                      struct page, private ) )
+#define UNLIST_PAGE(p)  do { list_del(PAGE_TO_LIST(p)); \
+                             p->mapping = NULL; \
+                             p->private = 0; } while(0)
+#else
+/* There's a dedicated list field in struct page we can use. */
+#define PAGE_TO_LIST(p) ( &p->list )
+#define LIST_TO_PAGE(l) ( list_entry(l, struct page, list) )
+#define UNLIST_PAGE(p)  ( list_del(&p->list) )
+#define pte_offset_kernel pte_offset
+#define subsys_initcall(_fn) __initcall(_fn)
+#endif
+/* Log helpers: printk at KERN_INFO / KERN_WARNING, every message prefixed with "xen_mem: ". */
+#define IPRINTK(fmt, args...) \
+    printk(KERN_INFO "xen_mem: " fmt, ##args)
+#define WPRINTK(fmt, args...) \
+    printk(KERN_WARNING "xen_mem: " fmt, ##args)
+
+/* balloon_append: add the given page to the balloon and bump balloon_low or balloon_high to match. */
+static void balloon_append(struct page *page)
+{
+    /* Low memory is re-populated first, so highmem pages go at list tail. */
+    if ( PageHighMem(page) )
+    {
+        list_add_tail(PAGE_TO_LIST(page), &ballooned_pages);
+        balloon_high++;
+    }
+    else
+    {
+        list_add(PAGE_TO_LIST(page), &ballooned_pages); /* lowmem goes to the head */
+        balloon_low++;
+    }
+}
+
+/* balloon_retrieve: rescue a page from the balloon, if it is not empty. 
*/ +static struct page *balloon_retrieve(void) +{ + struct page *page; + + if ( list_empty(&ballooned_pages) ) + return NULL; + + page = LIST_TO_PAGE(ballooned_pages.next); + UNLIST_PAGE(page); + + if ( PageHighMem(page) ) + balloon_high--; + else + balloon_low--; + + return page; +} + +static inline pte_t *get_ptep(unsigned long addr) +{ + pgd_t *pgd; + pmd_t *pmd; + + pgd = pgd_offset_k(addr); + if ( pgd_none(*pgd) || pgd_bad(*pgd) ) BUG(); + + pmd = pmd_offset(pgd, addr); + if ( pmd_none(*pmd) || pmd_bad(*pmd) ) BUG(); + + return pte_offset_kernel(pmd, addr); +} + +static void balloon_alarm(unsigned long unused) +{ + schedule_work(&balloon_worker); +} + +static unsigned long current_target(void) +{ + unsigned long target = min(target_pages, hard_limit); + if ( target > (current_pages + balloon_low + balloon_high) ) + target = current_pages + balloon_low + balloon_high; + return target; +} + +/* + * We avoid multiple worker processes conflicting via the balloon mutex. + * We may of course race updates of the target counts (which are protected + * by the balloon lock), or with changes to the Xen hard limit, but we will + * recover from these in time. + */ +static void balloon_process(void *unused) +{ + unsigned long *mfn_list, pfn, i, flags; + struct page *page; + long credit, debt, rc; + void *v; + + down(&balloon_mutex); + + retry: + mfn_list = NULL; + + if ( (credit = current_target() - current_pages) > 0 ) + { + mfn_list = (unsigned long *)vmalloc(credit * sizeof(*mfn_list)); + if ( mfn_list == NULL ) + goto out; + + balloon_lock(flags); + rc = HYPERVISOR_dom_mem_op( + MEMOP_increase_reservation, mfn_list, credit, 0); + balloon_unlock(flags); + if ( rc < credit ) + { + /* We hit the Xen hard limit: reprobe. 
*/ + if ( HYPERVISOR_dom_mem_op( + MEMOP_decrease_reservation, mfn_list, rc, 0) != rc ) + BUG(); + hard_limit = current_pages + rc - driver_pages; + vfree(mfn_list); + goto retry; + } + + for ( i = 0; i < credit; i++ ) + { + if ( (page = balloon_retrieve()) == NULL ) + BUG(); + + pfn = page - mem_map; + if ( phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY ) + BUG(); + + /* Update P->M and M->P tables. */ + phys_to_machine_mapping[pfn] = mfn_list[i]; + queue_machphys_update(mfn_list[i], pfn); + + /* Link back into the page tables if it's not a highmem page. */ + if ( pfn < max_low_pfn ) + queue_l1_entry_update( + get_ptep((unsigned long)__va(pfn << PAGE_SHIFT)), + (mfn_list[i] << PAGE_SHIFT) | pgprot_val(PAGE_KERNEL)); + + /* Finally, relinquish the memory back to the system allocator. */ + ClearPageReserved(page); + set_page_count(page, 1); + __free_page(page); + } + + current_pages += credit; + } + else if ( credit < 0 ) + { + debt = -credit; + + mfn_list = (unsigned long *)vmalloc(debt * sizeof(*mfn_list)); + if ( mfn_list == NULL ) + goto out; + + for ( i = 0; i < debt; i++ ) + { + if ( (page = alloc_page(GFP_HIGHUSER)) == NULL ) + { + debt = i; + break; + } + + pfn = page - mem_map; + mfn_list[i] = phys_to_machine_mapping[pfn]; + phys_to_machine_mapping[pfn] = INVALID_P2M_ENTRY; + + if ( !PageHighMem(page) ) + { + v = phys_to_virt((page - mem_map) << PAGE_SHIFT); + scrub_pages(v, 1); + queue_l1_entry_update(get_ptep((unsigned long)v), 0); + } +#ifdef CONFIG_XEN_SCRUB_PAGES + else + { + v = kmap(page); + scrub_pages(v, 1); + kunmap(page); + } +#endif + + balloon_append(page); + } + + /* Flush updates through and flush the TLB. */ + xen_tlb_flush(); + + if ( HYPERVISOR_dom_mem_op( + MEMOP_decrease_reservation, mfn_list, debt, 0) != debt ) + BUG(); + + current_pages -= debt; + } + + out: + if ( mfn_list != NULL ) + vfree(mfn_list); + + /* Schedule more work if there is some still to be done. 
*/ + if ( current_target() != current_pages ) + mod_timer(&balloon_timer, jiffies + HZ); + + up(&balloon_mutex); +} + +/* Resets the Xen limit, sets new target, and kicks off processing. */ +static void set_new_target(unsigned long target) +{ + /* No need for lock. Not read-modify-write updates. */ + hard_limit = ~0UL; + target_pages = target; + schedule_work(&balloon_worker); +} + +static void balloon_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + switch ( msg->subtype ) + { + case CMSG_MEM_REQUEST_SET: + { + mem_request_t *req = (mem_request_t *)&msg->msg[0]; + if ( msg->length != sizeof(mem_request_t) ) + goto parse_error; + set_new_target(req->target); + req->status = 0; + } + break; + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + msg->length = 0; + ctrl_if_send_response(msg); +} + +static int balloon_write(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char memstring[64], *endchar; + unsigned long long target_bytes; + + if ( !capable(CAP_SYS_ADMIN) ) + return -EPERM; + + if ( count <= 1 ) + return -EBADMSG; /* runt */ + if ( count > sizeof(memstring) ) + return -EFBIG; /* too long */ + + if ( copy_from_user(memstring, buffer, count) ) + return -EFAULT; + memstring[sizeof(memstring)-1] = '\0'; + + target_bytes = memparse(memstring, &endchar); + set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static int balloon_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + +#define K(_p) ((_p)<<(PAGE_SHIFT-10)) + len = sprintf( + page, + "Current allocation: %8lu kB\n" + "Requested target: %8lu kB\n" + "Low-mem balloon: %8lu kB\n" + "High-mem balloon: %8lu kB\n" + "Xen hard limit: ", + K(current_pages), K(target_pages), K(balloon_low), K(balloon_high)); + + if ( hard_limit != ~0UL ) + len += sprintf( + page + len, + "%8lu kB (inc. 
%8lu kB driver headroom)\n", + K(hard_limit), K(driver_pages)); + else + len += sprintf( + page + len, + " ??? kB\n"); + + *eof = 1; + return len; +} + +static int __init balloon_init(void) +{ + unsigned long pfn; + struct page *page; + + IPRINTK("Initialising balloon driver.\n"); + + current_pages = min(xen_start_info.nr_pages, max_pfn); + target_pages = current_pages; + balloon_low = 0; + balloon_high = 0; + driver_pages = 0UL; + hard_limit = ~0UL; + + init_timer(&balloon_timer); + balloon_timer.data = 0; + balloon_timer.function = balloon_alarm; + + if ( (balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL ) + { + WPRINTK("Unable to create /proc/xen/balloon.\n"); + return -1; + } + + balloon_pde->read_proc = balloon_read; + balloon_pde->write_proc = balloon_write; + + (void)ctrl_if_register_receiver(CMSG_MEM_REQUEST, balloon_ctrlif_rx, 0); + + /* Initialise the balloon with excess memory space. */ + for ( pfn = xen_start_info.nr_pages; pfn < max_pfn; pfn++ ) + { + page = &mem_map[pfn]; + if ( !PageReserved(page) ) + balloon_append(page); + } + + return 0; +} + +subsys_initcall(balloon_init); + +void balloon_update_driver_allowance(long delta) +{ + unsigned long flags; + balloon_lock(flags); + driver_pages += delta; /* non-atomic update */ + balloon_unlock(flags); +} + +void balloon_put_pages(unsigned long *mfn_list, unsigned long nr_mfns) +{ + unsigned long flags; + + balloon_lock(flags); + if ( HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, + mfn_list, nr_mfns, 0) != nr_mfns ) + BUG(); + current_pages -= nr_mfns; /* non-atomic update */ + balloon_unlock(flags); + + schedule_work(&balloon_worker); +} + +EXPORT_SYMBOL(balloon_update_driver_allowance); +EXPORT_SYMBOL(balloon_put_pages); diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/blkback/Makefile tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkback/Makefile --- pristine-linux-2.6.10-rc3/drivers/xen/blkback/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ 
tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkback/Makefile 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,2 @@ + +obj-y := blkback.o control.o interface.o vbd.o diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/blkback/blkback.c tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkback/blkback.c --- pristine-linux-2.6.10-rc3/drivers/xen/blkback/blkback.c 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkback/blkback.c 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,586 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/main.c + * + * Back-end of the driver for virtual block devices. This portion of the + * driver exports a 'unified' block-device interface that can be accessed + * by any operating system that implements a compatible front end. A + * reference front-end implementation can be found in: + * arch/xen/drivers/blkif/frontend + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + */ + +#include "common.h" + +/* + * These are rather arbitrary. They are fairly large because adjacent requests + * pulled from a communication ring are quite likely to end up being part of + * the same scatter/gather request at the disc. + * + * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW ** + * This will increase the chances of being able to write whole tracks. + * 64 should be enough to keep us competitive with Linux. + */ +#define MAX_PENDING_REQS 64 +#define BATCH_PER_DOMAIN 16 + +static unsigned long mmap_vstart; +#define MMAP_PAGES_PER_REQUEST \ + (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1) +#define MMAP_PAGES \ + (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST) +#define MMAP_VADDR(_req,_seg) \ + (mmap_vstart + \ + ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * PAGE_SIZE)) + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. 
Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+    blkif_t       *blkif;     /* interface the request arrived on */
+    unsigned long  id;        /* guest's request id, echoed in the response */
+    int            nr_pages;  /* pages mapped for this request; unmapped on completion */
+    atomic_t       pendcnt;   /* sub-requests still outstanding */
+    unsigned short operation; /* READ or WRITE, as handed to the block layer */
+    int            status;    /* BLKIF_RSP_OKAY until any sub-request fails */
+} pending_req_t;
+
+/*
+ * We can't allocate pending_req's in order, since they may complete out of 
+ * order. We therefore maintain an allocation ring. This ring also indicates 
+ * when enough work has been passed down -- at that point the allocation ring 
+ * will be empty.
+ */
+static pending_req_t pending_reqs[MAX_PENDING_REQS];
+static unsigned char pending_ring[MAX_PENDING_REQS]; /* ring of free indices into pending_reqs[] */
+static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED; /* serialises updates of pending_prod */
+/* NB. We use a different index type to differentiate from shared blk rings. */
+typedef unsigned int PEND_RING_IDX;
+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) /* only correct while MAX_PENDING_REQS is a power of two */
+static PEND_RING_IDX pending_prod, pending_cons;
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) /* slots currently in use */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+static kmem_cache_t *buffer_head_cachep;
+#endif
+
+static int do_block_io_op(blkif_t *blkif, int max_to_do);
+static void dispatch_probe(blkif_t *blkif, blkif_request_t *req);
+static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static void make_response(blkif_t *blkif, unsigned long id, 
+                          unsigned short op, int st);
+/* fast_flush_area: one multicall that writes a zero entry for the first nr_pages pages of slot idx. Requires nr_pages >= 1. */
+static void fast_flush_area(int idx, int nr_pages)
+{
+    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];
+    int               i;
+
+    for ( i = 0; i < nr_pages; i++ )
+    {
+        mcl[i].op = __HYPERVISOR_update_va_mapping;
+        mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT;
+        mcl[i].args[1] = 0;
+        mcl[i].args[2] = 0;
+    }
+
+    mcl[nr_pages-1].args[2] = UVMF_FLUSH_TLB; /* only the last entry requests the TLB flush */
+    if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
+        BUG();
+}
+
+
+/******************************************************************
+ * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
+ */
+
+static struct list_head blkio_schedule_list; +static spinlock_t blkio_schedule_list_lock; + +static int __on_blkdev_list(blkif_t *blkif) +{ + return blkif->blkdev_list.next != NULL; +} + +static void remove_from_blkdev_list(blkif_t *blkif) +{ + unsigned long flags; + if ( !__on_blkdev_list(blkif) ) return; + spin_lock_irqsave(&blkio_schedule_list_lock, flags); + if ( __on_blkdev_list(blkif) ) + { + list_del(&blkif->blkdev_list); + blkif->blkdev_list.next = NULL; + blkif_put(blkif); + } + spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); +} + +static void add_to_blkdev_list_tail(blkif_t *blkif) +{ + unsigned long flags; + if ( __on_blkdev_list(blkif) ) return; + spin_lock_irqsave(&blkio_schedule_list_lock, flags); + if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) ) + { + list_add_tail(&blkif->blkdev_list, &blkio_schedule_list); + blkif_get(blkif); + } + spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); +} + + +/****************************************************************** + * SCHEDULER FUNCTIONS + */ + +static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait); + +static int blkio_schedule(void *arg) +{ + DECLARE_WAITQUEUE(wq, current); + + blkif_t *blkif; + struct list_head *ent; + + daemonize( +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + "xenblkd" +#endif + ); + + for ( ; ; ) + { + /* Wait for work to do. */ + add_wait_queue(&blkio_schedule_wait, &wq); + set_current_state(TASK_INTERRUPTIBLE); + if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || + list_empty(&blkio_schedule_list) ) + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&blkio_schedule_wait, &wq); + + /* Queue up a batch of requests. 
*/ + while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && + !list_empty(&blkio_schedule_list) ) + { + ent = blkio_schedule_list.next; + blkif = list_entry(ent, blkif_t, blkdev_list); + blkif_get(blkif); + remove_from_blkdev_list(blkif); + if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) ) + add_to_blkdev_list_tail(blkif); + blkif_put(blkif); + } + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) + /* Push the batch through to disc. */ + run_task_queue(&tq_disk); +#endif + } +} + +static void maybe_trigger_blkio_schedule(void) +{ + /* + * Needed so that two processes, who together make the following predicate + * true, don't both read stale values and evaluate the predicate + * incorrectly. Incredibly unlikely to stall the scheduler on x86, but... + */ + smp_mb(); + + if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && + !list_empty(&blkio_schedule_list) ) + wake_up(&blkio_schedule_wait); +} + + + +/****************************************************************** + * COMPLETION CALLBACK -- Called as bh->b_end_io() + */ + +static void __end_block_io_op(pending_req_t *pending_req, int uptodate) +{ + unsigned long flags; + + /* An error fails the entire request. 
*/ + if ( !uptodate ) + { + DPRINTK("Buffer not up-to-date at end of operation\n"); + pending_req->status = BLKIF_RSP_ERROR; + } + + if ( atomic_dec_and_test(&pending_req->pendcnt) ) + { + int pending_idx = pending_req - pending_reqs; + fast_flush_area(pending_idx, pending_req->nr_pages); + make_response(pending_req->blkif, pending_req->id, + pending_req->operation, pending_req->status); + blkif_put(pending_req->blkif); + spin_lock_irqsave(&pend_prod_lock, flags); + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + spin_unlock_irqrestore(&pend_prod_lock, flags); + maybe_trigger_blkio_schedule(); + } +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) +static void end_block_io_op(struct buffer_head *bh, int uptodate) +{ + __end_block_io_op(bh->b_private, uptodate); + kmem_cache_free(buffer_head_cachep, bh); +} +#else +static int end_block_io_op(struct bio *bio, unsigned int done, int error) +{ + if ( done || error ) + __end_block_io_op(bio->bi_private, (done && !error)); + bio_put(bio); + return error; +} +#endif + + +/****************************************************************************** + * NOTIFICATION FROM GUEST OS. + */ + +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + blkif_t *blkif = dev_id; + add_to_blkdev_list_tail(blkif); + maybe_trigger_blkio_schedule(); + return IRQ_HANDLED; +} + + + +/****************************************************************** + * DOWNWARD CALLS -- These interface with the block-device layer proper. + */ + +static int do_block_io_op(blkif_t *blkif, int max_to_do) +{ + blkif_ring_t *blk_ring = blkif->blk_ring_base; + blkif_request_t *req; + BLKIF_RING_IDX i, rp; + int more_to_do = 0; + + rp = blk_ring->req_prod; + rmb(); /* Ensure we see queued requests up to 'rp'. */ + + /* Take items off the comms ring, taking care not to overflow. 
*/ + for ( i = blkif->blk_req_cons; + (i != rp) && ((i-blkif->blk_resp_prod) != BLKIF_RING_SIZE); + i++ ) + { + if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) ) + { + more_to_do = 1; + break; + } + + req = &blk_ring->ring[MASK_BLKIF_IDX(i)].req; + switch ( req->operation ) + { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + dispatch_rw_block_io(blkif, req); + break; + + case BLKIF_OP_PROBE: + dispatch_probe(blkif, req); + break; + + default: + DPRINTK("error: unknown block io operation [%d]\n", + blk_ring->ring[i].req.operation); + make_response(blkif, blk_ring->ring[i].req.id, + blk_ring->ring[i].req.operation, BLKIF_RSP_ERROR); + break; + } + } + + blkif->blk_req_cons = i; + return more_to_do; +} + +static void dispatch_probe(blkif_t *blkif, blkif_request_t *req) +{ + int rsp = BLKIF_RSP_ERROR; + int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + + /* We expect one buffer only. */ + if ( unlikely(req->nr_segments != 1) ) + goto out; + + /* Make sure the buffer is page-sized. */ + if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) || + (blkif_last_sect(req->frame_and_sects[0]) != 7) ) + goto out; + + if ( HYPERVISOR_update_va_mapping_otherdomain( + MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT, + (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL }, + 0, blkif->domid) ) + goto out; + + rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0), + PAGE_SIZE / sizeof(vdisk_t)); + + out: + fast_flush_area(pending_idx, 1); + make_response(blkif, req->id, req->operation, rsp); +} + +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) +{ + extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); + int operation = (req->operation == BLKIF_OP_WRITE) ? 
WRITE : READ; + short nr_sects; + unsigned long buffer, fas; + int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + pending_req_t *pending_req; + unsigned long remap_prot; + multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST]; + + /* We map virtual scatter/gather segments to physical segments. */ + int new_segs, nr_psegs = 0; + phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1]; + + /* Check that number of segments is sane. */ + if ( unlikely(req->nr_segments == 0) || + unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) + { + DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments); + goto bad_descriptor; + } + + /* + * Check each address/size pair is sane, and convert into a + * physical device and block offset. Note that if the offset and size + * crosses a virtual extent boundary, we may end up with more + * physical scatter/gather segments than virtual segments. + */ + for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects ) + { + fas = req->frame_and_sects[i]; + buffer = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9); + nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1; + + if ( nr_sects <= 0 ) + goto bad_descriptor; + + phys_seg[nr_psegs].dev = req->device; + phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects; + phys_seg[nr_psegs].buffer = buffer; + phys_seg[nr_psegs].nr_sects = nr_sects; + + /* Translate the request into the relevant 'physical device' */ + new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation); + if ( new_segs < 0 ) + { + DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", + operation == READ ? "read" : "write", + req->sector_number + tot_sects, + req->sector_number + tot_sects + nr_sects, + req->device); + goto bad_descriptor; + } + + nr_psegs += new_segs; + ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1)); + } + + /* Nonsensical zero-sized request? 
*/ + if ( unlikely(nr_psegs == 0) ) + goto bad_descriptor; + + if ( operation == READ ) + remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW; + else + remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED; + + for ( i = 0; i < nr_psegs; i++ ) + { + mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain; + mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT; + mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot; + mcl[i].args[2] = 0; + mcl[i].args[3] = blkif->domid; + + phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = + FOREIGN_FRAME(phys_seg[i].buffer >> PAGE_SHIFT); + } + + if ( unlikely(HYPERVISOR_multicall(mcl, nr_psegs) != 0) ) + BUG(); + + for ( i = 0; i < nr_psegs; i++ ) + { + if ( unlikely(mcl[i].args[5] != 0) ) + { + DPRINTK("invalid buffer -- could not remap it\n"); + fast_flush_area(pending_idx, nr_psegs); + goto bad_descriptor; + } + } + + pending_req = &pending_reqs[pending_idx]; + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nr_psegs; + atomic_set(&pending_req->pendcnt, nr_psegs); + pending_cons++; + + blkif_get(blkif); + + /* Now we pass each segment down to the real blkdev layer. 
*/ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) + for ( i = 0; i < nr_psegs; i++ ) + { + struct buffer_head *bh; + + bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC); + if ( unlikely(bh == NULL) ) + { + __end_block_io_op(pending_req, 0); + continue; + } + + memset(bh, 0, sizeof (struct buffer_head)); + + init_waitqueue_head(&bh->b_wait); + bh->b_size = phys_seg[i].nr_sects << 9; + bh->b_dev = phys_seg[i].dev; + bh->b_rdev = phys_seg[i].dev; + bh->b_rsector = (unsigned long)phys_seg[i].sector_number; + bh->b_data = (char *)MMAP_VADDR(pending_idx, i) + + (phys_seg[i].buffer & ~PAGE_MASK); + bh->b_page = virt_to_page(MMAP_VADDR(pending_idx, i)); + bh->b_end_io = end_block_io_op; + bh->b_private = pending_req; + + bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | + (1 << BH_Req) | (1 << BH_Launder); + if ( operation == WRITE ) + bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate); + + atomic_set(&bh->b_count, 1); + + /* Dispatch a single request. We'll flush it to disc later. 
*/ + generic_make_request(operation, bh); + } +#else + for ( i = 0; i < nr_psegs; i++ ) + { + struct bio *bio; + struct bio_vec *bv; + + bio = bio_alloc(GFP_ATOMIC, 1); + if ( unlikely(bio == NULL) ) + { + __end_block_io_op(pending_req, 0); + continue; + } + + bio->bi_bdev = phys_seg[i].bdev; + bio->bi_private = pending_req; + bio->bi_end_io = end_block_io_op; + bio->bi_sector = phys_seg[i].sector_number; + bio->bi_rw = operation; + + bv = bio_iovec_idx(bio, 0); + bv->bv_page = virt_to_page(MMAP_VADDR(pending_idx, i)); + bv->bv_len = phys_seg[i].nr_sects << 9; + bv->bv_offset = phys_seg[i].buffer & ~PAGE_MASK; + + bio->bi_size = bv->bv_len; + bio->bi_vcnt++; + + submit_bio(operation, bio); + } +#endif + + return; + + bad_descriptor: + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); +} + + + +/****************************************************************** + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING + */ + + +static void make_response(blkif_t *blkif, unsigned long id, + unsigned short op, int st) +{ + blkif_response_t *resp; + unsigned long flags; + + /* Place on the response ring for the relevant domain. */ + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + resp = &blkif->blk_ring_base-> + ring[MASK_BLKIF_IDX(blkif->blk_resp_prod)].resp; + resp->id = id; + resp->operation = op; + resp->status = st; + wmb(); /* Ensure other side can see the response fields. */ + blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod; + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + + /* Kick the relevant domain. 
*/ + notify_via_evtchn(blkif->evtchn); +} + +void blkif_deschedule(blkif_t *blkif) +{ + remove_from_blkdev_list(blkif); +} + +static int __init blkif_init(void) +{ + int i; + + if ( !(xen_start_info.flags & SIF_INITDOMAIN) && + !(xen_start_info.flags & SIF_BLK_BE_DOMAIN) ) + return 0; + + blkif_interface_init(); + + if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 ) + BUG(); + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + memset(pending_reqs, 0, sizeof(pending_reqs)); + for ( i = 0; i < MAX_PENDING_REQS; i++ ) + pending_ring[i] = i; + + spin_lock_init(&blkio_schedule_list_lock); + INIT_LIST_HEAD(&blkio_schedule_list); + + if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 ) + BUG(); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) + buffer_head_cachep = kmem_cache_create( + "buffer_head_cache", sizeof(struct buffer_head), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); +#endif + + blkif_ctrlif_init(); + + return 0; +} + +__initcall(blkif_init); diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/blkback/common.h tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkback/common.h --- pristine-linux-2.6.10-rc3/drivers/xen/blkback/common.h 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkback/common.h 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,120 @@ + +#ifndef __BLKIF__BACKEND__COMMON_H__ +#define __BLKIF__BACKEND__COMMON_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define ASSERT(_p) \ + if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) +#else +#define ASSERT(_p) ((void)0) +#define DPRINTK(_f, _a...) 
((void)0) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +typedef struct rb_root rb_root_t; +typedef struct rb_node rb_node_t; +#else +struct block_device; +#endif + +typedef struct blkif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned long shmem_frame; + unsigned int evtchn; + int irq; + /* Comms information. */ + blkif_ring_t *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */ + BLKIF_RING_IDX blk_req_cons; /* Request consumer. */ + BLKIF_RING_IDX blk_resp_prod; /* Private version of resp. producer. */ + /* VBDs attached to this interface. */ + rb_root_t vbd_rb; /* Mapping from 16-bit vdevices to VBDs. */ + spinlock_t vbd_lock; /* Protects VBD mapping. */ + /* Private fields. */ + enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; + /* + * DISCONNECT response is deferred until pending requests are ack'ed. + * We therefore need to store the id from the original request. + */ + u8 disconnect_rspid; + struct blkif_st *hash_next; + struct list_head blkdev_list; + spinlock_t blk_ring_lock; + atomic_t refcnt; + + struct work_struct work; +} blkif_t; + +void blkif_create(blkif_be_create_t *create); +void blkif_destroy(blkif_be_destroy_t *destroy); +void blkif_connect(blkif_be_connect_t *connect); +int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id); +void blkif_disconnect_complete(blkif_t *blkif); +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle); +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define blkif_put(_b) \ + do { \ + if ( atomic_dec_and_test(&(_b)->refcnt) ) \ + blkif_disconnect_complete(_b); \ + } while (0) + +/* An entry in a list of xen_extents. 
*/ +typedef struct _blkif_extent_le { + blkif_extent_t extent; /* an individual extent */ + struct _blkif_extent_le *next; /* and a pointer to the next */ + struct block_device *bdev; +} blkif_extent_le_t; + +typedef struct _vbd { + blkif_vdev_t vdevice; /* what the domain refers to this vbd as */ + unsigned char readonly; /* Non-zero -> read-only */ + unsigned char type; /* VDISK_TYPE_xxx */ + blkif_extent_le_t *extents; /* list of xen_extents making up this vbd */ + rb_node_t rb; /* for linking into R-B tree lookup struct */ +} vbd_t; + +void vbd_create(blkif_be_vbd_create_t *create); +void vbd_grow(blkif_be_vbd_grow_t *grow); +void vbd_shrink(blkif_be_vbd_shrink_t *shrink); +void vbd_destroy(blkif_be_vbd_destroy_t *delete); +int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds); +void destroy_all_vbds(blkif_t *blkif); + +/* Describes a [partial] disk extent (part of a block io request) */ +typedef struct { + unsigned short dev; + unsigned short nr_sects; + struct block_device *bdev; + unsigned long buffer; + blkif_sector_t sector_number; +} phys_seg_t; + +int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation); + +void blkif_interface_init(void); +void blkif_ctrlif_init(void); + +void blkif_deschedule(blkif_t *blkif); + +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); + +#endif /* __BLKIF__BACKEND__COMMON_H__ */ diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/blkback/control.c tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkback/control.c --- pristine-linux-2.6.10-rc3/drivers/xen/blkback/control.c 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkback/control.c 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,87 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/control.c + * + * Routines for interfacing with the control plane. 
+ * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" + +static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + DPRINTK("Received blkif backend message, subtype=%d\n", msg->subtype); + + switch ( msg->subtype ) + { + case CMSG_BLKIF_BE_CREATE: + if ( msg->length != sizeof(blkif_be_create_t) ) + goto parse_error; + blkif_create((blkif_be_create_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_DESTROY: + if ( msg->length != sizeof(blkif_be_destroy_t) ) + goto parse_error; + blkif_destroy((blkif_be_destroy_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_CONNECT: + if ( msg->length != sizeof(blkif_be_connect_t) ) + goto parse_error; + blkif_connect((blkif_be_connect_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_DISCONNECT: + if ( msg->length != sizeof(blkif_be_disconnect_t) ) + goto parse_error; + if ( !blkif_disconnect((blkif_be_disconnect_t *)&msg->msg[0],msg->id) ) + return; /* Sending the response is deferred until later. */ + break; + case CMSG_BLKIF_BE_VBD_CREATE: + if ( msg->length != sizeof(blkif_be_vbd_create_t) ) + goto parse_error; + vbd_create((blkif_be_vbd_create_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_DESTROY: + if ( msg->length != sizeof(blkif_be_vbd_destroy_t) ) + goto parse_error; + vbd_destroy((blkif_be_vbd_destroy_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_GROW: + if ( msg->length != sizeof(blkif_be_vbd_grow_t) ) + goto parse_error; + vbd_grow((blkif_be_vbd_grow_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_SHRINK: + if ( msg->length != sizeof(blkif_be_vbd_shrink_t) ) + goto parse_error; + vbd_shrink((blkif_be_vbd_shrink_t *)&msg->msg[0]); + break; + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + DPRINTK("Parse error while reading message subtype %d, len %d\n", + msg->subtype, msg->length); + msg->length = 0; + ctrl_if_send_response(msg); +} + +void blkif_ctrlif_init(void) +{ + ctrl_msg_t cmsg; + blkif_be_driver_status_t st; + + 
(void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); + + /* Send a driver-UP notification to the domain controller. */ + cmsg.type = CMSG_BLKIF_BE; + cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS; + cmsg.length = sizeof(blkif_be_driver_status_t); + st.status = BLKIF_DRIVER_STATUS_UP; + memcpy(cmsg.msg, &st, sizeof(st)); + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); +} diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/blkback/interface.c tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkback/interface.c --- pristine-linux-2.6.10-rc3/drivers/xen/blkback/interface.c 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkback/interface.c 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,246 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/interface.c + * + * Block-device interface management. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +#define VMALLOC_VMADDR(x) ((unsigned long)(x)) +#endif + +#define BLKIF_HASHSZ 1024 +#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1)) + +static kmem_cache_t *blkif_cachep; +static blkif_t *blkif_hash[BLKIF_HASHSZ]; + +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) +{ + blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif != NULL) && + ((blkif->domid != domid) || (blkif->handle != handle)) ) + blkif = blkif->hash_next; + return blkif; +} + +static void __blkif_disconnect_complete(void *arg) +{ + blkif_t *blkif = (blkif_t *)arg; + ctrl_msg_t cmsg; + blkif_be_disconnect_t disc; + + /* + * These can't be done in blkif_disconnect() because at that point there + * may be outstanding requests at the disc whose asynchronous responses + * must still be notified to the remote driver. 
+ */ + unbind_evtchn_from_irq(blkif->evtchn); + vfree(blkif->blk_ring_base); + + /* Construct the deferred response message. */ + cmsg.type = CMSG_BLKIF_BE; + cmsg.subtype = CMSG_BLKIF_BE_DISCONNECT; + cmsg.id = blkif->disconnect_rspid; + cmsg.length = sizeof(blkif_be_disconnect_t); + disc.domid = blkif->domid; + disc.blkif_handle = blkif->handle; + disc.status = BLKIF_BE_STATUS_OKAY; + memcpy(cmsg.msg, &disc, sizeof(disc)); + + /* + * Make sure message is constructed /before/ status change, because + * after the status change the 'blkif' structure could be deallocated at + * any time. Also make sure we send the response /after/ status change, + * as otherwise a subsequent CONNECT request could spuriously fail if + * another CPU doesn't see the status change yet. + */ + mb(); + if ( blkif->status != DISCONNECTING ) + BUG(); + blkif->status = DISCONNECTED; + mb(); + + /* Send the successful response. */ + ctrl_if_send_response(&cmsg); +} + +void blkif_disconnect_complete(blkif_t *blkif) +{ + INIT_WORK(&blkif->work, __blkif_disconnect_complete, (void *)blkif); + schedule_work(&blkif->work); +} + +void blkif_create(blkif_be_create_t *create) +{ + domid_t domid = create->domid; + unsigned int handle = create->blkif_handle; + blkif_t **pblkif, *blkif; + + if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL ) + { + DPRINTK("Could not create blkif: out of memory\n"); + create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + blkif->handle = handle; + blkif->status = DISCONNECTED; + spin_lock_init(&blkif->vbd_lock); + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 0); + + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( *pblkif != NULL ) + { + if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) + { + DPRINTK("Could not create blkif: already exists\n"); + create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS; + kmem_cache_free(blkif_cachep, blkif); + 
return; + } + pblkif = &(*pblkif)->hash_next; + } + + blkif->hash_next = *pblkif; + *pblkif = blkif; + + DPRINTK("Successfully created blkif\n"); + create->status = BLKIF_BE_STATUS_OKAY; +} + +void blkif_destroy(blkif_be_destroy_t *destroy) +{ + domid_t domid = destroy->domid; + unsigned int handle = destroy->blkif_handle; + blkif_t **pblkif, *blkif; + + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif = *pblkif) != NULL ) + { + if ( (blkif->domid == domid) && (blkif->handle == handle) ) + { + if ( blkif->status != DISCONNECTED ) + goto still_connected; + goto destroy; + } + pblkif = &blkif->hash_next; + } + + destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + + still_connected: + destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; + return; + + destroy: + *pblkif = blkif->hash_next; /* unlink from the hash chain before freeing */ + destroy_all_vbds(blkif); + kmem_cache_free(blkif_cachep, blkif); + destroy->status = BLKIF_BE_STATUS_OKAY; +} + +void blkif_connect(blkif_be_connect_t *connect) +{ + domid_t domid = connect->domid; + unsigned int handle = connect->blkif_handle; + unsigned int evtchn = connect->evtchn; + unsigned long shmem_frame = connect->shmem_frame; + struct vm_struct *vma; + pgprot_t prot; + int error; + blkif_t *blkif; + + blkif = blkif_find_by_handle(domid, handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("blkif_connect attempted for non-existent blkif (%u,%u)\n", + connect->domid, connect->blkif_handle); + connect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) + { + connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED); + error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr), + shmem_frame<<PAGE_SHIFT, PAGE_SIZE, + prot, domid); + if ( error != 0 ) /* report the failure in connect->status and release the VM area */ + { + if ( error == -ENOMEM ) + connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + else if ( error == -EFAULT ) + connect->status = BLKIF_BE_STATUS_MAPPING_ERROR; + else + connect->status = BLKIF_BE_STATUS_ERROR; + vfree(vma->addr);
+ return; + } + + if ( blkif->status != DISCONNECTED ) + { + connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; + vfree(vma->addr); + return; + } + + blkif->evtchn = evtchn; + blkif->irq = bind_evtchn_to_irq(evtchn); + blkif->shmem_frame = shmem_frame; + blkif->blk_ring_base = (blkif_ring_t *)vma->addr; + blkif->status = CONNECTED; + blkif_get(blkif); + + request_irq(blkif->irq, blkif_be_int, 0, "blkif-backend", blkif); + + connect->status = BLKIF_BE_STATUS_OKAY; +} + +int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id) +{ + domid_t domid = disconnect->domid; + unsigned int handle = disconnect->blkif_handle; + blkif_t *blkif; + + blkif = blkif_find_by_handle(domid, handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("blkif_disconnect attempted for non-existent blkif" + " (%u,%u)\n", disconnect->domid, disconnect->blkif_handle); + disconnect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return 1; /* Caller will send response error message. */ + } + + if ( blkif->status == CONNECTED ) + { + blkif->status = DISCONNECTING; + blkif->disconnect_rspid = rsp_id; + wmb(); /* Let other CPUs see the status change. */ + free_irq(blkif->irq, blkif); + blkif_deschedule(blkif); + blkif_put(blkif); + return 0; /* Caller should not send response message. 
*/ + } + + disconnect->status = BLKIF_BE_STATUS_OKAY; + return 1; +} + +void __init blkif_interface_init(void) +{ + blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), + 0, 0, NULL, NULL); + memset(blkif_hash, 0, sizeof(blkif_hash)); +} diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/blkback/vbd.c tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkback/vbd.c --- pristine-linux-2.6.10-rc3/drivers/xen/blkback/vbd.c 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkback/vbd.c 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,576 @@ +/****************************************************************************** + * blkback/vbd.c + * + * Routines for managing virtual block devices (VBDs). + * + * NOTE: vbd_lock protects updates to the rb_tree against concurrent lookups + * in vbd_translate. All other lookups are implicitly protected because the + * only caller (the control message dispatch routine) serializes the calls. + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + */ + +#include "common.h" + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +static dev_t vbd_map_devnum(blkif_pdev_t); +#endif + +void vbd_create(blkif_be_vbd_create_t *create) +{ + vbd_t *vbd; + rb_node_t **rb_p, *rb_parent = NULL; + blkif_t *blkif; + blkif_vdev_t vdevice = create->vdevice; + + blkif = blkif_find_by_handle(create->domid, create->blkif_handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("vbd_create attempted for non-existent blkif (%u,%u)\n", + create->domid, create->blkif_handle); + create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + rb_p = &blkif->vbd_rb.rb_node; + while ( *rb_p != NULL ) + { + rb_parent = *rb_p; + vbd = rb_entry(rb_parent, vbd_t, rb); + if ( vdevice < vbd->vdevice ) + { + rb_p = &rb_parent->rb_left; + } + else if ( vdevice > vbd->vdevice ) + { + rb_p = &rb_parent->rb_right; + } + else + { + DPRINTK("vbd_create attempted for already existing vbd\n"); + create->status = 
BLKIF_BE_STATUS_VBD_EXISTS; + return; + } + } + + if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) ) + { + DPRINTK("vbd_create: out of memory\n"); + create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + vbd->vdevice = vdevice; + vbd->readonly = create->readonly; + vbd->type = VDISK_TYPE_DISK | VDISK_FLAG_VIRT; + vbd->extents = NULL; + + spin_lock(&blkif->vbd_lock); + rb_link_node(&vbd->rb, rb_parent, rb_p); + rb_insert_color(&vbd->rb, &blkif->vbd_rb); + spin_unlock(&blkif->vbd_lock); + + DPRINTK("Successful creation of vdev=%04x (dom=%u)\n", + vdevice, create->domid); + create->status = BLKIF_BE_STATUS_OKAY; +} + + +/* Grow a VBD by appending a new extent. Fails if the VBD doesn't exist. */ +void vbd_grow(blkif_be_vbd_grow_t *grow) +{ + blkif_t *blkif; + blkif_extent_le_t **px, *x; + vbd_t *vbd = NULL; + rb_node_t *rb; + blkif_vdev_t vdevice = grow->vdevice; + unsigned long sz; + + blkif = blkif_find_by_handle(grow->domid, grow->blkif_handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("vbd_grow attempted for non-existent blkif (%u,%u)\n", + grow->domid, grow->blkif_handle); + grow->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + rb = blkif->vbd_rb.rb_node; + while ( rb != NULL ) + { + vbd = rb_entry(rb, vbd_t, rb); + if ( vdevice < vbd->vdevice ) + rb = rb->rb_left; + else if ( vdevice > vbd->vdevice ) + rb = rb->rb_right; + else + break; + } + + if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) ) + { + DPRINTK("vbd_grow: attempted to append extent to non-existent VBD.\n"); + grow->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; + return; + } + + if ( grow->extent.sector_start > 0 ) + { + DPRINTK("vbd_grow: dev %08x start not zero.\n", grow->extent.device); + grow->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND; + return; + } + + if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t), + GFP_KERNEL)) == NULL) ) + { + DPRINTK("vbd_grow: out of memory\n"); + grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + 
+ x->extent.device = grow->extent.device; + x->extent.sector_start = grow->extent.sector_start; + x->extent.sector_length = grow->extent.sector_length; + x->next = (blkif_extent_le_t *)NULL; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + x->bdev = open_by_devnum(vbd_map_devnum(x->extent.device), + vbd->readonly ? FMODE_READ : FMODE_WRITE); + if ( IS_ERR(x->bdev) ) + { + DPRINTK("vbd_grow: device %08x doesn't exist.\n", x->extent.device); + grow->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND; + goto out; + } + /* XXXcl maybe bd_claim? */ + + if ( (x->bdev->bd_disk == NULL) ) + { + DPRINTK("vbd_grow: device %08x doesn't exist.\n", x->extent.device); + grow->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND; + blkdev_put(x->bdev); + goto out; + } + + /* get size in sectors */ + if ( x->bdev->bd_part ) + sz = x->bdev->bd_part->nr_sects; + else + sz = x->bdev->bd_disk->capacity; + +#else + if( !blk_size[MAJOR(x->extent.device)] ) + { + DPRINTK("vbd_grow: device %08x doesn't exist.\n", x->extent.device); + grow->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND; + goto out; + } + + /* convert blocks (1KB) to sectors */ + sz = blk_size[MAJOR(x->extent.device)][MINOR(x->extent.device)] * 2; + + if ( sz == 0 ) + { + DPRINTK("vbd_grow: device %08x zero size!\n", x->extent.device); + grow->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND; + goto out; + } +#endif + + /* + * NB. This test assumes sector_start == 0, which is always the case + * in Xen 1.3. In fact the whole grow/shrink interface could do with + * some simplification. + */ + if ( x->extent.sector_length > sz ) + x->extent.sector_length = sz; + + DPRINTK("vbd_grow: requested_len %llu actual_len %lu\n", + x->extent.sector_length, sz); + + for ( px = &vbd->extents; *px != NULL; px = &(*px)->next ) + continue; + + *px = x; /* ATOMIC: no need for vbd_lock. 
*/ + + DPRINTK("Successful grow of vdev=%04x (dom=%u)\n", + vdevice, grow->domid); + + grow->status = BLKIF_BE_STATUS_OKAY; + return; + + out: + kfree(x); +} + + +void vbd_shrink(blkif_be_vbd_shrink_t *shrink) +{ + blkif_t *blkif; + blkif_extent_le_t **px, *x; + vbd_t *vbd = NULL; + rb_node_t *rb; + blkif_vdev_t vdevice = shrink->vdevice; + + blkif = blkif_find_by_handle(shrink->domid, shrink->blkif_handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("vbd_shrink attempted for non-existent blkif (%u,%u)\n", + shrink->domid, shrink->blkif_handle); + shrink->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + rb = blkif->vbd_rb.rb_node; + while ( rb != NULL ) + { + vbd = rb_entry(rb, vbd_t, rb); + if ( vdevice < vbd->vdevice ) + rb = rb->rb_left; + else if ( vdevice > vbd->vdevice ) + rb = rb->rb_right; + else + break; + } + + if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) ) + { + shrink->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; + return; + } + + if ( unlikely(vbd->extents == NULL) ) + { + shrink->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND; + return; + } + + /* Find the last extent. We now know that there is at least one. */ + for ( px = &vbd->extents; (*px)->next != NULL; px = &(*px)->next ) + continue; + + x = *px; + *px = x->next; /* ATOMIC: no need for vbd_lock. 
*/ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + blkdev_put(x->bdev); +#endif + kfree(x); + + shrink->status = BLKIF_BE_STATUS_OKAY; +} + + +void vbd_destroy(blkif_be_vbd_destroy_t *destroy) +{ + blkif_t *blkif; + vbd_t *vbd; + rb_node_t *rb; + blkif_extent_le_t *x, *t; + blkif_vdev_t vdevice = destroy->vdevice; + + blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("vbd_destroy attempted for non-existent blkif (%u,%u)\n", + destroy->domid, destroy->blkif_handle); + destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + rb = blkif->vbd_rb.rb_node; + while ( rb != NULL ) + { + vbd = rb_entry(rb, vbd_t, rb); + if ( vdevice < vbd->vdevice ) + rb = rb->rb_left; + else if ( vdevice > vbd->vdevice ) + rb = rb->rb_right; + else + goto found; + } + + destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; + return; + + found: + spin_lock(&blkif->vbd_lock); + rb_erase(rb, &blkif->vbd_rb); + spin_unlock(&blkif->vbd_lock); + + x = vbd->extents; /* save the extent list head: vbd itself is freed next */ + kfree(vbd); + + while ( x != NULL ) + { + t = x->next; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + blkdev_put(x->bdev); +#endif + kfree(x); + x = t; + } + destroy->status = BLKIF_BE_STATUS_OKAY; /* report success explicitly, as vbd_create/grow/shrink do */ +} + +void destroy_all_vbds(blkif_t *blkif) +{ + vbd_t *vbd; + rb_node_t *rb; + blkif_extent_le_t *x, *t; + + spin_lock(&blkif->vbd_lock); + + while ( (rb = blkif->vbd_rb.rb_node) != NULL ) + { + vbd = rb_entry(rb, vbd_t, rb); + + rb_erase(rb, &blkif->vbd_rb); + x = vbd->extents; + kfree(vbd); + + while ( x != NULL ) + { + t = x->next; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + blkdev_put(x->bdev); +#endif + kfree(x); + x = t; + } + } + + spin_unlock(&blkif->vbd_lock); +} + + +static int vbd_probe_single(blkif_t *blkif, vdisk_t *vbd_info, vbd_t *vbd) +{ + blkif_extent_le_t *x; + + vbd_info->device = vbd->vdevice; + vbd_info->info = vbd->type; + if ( vbd->readonly ) + vbd_info->info |= VDISK_FLAG_RO; + vbd_info->capacity = 0ULL; + for ( x = vbd->extents; x != NULL; x = x->next ) +
vbd_info->capacity += x->extent.sector_length; + + return 0; +} + + +int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds) +{ + int rc = 0, nr_vbds = 0; + rb_node_t *rb; + + spin_lock(&blkif->vbd_lock); + + if ( (max_vbds <= 0) || ((rb = blkif->vbd_rb.rb_node) == NULL) ) + goto out; /* no room in vbd_info[], or no VBDs: report zero entries */ + + new_subtree: + /* STEP 1. Find least node (it'll be left-most). */ + while ( rb->rb_left != NULL ) + rb = rb->rb_left; + + for ( ; ; ) + { + /* STEP 2. Dealt with left subtree. Now process current node. */ + if ( (rc = vbd_probe_single(blkif, &vbd_info[nr_vbds], + rb_entry(rb, vbd_t, rb))) != 0 ) + goto out; + if ( ++nr_vbds == max_vbds ) + goto out; + + /* STEP 3. Process right subtree, if any. */ + if ( rb->rb_right != NULL ) + { + rb = rb->rb_right; + goto new_subtree; + } + + /* STEP 4. Done both subtrees. Head back through ancestors. */ + for ( ; ; ) + { + /* We're done when we get back to the root node. */ + if ( rb->rb_parent == NULL ) + goto out; + /* If we are left of parent, then parent is next to process. */ + if ( rb->rb_parent->rb_left == rb ) + break; + /* If we are right of parent, then we climb to grandparent. */ + rb = rb->rb_parent; + } + + rb = rb->rb_parent; + } + + out: + spin_unlock(&blkif->vbd_lock); + return (rc == 0) ? nr_vbds : rc; /* count of entries filled, or the probe error */ +} + + +int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation) +{ + blkif_extent_le_t *x; + vbd_t *vbd; + rb_node_t *rb; + blkif_sector_t sec_off; + unsigned long nr_secs; + + /* Take the vbd_lock because another thread could be updating the tree.
*/ + spin_lock(&blkif->vbd_lock); + + rb = blkif->vbd_rb.rb_node; + while ( rb != NULL ) + { + vbd = rb_entry(rb, vbd_t, rb); + if ( pseg->dev < vbd->vdevice ) + rb = rb->rb_left; + else if ( pseg->dev > vbd->vdevice ) + rb = rb->rb_right; + else + goto found; + } + + DPRINTK("vbd_translate; domain %u attempted to access " + "non-existent VBD.\n", blkif->domid); + + spin_unlock(&blkif->vbd_lock); + return -ENODEV; + + found: + + if ( (operation == WRITE) && vbd->readonly ) + { + spin_unlock(&blkif->vbd_lock); + return -EACCES; + } + + /* + * Now iterate through the list of blkif_extents, working out which should + * be used to perform the translation. + */ + sec_off = pseg->sector_number; + nr_secs = pseg->nr_sects; + for ( x = vbd->extents; x != NULL; x = x->next ) + { + if ( sec_off < x->extent.sector_length ) + { + pseg->dev = x->extent.device; + pseg->bdev = x->bdev; + pseg->sector_number = x->extent.sector_start + sec_off; + if ( unlikely((sec_off + nr_secs) > x->extent.sector_length) ) + goto overrun; + spin_unlock(&blkif->vbd_lock); + return 1; + } + sec_off -= x->extent.sector_length; + } + + DPRINTK("vbd_translate: end of vbd.\n"); + spin_unlock(&blkif->vbd_lock); + return -EACCES; + + /* + * Here we deal with overrun onto the following extent. We don't deal with + * overrun of more than one boundary since each request is restricted to + * 2^9 512-byte sectors, so it should be trivial for control software to + * ensure that extents are large enough to prevent excessive overrun. + */ + overrun: + + /* Adjust length of first chunk to run to end of first extent. */ + pseg[0].nr_sects = x->extent.sector_length - sec_off; + + /* Set second chunk buffer and length to start where first chunk ended. */ + pseg[1].buffer = pseg[0].buffer + (pseg[0].nr_sects << 9); + pseg[1].nr_sects = nr_secs - pseg[0].nr_sects; + + /* Now move to the next extent. Check it exists and is long enough! 
*/ + if ( unlikely((x = x->next) == NULL) || + unlikely(x->extent.sector_length < pseg[1].nr_sects) ) + { + DPRINTK("vbd_translate: multiple overruns or end of vbd.\n"); + spin_unlock(&blkif->vbd_lock); + return -EACCES; + } + + /* Store the real device and start sector for the second chunk. */ + pseg[1].dev = x->extent.device; + pseg[1].bdev = x->bdev; + pseg[1].sector_number = x->extent.sector_start; + + spin_unlock(&blkif->vbd_lock); + return 2; +} + + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + +#define MAJOR_XEN(dev) ((dev)>>8) +#define MINOR_XEN(dev) ((dev) & 0xff) + +#ifndef FANCY_REMAPPING +static dev_t vbd_map_devnum(blkif_pdev_t cookie) +{ + int major = MAJOR_XEN(cookie); + int minor = MINOR_XEN(cookie); + + return MKDEV(major, minor); +} +#else +#define XEN_IDE0_MAJOR IDE0_MAJOR +#define XEN_IDE1_MAJOR IDE1_MAJOR +#define XEN_IDE2_MAJOR IDE2_MAJOR +#define XEN_IDE3_MAJOR IDE3_MAJOR +#define XEN_IDE4_MAJOR IDE4_MAJOR +#define XEN_IDE5_MAJOR IDE5_MAJOR +#define XEN_IDE6_MAJOR IDE6_MAJOR +#define XEN_IDE7_MAJOR IDE7_MAJOR +#define XEN_IDE8_MAJOR IDE8_MAJOR +#define XEN_IDE9_MAJOR IDE9_MAJOR +#define XEN_SCSI_DISK0_MAJOR SCSI_DISK0_MAJOR +#define XEN_SCSI_DISK1_MAJOR SCSI_DISK1_MAJOR +#define XEN_SCSI_DISK2_MAJOR SCSI_DISK2_MAJOR +#define XEN_SCSI_DISK3_MAJOR SCSI_DISK3_MAJOR +#define XEN_SCSI_DISK4_MAJOR SCSI_DISK4_MAJOR +#define XEN_SCSI_DISK5_MAJOR SCSI_DISK5_MAJOR +#define XEN_SCSI_DISK6_MAJOR SCSI_DISK6_MAJOR +#define XEN_SCSI_DISK7_MAJOR SCSI_DISK7_MAJOR +#define XEN_SCSI_CDROM_MAJOR SCSI_CDROM_MAJOR + +static dev_t vbd_map_devnum(blkif_pdev_t cookie) +{ + int new_major; + int major = MAJOR_XEN(cookie); + int minor = MINOR_XEN(cookie); + + switch (major) { + case XEN_IDE0_MAJOR: new_major = IDE0_MAJOR; break; + case XEN_IDE1_MAJOR: new_major = IDE1_MAJOR; break; + case XEN_IDE2_MAJOR: new_major = IDE2_MAJOR; break; + case XEN_IDE3_MAJOR: new_major = IDE3_MAJOR; break; + case XEN_IDE4_MAJOR: new_major = IDE4_MAJOR; break; + case XEN_IDE5_MAJOR: 
new_major = IDE5_MAJOR; break; + case XEN_IDE6_MAJOR: new_major = IDE6_MAJOR; break; + case XEN_IDE7_MAJOR: new_major = IDE7_MAJOR; break; + case XEN_IDE8_MAJOR: new_major = IDE8_MAJOR; break; + case XEN_IDE9_MAJOR: new_major = IDE9_MAJOR; break; + case XEN_SCSI_DISK0_MAJOR: new_major = SCSI_DISK0_MAJOR; break; + case XEN_SCSI_DISK1_MAJOR ... XEN_SCSI_DISK7_MAJOR: + new_major = SCSI_DISK1_MAJOR + major - XEN_SCSI_DISK1_MAJOR; + break; + case XEN_SCSI_CDROM_MAJOR: new_major = SCSI_CDROM_MAJOR; break; + default: new_major = 0; break; + } + + return MKDEV(new_major, minor); +} +#endif + +#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION_CODE(2,6,0) */ diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/blkfront/Kconfig tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkfront/Kconfig --- pristine-linux-2.6.10-rc3/drivers/xen/blkfront/Kconfig 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkfront/Kconfig 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,6 @@ + +config XENBLOCK + tristate "Block device driver" + depends on ARCH_XEN + help + Block device driver for Xen diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/blkfront/Makefile tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkfront/Makefile --- pristine-linux-2.6.10-rc3/drivers/xen/blkfront/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkfront/Makefile 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,3 @@ + +obj-y := blkfront.o vbd.o + diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/blkfront/blkfront.c tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkfront/blkfront.c --- pristine-linux-2.6.10-rc3/drivers/xen/blkfront/blkfront.c 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkfront/blkfront.c 2004-11-29 17:28:05.000000000 +0000 @@ -0,0 +1,1416 @@ +/****************************************************************************** + * blkfront.c + * + * XenLinux virtual block-device driver. 
+ * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + * Copyright (c) 2004, Christian Limpach + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +#include "block.h" +#else +#include "common.h" +#include +#include +#endif + +#include +#include +#include +#include +#include + +typedef unsigned char byte; /* from linux/ide.h */ + +/* Control whether runtime update of vbds is enabled. 
*/ +#define ENABLE_VBD_UPDATE 1 + +#if ENABLE_VBD_UPDATE +static void vbd_update(void); +#else +static void vbd_update(void){}; +#endif + +#define BLKIF_STATE_CLOSED 0 +#define BLKIF_STATE_DISCONNECTED 1 +#define BLKIF_STATE_CONNECTED 2 + +static char *blkif_state_name[] = { + [BLKIF_STATE_CLOSED] = "closed", + [BLKIF_STATE_DISCONNECTED] = "disconnected", + [BLKIF_STATE_CONNECTED] = "connected", +}; + +static char * blkif_status_name[] = { + [BLKIF_INTERFACE_STATUS_CLOSED] = "closed", + [BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected", + [BLKIF_INTERFACE_STATUS_CONNECTED] = "connected", + [BLKIF_INTERFACE_STATUS_CHANGED] = "changed", +}; + +#define WPRINTK(fmt, args...) printk(KERN_WARNING "xen_blk: " fmt, ##args) + +static int blkif_handle = 0; +static unsigned int blkif_state = BLKIF_STATE_CLOSED; +static unsigned int blkif_evtchn = 0; +static unsigned int blkif_irq = 0; + +static int blkif_control_rsp_valid; +static blkif_response_t blkif_control_rsp; + +static blkif_ring_t *blk_ring = NULL; +static BLKIF_RING_IDX resp_cons; /* Response consumer for comms ring. */ +static BLKIF_RING_IDX req_prod; /* Private request producer. */ + +unsigned long rec_ring_free; +blkif_request_t rec_ring[BLKIF_RING_SIZE]; + +static int recovery = 0; /* "Recovery in progress" flag. Protected + * by the blkif_io_lock */ + +/* We plug the I/O ring if the driver is suspended or if the ring is full. 
*/ +#define BLKIF_RING_FULL (((req_prod - resp_cons) == BLKIF_RING_SIZE) || \ + (blkif_state != BLKIF_STATE_CONNECTED)) + +static void kick_pending_request_queues(void); + +int __init xlblk_init(void); + +void blkif_completion( blkif_request_t *req ); + +static inline int GET_ID_FROM_FREELIST( void ) +{ + unsigned long free = rec_ring_free; + + if ( free >= BLKIF_RING_SIZE ) /* rec_ring[] has BLKIF_RING_SIZE entries, so the last valid id is one less */ + BUG(); + + rec_ring_free = rec_ring[free].id; /* a free slot's id field links to the next free slot */ + + rec_ring[free].id = 0x0fffffee; /* debug */ + + return free; +} + +static inline void ADD_ID_TO_FREELIST( unsigned long id ) +{ + rec_ring[id].id = rec_ring_free; /* push: the slot's id field links to the old list head */ + rec_ring_free = id; +} + + +/************************ COMMON CODE (inlined) ************************/ + +/* Kernel-specific definitions used in the common code */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +#define DISABLE_SCATTERGATHER() +#else +static int sg_operation = -1; +#define DISABLE_SCATTERGATHER() (sg_operation = -1) +#endif + +static inline void translate_req_to_pfn(blkif_request_t *xreq, + blkif_request_t *req) +{ + int i; + + xreq->operation = req->operation; + xreq->nr_segments = req->nr_segments; + xreq->device = req->device; + /* preserve id */ + xreq->sector_number = req->sector_number; + + for ( i = 0; i < req->nr_segments; i++ ) + xreq->frame_and_sects[i] = machine_to_phys(req->frame_and_sects[i]); +} + +static inline void translate_req_to_mfn(blkif_request_t *xreq, + blkif_request_t *req) +{ + int i; + + xreq->operation = req->operation; + xreq->nr_segments = req->nr_segments; + xreq->device = req->device; + xreq->id = req->id; /* copy id (unlike above) */ + xreq->sector_number = req->sector_number; + + for ( i = 0; i < req->nr_segments; i++ ) + xreq->frame_and_sects[i] = phys_to_machine(req->frame_and_sects[i]); +} + + +static inline void flush_requests(void) +{ + DISABLE_SCATTERGATHER(); + wmb(); /* Ensure that the backend can see the requests.
*/ + blk_ring->req_prod = req_prod; + notify_via_evtchn(blkif_evtchn); +} + + + + +/************************** KERNEL VERSION 2.6 **************************/ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + +__initcall(xlblk_init); + +#if ENABLE_VBD_UPDATE +static void vbd_update(void) +{ +} +#endif /* ENABLE_VBD_UPDATE */ + +static void kick_pending_request_queues(void) +{ + + if ( (xlbd_blk_queue != NULL) && + test_bit(QUEUE_FLAG_STOPPED, &xlbd_blk_queue->queue_flags) ) + { + blk_start_queue(xlbd_blk_queue); + /* XXXcl call to request_fn should not be needed but + * we get stuck without... needs investigating + */ + xlbd_blk_queue->request_fn(xlbd_blk_queue); + } + +} + + +int blkif_open(struct inode *inode, struct file *filep) +{ + struct gendisk *gd = inode->i_bdev->bd_disk; + struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data; + + /* Update of usage count is protected by per-device semaphore. */ + di->mi->usage++; + + return 0; +} + + +int blkif_release(struct inode *inode, struct file *filep) +{ + struct gendisk *gd = inode->i_bdev->bd_disk; + struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data; + + /* + * When usage drops to zero it may allow more VBD updates to occur. + * Update of usage count is protected by a per-device semaphore. 
+ */ + if (--di->mi->usage == 0) { + vbd_update(); + } + + return 0; +} + + +int blkif_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument) +{ + /* struct gendisk *gd = inode->i_bdev->bd_disk; */ + + DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", + command, (long)argument, inode->i_rdev); + + switch (command) { + + case HDIO_GETGEO: + /* return ENOSYS to use defaults */ + return -ENOSYS; + + default: + printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", + command); + return -ENOSYS; + } + + return 0; +} + +#if 0 +/* check media change: should probably do something here in some cases :-) */ +int blkif_check(kdev_t dev) +{ + DPRINTK("blkif_check\n"); + return 0; +} + +int blkif_revalidate(kdev_t dev) +{ + struct block_device *bd; + struct gendisk *gd; + xen_block_t *disk; + unsigned long capacity; + int i, rc = 0; + + if ( (bd = bdget(dev)) == NULL ) + return -EINVAL; + + /* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. + */ + down(&bd->bd_sem); + + if ( ((gd = get_gendisk(dev)) == NULL) || + ((disk = xldev_to_xldisk(dev)) == NULL) || + ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) ) + { + rc = -EINVAL; + goto out; + } + + if ( disk->usage > 1 ) + { + rc = -EBUSY; + goto out; + } + + /* Only reread partition table if VBDs aren't mapped to partitions. */ + if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) ) + { + for ( i = gd->max_p - 1; i >= 0; i-- ) + { + invalidate_device(dev+i, 1); + gd->part[MINOR(dev+i)].start_sect = 0; + gd->part[MINOR(dev+i)].nr_sects = 0; + gd->sizes[MINOR(dev+i)] = 0; + } + + grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity); + } + + out: + up(&bd->bd_sem); + bdput(bd); + return rc; +} +#endif + +/* + * blkif_queue_request + * + * request block io + * + * id: for guest use only. + * operation: BLKIF_OP_{READ,WRITE,PROBE} + * buffer: buffer to read/write into. 
this should be a + * virtual address in the guest os. + */ +static int blkif_queue_request(struct request *req) +{ + struct xlbd_disk_info *di = + (struct xlbd_disk_info *)req->rq_disk->private_data; + unsigned long buffer_ma; + blkif_request_t *ring_req; + struct bio *bio; + struct bio_vec *bvec; + int idx, s; + unsigned long id; + unsigned int fsect, lsect; + + if (unlikely(blkif_state != BLKIF_STATE_CONNECTED)) + return 1; + + /* Fill out a communications ring structure. */ + ring_req = &blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req; + id = GET_ID_FROM_FREELIST(); + rec_ring[id].id = (unsigned long) req; + + ring_req->id = id; + ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE : + BLKIF_OP_READ; + ring_req->sector_number = (blkif_sector_t)req->sector; + ring_req->device = di->xd_device; + + s = 0; + ring_req->nr_segments = 0; + rq_for_each_bio(bio, req) { + bio_for_each_segment(bvec, bio, idx) { + buffer_ma = page_to_phys(bvec->bv_page); + if (unlikely((buffer_ma & ((1<<9)-1)) != 0)) + BUG(); + + fsect = bvec->bv_offset >> 9; + lsect = fsect + (bvec->bv_len >> 9) - 1; + if (unlikely(lsect > 7)) + BUG(); + + ring_req->frame_and_sects[ring_req->nr_segments++] = + buffer_ma | (fsect << 3) | lsect; + s += bvec->bv_len >> 9; + } + } + + req_prod++; + + /* Keep a private copy so we can reissue requests when recovering. 
*/ + translate_req_to_pfn( &rec_ring[id], ring_req); + + return 0; +} + + +/* + * do_blkif_request + * read a block; request is in a request queue + */ +void do_blkif_request(request_queue_t *rq) +{ + struct request *req; + int queued; + + DPRINTK("Entered do_blkif_request\n"); + + queued = 0; + + while ((req = elv_next_request(rq)) != NULL) { + if (!blk_fs_request(req)) { + end_request(req, 0); + continue; + } + + if ( BLKIF_RING_FULL ) + { + blk_stop_queue(rq); + break; + } + DPRINTK("do_blk_req %p: cmd %p, sec %lx, (%u/%li) buffer:%p [%s]\n", + req, req->cmd, req->sector, req->current_nr_sectors, + req->nr_sectors, req->buffer, + rq_data_dir(req) ? "write" : "read"); + blkdev_dequeue_request(req); + if (blkif_queue_request(req)) { + blk_stop_queue(rq); + break; + } + queued++; + } + + if (queued != 0) + flush_requests(); +} + + +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + struct request *req; + blkif_response_t *bret; + BLKIF_RING_IDX i, rp; + unsigned long flags; + + spin_lock_irqsave(&blkif_io_lock, flags); + + if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) || + unlikely(recovery) ) + { + spin_unlock_irqrestore(&blkif_io_lock, flags); + return IRQ_HANDLED; + } + + rp = blk_ring->resp_prod; + rmb(); /* Ensure we see queued responses up to 'rp'. 
*/ + + for ( i = resp_cons; i != rp; i++ ) + { + unsigned long id; + bret = &blk_ring->ring[MASK_BLKIF_IDX(i)].resp; + + id = bret->id; + req = (struct request *)rec_ring[id].id; + + blkif_completion( &rec_ring[id] ); + + ADD_ID_TO_FREELIST(id); /* overwrites req */ + + switch ( bret->operation ) + { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + if ( unlikely(bret->status != BLKIF_RSP_OKAY) ) + DPRINTK("Bad return from blkdev data request: %x\n", + bret->status); + + if ( unlikely(end_that_request_first + (req, + (bret->status == BLKIF_RSP_OKAY), + req->hard_nr_sectors)) ) + BUG(); + end_that_request_last(req); + + break; + case BLKIF_OP_PROBE: + memcpy(&blkif_control_rsp, bret, sizeof(*bret)); + blkif_control_rsp_valid = 1; + break; + default: + BUG(); + } + } + + resp_cons = i; + + kick_pending_request_queues(); + + spin_unlock_irqrestore(&blkif_io_lock, flags); + + return IRQ_HANDLED; +} + +#else +/************************** KERNEL VERSION 2.4 **************************/ + +static kdev_t sg_dev; +static unsigned long sg_next_sect; + +/* + * Request queues with outstanding work, but ring is currently full. + * We need no special lock here, as we always access this with the + * blkif_io_lock held. We only need a small maximum list. + */ +#define MAX_PENDING 8 +static request_queue_t *pending_queues[MAX_PENDING]; +static int nr_pending; + + +#define blkif_io_lock io_request_lock + +/*============================================================================*/ +#if ENABLE_VBD_UPDATE + +/* + * blkif_update_int/update-vbds_task - handle VBD update events. + * Schedule a task for keventd to run, which will update the VBDs and perform + * the corresponding updates to our view of VBD state. 
+ */
+static void update_vbds_task(void *unused)
+{
+    xlvbd_update_vbds();
+}
+/* Queue update_vbds_task() for deferred execution through schedule_task(). */
+static void vbd_update(void)
+{
+    static struct tq_struct update_tq;
+    update_tq.routine = update_vbds_task;
+    schedule_task(&update_tq);
+}
+
+#endif /* ENABLE_VBD_UPDATE */
+/*============================================================================*/
+
+/*
+ * Re-run do_blkif_request() on the queues parked in pending_queues[], most
+ * recently parked first.  Only starts once less than half of the ring is in
+ * use and stops as soon as BLKIF_RING_FULL holds again.
+ */
+static void kick_pending_request_queues(void)
+{
+    /* We kick pending request queues if the ring is reasonably empty. */
+    if ( (nr_pending != 0) &&
+         ((req_prod - resp_cons) < (BLKIF_RING_SIZE >> 1)) )
+    {
+        /* Attempt to drain the queue, but bail if the ring becomes full. */
+        while ( (nr_pending != 0) && !BLKIF_RING_FULL )
+            do_blkif_request(pending_queues[--nr_pending]);
+    }
+}
+/*
+ * Open handler (2.4).  A partition with zero sectors is refused with
+ * -ENXIO, -ENOMEDIUM or -ENODEV, chosen by the heuristics below; otherwise
+ * the open is counted in disk->usage.
+ */
+int blkif_open(struct inode *inode, struct file *filep)
+{
+    short xldev = inode->i_rdev;
+    struct gendisk *gd = get_gendisk(xldev);
+    xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
+    short minor = MINOR(xldev);
+
+    if ( gd->part[minor].nr_sects == 0 )
+    {
+        /*
+         * Device either doesn't exist, or has zero capacity; we use a few
+         * cheesy heuristics to return the relevant error code
+         */
+        if ( (gd->sizes[minor >> gd->minor_shift] != 0) ||
+             ((minor & (gd->max_p - 1)) != 0) )
+        {
+            /*
+             * We have a real device, but no such partition, or we just have a
+             * partition number so guess this is the problem.
+             */
+            return -ENXIO; /* no such device or address */
+        }
+        else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE )
+        {
+            /* This is a removable device => assume that media is missing. */
+            return -ENOMEDIUM; /* media not present (this is a guess) */
+        }
+        else
+        {
+            /* Just go for the general 'no such device' error. */
+            return -ENODEV; /* no such device */
+        }
+    }
+
+    /* Update of usage count is protected by per-device semaphore.
*/ + disk->usage++; + + return 0; +} + + +int blkif_release(struct inode *inode, struct file *filep) +{ + xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev); + + /* + * When usage drops to zero it may allow more VBD updates to occur. + * Update of usage count is protected by a per-device semaphore. + */ + if ( --disk->usage == 0 ) { + vbd_update(); + } + + return 0; +} + + +int blkif_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument) +{ + kdev_t dev = inode->i_rdev; + struct hd_geometry *geo = (struct hd_geometry *)argument; + struct gendisk *gd; + struct hd_struct *part; + int i; + unsigned short cylinders; + byte heads, sectors; + + /* NB. No need to check permissions. That is done for us. */ + + DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", + command, (long) argument, dev); + + gd = get_gendisk(dev); + part = &gd->part[MINOR(dev)]; + + switch ( command ) + { + case BLKGETSIZE: + DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects); + return put_user(part->nr_sects, (unsigned long *) argument); + + case BLKGETSIZE64: + DPRINTK_IOCTL(" BLKGETSIZE64: %x %llx\n", BLKGETSIZE64, + (u64)part->nr_sects * 512); + return put_user((u64)part->nr_sects * 512, (u64 *) argument); + + case BLKRRPART: /* re-read partition table */ + DPRINTK_IOCTL(" BLKRRPART: %x\n", BLKRRPART); + return blkif_revalidate(dev); + + case BLKSSZGET: + return hardsect_size[MAJOR(dev)][MINOR(dev)]; + + case BLKBSZGET: /* get block size */ + DPRINTK_IOCTL(" BLKBSZGET: %x\n", BLKBSZGET); + break; + + case BLKBSZSET: /* set block size */ + DPRINTK_IOCTL(" BLKBSZSET: %x\n", BLKBSZSET); + break; + + case BLKRASET: /* set read-ahead */ + DPRINTK_IOCTL(" BLKRASET: %x\n", BLKRASET); + break; + + case BLKRAGET: /* get read-ahead */ + DPRINTK_IOCTL(" BLKRAFET: %x\n", BLKRAGET); + break; + + case HDIO_GETGEO: + DPRINTK_IOCTL(" HDIO_GETGEO: %x\n", HDIO_GETGEO); + if (!argument) return -EINVAL; + + /* We don't have real geometry info, but let's 
at least return + values consistent with the size of the device */ + + heads = 0xff; + sectors = 0x3f; + cylinders = part->nr_sects / (heads * sectors); + + if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT; + if (put_user(heads, (byte *)&geo->heads)) return -EFAULT; + if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT; + if (put_user(cylinders, (unsigned short *)&geo->cylinders)) return -EFAULT; + + return 0; + + case HDIO_GETGEO_BIG: + DPRINTK_IOCTL(" HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG); + if (!argument) return -EINVAL; + + /* We don't have real geometry info, but let's at least return + values consistent with the size of the device */ + + heads = 0xff; + sectors = 0x3f; + cylinders = part->nr_sects / (heads * sectors); + + if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT; + if (put_user(heads, (byte *)&geo->heads)) return -EFAULT; + if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT; + if (put_user(cylinders, (unsigned int *) &geo->cylinders)) return -EFAULT; + + return 0; + + case CDROMMULTISESSION: + DPRINTK("FIXME: support multisession CDs later\n"); + for ( i = 0; i < sizeof(struct cdrom_multisession); i++ ) + if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT; + return 0; + + case SCSI_IOCTL_GET_BUS_NUMBER: + DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif"); + return -ENOSYS; + + default: + printk(KERN_ALERT "ioctl %08x not supported by XL blkif\n", command); + return -ENOSYS; + } + + return 0; +} + + + +/* check media change: should probably do something here in some cases :-) */ +int blkif_check(kdev_t dev) +{ + DPRINTK("blkif_check\n"); + return 0; +} + +int blkif_revalidate(kdev_t dev) +{ + struct block_device *bd; + struct gendisk *gd; + xl_disk_t *disk; + unsigned long capacity; + int i, rc = 0; + + if ( (bd = bdget(dev)) == NULL ) + return -EINVAL; + + /* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. 
+     */
+    down(&bd->bd_sem);
+
+    if ( ((gd = get_gendisk(dev)) == NULL) ||
+         ((disk = xldev_to_xldisk(dev)) == NULL) ||
+         ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
+    {
+        rc = -EINVAL;
+        goto out;
+    }
+
+    if ( disk->usage > 1 )
+    {
+        rc = -EBUSY;
+        goto out;
+    }
+
+    /* Only reread partition table if VBDs aren't mapped to partitions. */
+    if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
+    {
+        for ( i = gd->max_p - 1; i >= 0; i-- )
+        {
+            invalidate_device(dev+i, 1);
+            gd->part[MINOR(dev+i)].start_sect = 0;
+            gd->part[MINOR(dev+i)].nr_sects = 0;
+            gd->sizes[MINOR(dev+i)] = 0;
+        }
+
+        grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
+    }
+
+ out:
+    up(&bd->bd_sem);
+    bdput(bd);
+    return rc;
+}
+
+
+/*
+ * blkif_queue_request
+ *
+ * request block io
+ *
+ * id: for guest use only.  do_blkif_request() passes the buffer_head
+ *     pointer; it is stored in the shadow ring entry of the request.
+ * operation: BLKIF_OP_{READ,WRITE,PROBE}
+ *     (only READ and WRITE are handled here; anything else panics)
+ * buffer: buffer to read/write into. this should be a
+ *         virtual address in the guest os.
+ * sector_number: first sector, relative to the start of the partition.
+ * nr_sectors: length of the buffer in 512-byte sectors.
+ * device: device number the buffer belongs to.
+ *
+ * If the buffer continues exactly where the previously queued one ended
+ * (same operation, same device, next sector) it is appended as a further
+ * segment to the most recent ring request; otherwise a new ring request is
+ * started.  Returns 0 on success, 1 if the interface is not connected or
+ * the ring is full.
+ */
+static int blkif_queue_request(unsigned long id,
+                               int operation,
+                               char * buffer,
+                               unsigned long sector_number,
+                               unsigned short nr_sectors,
+                               kdev_t device)
+{
+    unsigned long buffer_ma = virt_to_bus(buffer);
+    unsigned long xid;
+    struct gendisk *gd;
+    blkif_request_t *req;
+    struct buffer_head *bh;
+    unsigned int fsect, lsect;
+
+    fsect = (buffer_ma & ~PAGE_MASK) >> 9;
+    lsect = fsect + nr_sectors - 1;
+
+    /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */
+    if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
+        BUG();
+    if ( lsect > 7 )
+        BUG();
+
+    buffer_ma &= PAGE_MASK;
+
+    if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
+        return 1;
+
+    switch ( operation )
+    {
+
+    case BLKIF_OP_READ:
+    case BLKIF_OP_WRITE:
+        gd = get_gendisk(device);
+
+        /*
+         * Update the sector_number we'll pass down as appropriate; note that
+         * we could sanity check that resulting sector will be in this
+         * partition, but this will happen in driver backend anyhow.
+         */
+        sector_number += gd->part[MINOR(device)].start_sect;
+
+        /*
+         * If this unit doesn't consist of virtual partitions then we clear
+         * the partn bits from the device number.
+         */
+        if ( !(gd->flags[MINOR(device)>>gd->minor_shift] &
+               GENHD_FL_VIRT_PARTNS) )
+            device &= ~(gd->max_p - 1);
+
+        if ( (sg_operation == operation) &&
+             (sg_dev == device) &&
+             (sg_next_sect == sector_number) )
+        {
+            /* Contiguous with the previous buffer: extend the last request. */
+            req = &blk_ring->ring[MASK_BLKIF_IDX(req_prod-1)].req;
+            bh = (struct buffer_head *)id;
+
+            /* Chain the buffer heads of the request through b_reqnext. */
+            bh->b_reqnext = (struct buffer_head *)rec_ring[req->id].id;
+
+
+            rec_ring[req->id].id = id;
+
+            req->frame_and_sects[req->nr_segments] =
+                buffer_ma | (fsect<<3) | lsect;
+            if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST )
+                sg_next_sect += nr_sectors;
+            else
+                DISABLE_SCATTERGATHER();
+
+            /* Update the copy of the request in the recovery ring. */
+            translate_req_to_pfn(&rec_ring[req->id], req );
+
+            return 0;
+        }
+        else if ( BLKIF_RING_FULL )
+        {
+            return 1;
+        }
+        else
+        {
+            sg_operation = operation;
+            sg_dev = device;
+            sg_next_sect = sector_number + nr_sectors;
+        }
+        break;
+
+    default:
+        panic("unknown op %d\n", operation);
+    }
+
+    /* Fill out a communications ring structure. */
+    req = &blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req;
+
+    xid = GET_ID_FROM_FREELIST();
+    rec_ring[xid].id = id;
+
+    req->id = xid;
+    req->operation = operation;
+    req->sector_number = (blkif_sector_t)sector_number;
+    req->device = device;
+    req->nr_segments = 1;
+    req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect;
+
+    req_prod++;
+
+    /* Keep a private copy so we can reissue requests when recovering. */
+    translate_req_to_pfn(&rec_ring[xid], req );
+
+    return 0;
+}
+
+
+/*
+ * do_blkif_request
+ *  read a block; request is in a request queue
+ *
+ * Request function of the 2.4 queue: feeds the buffer heads of each request
+ * to blkif_queue_request() one at a time.  When that reports "full" the
+ * queue is parked in pending_queues[] and processing stops.
+ */
+void do_blkif_request(request_queue_t *rq)
+{
+    struct request *req;
+    struct buffer_head *bh, *next_bh;
+    int rw, nsect, full, queued = 0;
+
+    DPRINTK("Entered do_blkif_request\n");
+
+    while ( !rq->plugged && !list_empty(&rq->queue_head))
+    {
+        if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL )
+            goto out;
+
+        DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n",
+                req, req->cmd, req->sector,
+                req->current_nr_sectors, req->nr_sectors, req->bh);
+
+        rw = req->cmd;
+        if ( rw == READA )
+            rw = READ;
+        if ( unlikely((rw != READ) && (rw != WRITE)) )
+            panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw);
+
+        req->errors = 0;
+
+        bh = req->bh;
+        while ( bh != NULL )
+        {
+            next_bh = bh->b_reqnext;
+            bh->b_reqnext = NULL;
+
+            full = blkif_queue_request(
+                (unsigned long)bh,
+                (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE,
+                bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);
+
+            if ( full )
+            {
+                /* Put the buffer head back and retry the queue later. */
+                bh->b_reqnext = next_bh;
+                pending_queues[nr_pending++] = rq;
+                if ( unlikely(nr_pending >= MAX_PENDING) )
+                    BUG();
+                goto out;
+            }
+
+            queued++;
+
+            /* Dequeue the buffer head from the request. */
+            nsect = bh->b_size >> 9;
+            bh = req->bh = next_bh;
+
+            if ( bh != NULL )
+            {
+                /* There's another buffer head to do. Update the request. */
+                req->hard_sector += nsect;
+                req->hard_nr_sectors -= nsect;
+                req->sector = req->hard_sector;
+                req->nr_sectors = req->hard_nr_sectors;
+                req->current_nr_sectors = bh->b_size >> 9;
+                req->buffer = bh->b_data;
+            }
+            else
+            {
+                /* That was the last buffer head. Finalise the request.
*/
+                if ( unlikely(end_that_request_first(req, 1, "XenBlk")) )
+                    BUG();
+                blkdev_dequeue_request(req);
+                end_that_request_last(req);
+            }
+        }
+    }
+
+ out:
+    if ( queued != 0 )
+        flush_requests();
+}
+
+/*
+ * Interrupt handler (2.4): consume the responses between resp_cons and the
+ * ring's resp_prod.  For each one the shadow slot is handed to
+ * blkif_completion() and returned to the free list; for read/write
+ * responses the chain of buffer heads stored in the slot is completed
+ * through b_end_io, and probe responses are stored for blkif_control_send().
+ * Does nothing while the interface is closed or a recovery is in progress.
+ */
+static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
+{
+    BLKIF_RING_IDX i, rp;
+    unsigned long flags;
+    struct buffer_head *bh, *next_bh;
+
+    spin_lock_irqsave(&io_request_lock, flags);
+
+    if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) )
+    {
+        spin_unlock_irqrestore(&io_request_lock, flags);
+        return;
+    }
+
+    rp = blk_ring->resp_prod;
+    rmb(); /* Ensure we see queued responses up to 'rp'. */
+
+    for ( i = resp_cons; i != rp; i++ )
+    {
+        unsigned long id;
+        blkif_response_t *bret = &blk_ring->ring[MASK_BLKIF_IDX(i)].resp;
+
+        id = bret->id;
+        bh = (struct buffer_head *)rec_ring[id].id; /* read before slot reuse */
+
+        blkif_completion( &rec_ring[id] );
+
+        ADD_ID_TO_FREELIST(id);
+
+        switch ( bret->operation )
+        {
+        case BLKIF_OP_READ:
+        case BLKIF_OP_WRITE:
+            if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
+                DPRINTK("Bad return from blkdev data request: %lx\n",
+                        bret->status);
+            for ( ; bh != NULL; bh = next_bh )
+            {
+                next_bh = bh->b_reqnext;
+                bh->b_reqnext = NULL;
+                bh->b_end_io(bh, bret->status == BLKIF_RSP_OKAY);
+            }
+
+            break;
+        case BLKIF_OP_PROBE:
+            memcpy(&blkif_control_rsp, bret, sizeof(*bret));
+            blkif_control_rsp_valid = 1;
+            break;
+        default:
+            BUG();
+        }
+    }
+
+    resp_cons = i;
+
+    kick_pending_request_queues();
+
+    spin_unlock_irqrestore(&io_request_lock, flags);
+}
+
+#endif
+
+/***************************** COMMON CODE *******************************/
+
+/*
+ * Send the control request 'req' over the ring and wait for its response,
+ * which is copied to 'rsp'.  Sleeps in one-tick steps, first until the ring
+ * has room and then until blkif_control_rsp_valid is set by blkif_int().
+ */
+void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp)
+{
+    unsigned long flags, id;
+
+ retry:
+    while ( (req_prod - resp_cons) == BLKIF_RING_SIZE )
+    {
+        set_current_state(TASK_INTERRUPTIBLE);
+        schedule_timeout(1);
+    }
+
+    spin_lock_irqsave(&blkif_io_lock, flags);
+    /* Re-check under the lock: the ring may have filled up again. */
+    if ( (req_prod - resp_cons) == BLKIF_RING_SIZE )
+    {
+        spin_unlock_irqrestore(&blkif_io_lock, flags);
+        goto retry;
+    }
+
+    DISABLE_SCATTERGATHER();
+    blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req = *req;
+
+    id = GET_ID_FROM_FREELIST();
+    blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req.id = id;
+    rec_ring[id].id = (unsigned long) req;
+
+    translate_req_to_pfn( &rec_ring[id], req );
+
+    req_prod++;
+    flush_requests();
+
+    spin_unlock_irqrestore(&blkif_io_lock, flags);
+
+    while ( !blkif_control_rsp_valid )
+    {
+        set_current_state(TASK_INTERRUPTIBLE);
+        schedule_timeout(1);
+    }
+
+    memcpy(rsp, &blkif_control_rsp, sizeof(*rsp));
+    blkif_control_rsp_valid = 0;
+}
+
+
+/* Send a driver status notification to the domain controller. */
+static void send_driver_status(int ok)
+{
+    ctrl_msg_t cmsg = {
+        .type    = CMSG_BLKIF_FE,
+        .subtype = CMSG_BLKIF_FE_DRIVER_STATUS,
+        .length  = sizeof(blkif_fe_driver_status_t),
+    };
+    blkif_fe_driver_status_t *msg = (void*)cmsg.msg;
+
+    msg->status = (ok ? BLKIF_DRIVER_STATUS_UP : BLKIF_DRIVER_STATUS_DOWN);
+
+    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+}
+
+/* Tell the controller to bring up the interface. */
+static void blkif_send_interface_connect(void)
+{
+    ctrl_msg_t cmsg = {
+        .type    = CMSG_BLKIF_FE,
+        .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT,
+        .length  = sizeof(blkif_fe_interface_connect_t),
+    };
+    blkif_fe_interface_connect_t *msg = (void*)cmsg.msg;
+
+    msg->handle      = 0;
+    msg->shmem_frame = (virt_to_machine(blk_ring) >> PAGE_SHIFT); /* frame number of the ring page */
+
+    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+}
+/*
+ * Tear down the current connection: flag a recovery and move to the
+ * DISCONNECTED state under the lock, then release the ring page, the IRQ
+ * and the event-channel binding.
+ */
+static void blkif_free(void)
+{
+    /* Prevent new requests being issued until we fix things up. */
+    spin_lock_irq(&blkif_io_lock);
+    recovery = 1;
+    blkif_state = BLKIF_STATE_DISCONNECTED;
+    spin_unlock_irq(&blkif_io_lock);
+
+    /* Free resources associated with old device channel.
*/ + if ( blk_ring != NULL ) + { + free_page((unsigned long)blk_ring); + blk_ring = NULL; + } + free_irq(blkif_irq, NULL); + blkif_irq = 0; + + unbind_evtchn_from_irq(blkif_evtchn); + blkif_evtchn = 0; +} + +static void blkif_close(void) +{ +} + +/* Move from CLOSED to DISCONNECTED state. */ +static void blkif_disconnect(void) +{ + if ( blk_ring != NULL ) + free_page((unsigned long)blk_ring); + blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL); + blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0; + blkif_state = BLKIF_STATE_DISCONNECTED; + blkif_send_interface_connect(); +} + +static void blkif_reset(void) +{ + blkif_free(); + blkif_disconnect(); +} + +static void blkif_recover(void) +{ + int i; + + /* Hmm, requests might be re-ordered when we re-issue them. + * This will need to be fixed once we have barriers */ + + /* Stage 1 : Find active and move to safety. */ + for ( i = 0; i < BLKIF_RING_SIZE; i++ ) + { + if ( rec_ring[i].id >= PAGE_OFFSET ) + { + translate_req_to_mfn( + &blk_ring->ring[req_prod].req, &rec_ring[i]); + req_prod++; + } + } + + /* Stage 2 : Set up shadow list. */ + for ( i = 0; i < req_prod; i++ ) + { + rec_ring[i].id = blk_ring->ring[i].req.id; + blk_ring->ring[i].req.id = i; + translate_req_to_pfn(&rec_ring[i], &blk_ring->ring[i].req); + } + + /* Stage 3 : Set up free list. */ + for ( ; i < BLKIF_RING_SIZE; i++ ) + rec_ring[i].id = i+1; + rec_ring_free = req_prod; + rec_ring[BLKIF_RING_SIZE-1].id = 0x0fffffff; + + /* blk_ring->req_prod will be set when we flush_requests().*/ + wmb(); + + /* Switch off recovery mode, using a memory barrier to ensure that + * it's seen before we flush requests - we don't want to miss any + * interrupts. */ + recovery = 0; + wmb(); + + /* Kicks things back into life. */ + flush_requests(); + + /* Now safe to left other peope use interface. 
*/ + blkif_state = BLKIF_STATE_CONNECTED; +} + +static void blkif_connect(blkif_fe_interface_status_t *status) +{ + int err = 0; + + blkif_evtchn = status->evtchn; + blkif_irq = bind_evtchn_to_irq(blkif_evtchn); + + err = request_irq(blkif_irq, blkif_int, SA_SAMPLE_RANDOM, "blkif", NULL); + if ( err ) + { + printk(KERN_ALERT "xen_blk: request_irq failed (err=%d)\n", err); + return; + } + + if ( recovery ) + { + blkif_recover(); + } + else + { + /* Transition to connected in case we need to do + * a partition probe on a whole disk. */ + blkif_state = BLKIF_STATE_CONNECTED; + + /* Probe for discs attached to the interface. */ + xlvbd_init(); + } + + /* Kick pending requests. */ + spin_lock_irq(&blkif_io_lock); + kick_pending_request_queues(); + spin_unlock_irq(&blkif_io_lock); +} + +static void unexpected(blkif_fe_interface_status_t *status) +{ + DPRINTK(" Unexpected blkif status %s in state %s\n", + blkif_status_name[status->status], + blkif_state_name[blkif_state]); +} + +static void blkif_status(blkif_fe_interface_status_t *status) +{ + if ( status->handle != blkif_handle ) + { + WPRINTK(" Invalid blkif: handle=%u", status->handle); + return; + } + + switch ( status->status ) + { + case BLKIF_INTERFACE_STATUS_CLOSED: + switch ( blkif_state ) + { + case BLKIF_STATE_CLOSED: + unexpected(status); + break; + case BLKIF_STATE_DISCONNECTED: + case BLKIF_STATE_CONNECTED: + unexpected(status); + blkif_close(); + break; + } + break; + + case BLKIF_INTERFACE_STATUS_DISCONNECTED: + switch ( blkif_state ) + { + case BLKIF_STATE_CLOSED: + blkif_disconnect(); + break; + case BLKIF_STATE_DISCONNECTED: + case BLKIF_STATE_CONNECTED: + /* unexpected(status); */ /* occurs during suspend/resume */ + blkif_reset(); + break; + } + break; + + case BLKIF_INTERFACE_STATUS_CONNECTED: + switch ( blkif_state ) + { + case BLKIF_STATE_CLOSED: + unexpected(status); + blkif_disconnect(); + blkif_connect(status); + break; + case BLKIF_STATE_DISCONNECTED: + blkif_connect(status); + break; + case 
BLKIF_STATE_CONNECTED: + unexpected(status); + blkif_connect(status); + break; + } + break; + + case BLKIF_INTERFACE_STATUS_CHANGED: + switch ( blkif_state ) + { + case BLKIF_STATE_CLOSED: + case BLKIF_STATE_DISCONNECTED: + unexpected(status); + break; + case BLKIF_STATE_CONNECTED: + vbd_update(); + break; + } + break; + + default: + WPRINTK(" Invalid blkif status: %d\n", status->status); + break; + } +} + + +static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + switch ( msg->subtype ) + { + case CMSG_BLKIF_FE_INTERFACE_STATUS: + if ( msg->length != sizeof(blkif_fe_interface_status_t) ) + goto parse_error; + blkif_status((blkif_fe_interface_status_t *) + &msg->msg[0]); + break; + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + msg->length = 0; + ctrl_if_send_response(msg); +} + +int wait_for_blkif(void) +{ + int err = 0; + int i; + send_driver_status(1); + + /* + * We should read 'nr_interfaces' from response message and wait + * for notifications before proceeding. For now we assume that we + * will be notified of exactly one interface. 
+     */
+    for ( i=0; (blkif_state != BLKIF_STATE_CONNECTED) && (i < 10*HZ); i++ )
+    {
+        set_current_state(TASK_INTERRUPTIBLE);
+        schedule_timeout(1);
+    }
+
+    if ( blkif_state != BLKIF_STATE_CONNECTED )
+    {
+        printk(KERN_INFO "xen_blk: Timeout connecting to device!\n");
+        err = -ENOSYS;
+    }
+    return err;
+}
+/*
+ * Driver initialisation.  Does nothing in a domain flagged SIF_INITDOMAIN
+ * or SIF_BLK_BE_DOMAIN.  Otherwise builds the shadow-ring free list,
+ * registers the control-message receiver and waits for the interface to
+ * connect.  Always returns 0; the result of the wait is ignored.
+ */
+int __init xlblk_init(void)
+{
+    int i;
+
+    if ( (xen_start_info.flags & SIF_INITDOMAIN) ||
+         (xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
+        return 0;
+
+    printk(KERN_INFO "xen_blk: Initialising virtual block device driver\n");
+
+    /* Chain all slots 0 -> 1 -> ... and end the list with a marker value. */
+    rec_ring_free = 0;
+    for ( i = 0; i < BLKIF_RING_SIZE; i++ )
+        rec_ring[i].id = i+1;
+    rec_ring[BLKIF_RING_SIZE-1].id = 0x0fffffff;
+
+    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
+                                    CALLBACK_IN_BLOCKING_CONTEXT);
+
+    wait_for_blkif();
+
+    return 0;
+}
+/* Suspend hook: nothing to do here. */
+void blkdev_suspend(void)
+{
+}
+/* Resume hook: announce the driver to the domain controller again. */
+void blkdev_resume(void)
+{
+    send_driver_status(1);
+}
+
+/* XXXXX THIS IS A TEMPORARY FUNCTION UNTIL WE GET GRANT TABLES */
+/*
+ * Called for every response with the shadow copy of its request.  For
+ * reads, each segment's frame number is taken from frame_and_sects[], its
+ * machine frame is looked up in phys_to_machine_mapping[] and the pair is
+ * passed to xen_machphys_update().  Other operations need no work.
+ */
+void blkif_completion(blkif_request_t *req)
+{
+    int i;
+
+    switch ( req->operation )
+    {
+    case BLKIF_OP_READ:
+        for ( i = 0; i < req->nr_segments; i++ )
+        {
+            unsigned long pfn = req->frame_and_sects[i] >> PAGE_SHIFT;
+            unsigned long mfn = phys_to_machine_mapping[pfn];
+            xen_machphys_update(mfn, pfn);
+        }
+        break;
+    }
+
+}
diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/blkfront/block.h tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkfront/block.h
--- pristine-linux-2.6.10-rc3/drivers/xen/blkfront/block.h 1970-01-01 01:00:00.000000000 +0100
+++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkfront/block.h 2004-12-08 00:52:40.000000000 +0000
@@ -0,0 +1,113 @@
+/******************************************************************************
+ * block.h
+ *
+ * Shared definitions between all levels of XenLinux Virtual block devices.
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A.
Williamson are (c) Intel Research Cambridge + * Copyright (c) 2004, Christian Limpach + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __XEN_DRIVERS_BLOCK_H__ +#define __XEN_DRIVERS_BLOCK_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define DPRINTK(_f, _a...) printk ( KERN_ALERT _f , ## _a ) +#else +#define DPRINTK(_f, _a...) ((void)0) +#endif + +#if 0 +#define DPRINTK_IOCTL(_f, _a...) printk ( KERN_ALERT _f , ## _a ) +#else +#define DPRINTK_IOCTL(_f, _a...) 
((void)0) +#endif + +struct xlbd_type_info { + int partn_shift; + int partn_per_major; + int devs_per_major; + int hardsect_size; + int max_sectors; + char *name; +}; + +/* + * We have one of these per vbd, whether ide, scsi or 'other'. They + * hang in private_data off the gendisk structure. We may end up + * putting all kinds of interesting stuff here :-) + */ +struct xlbd_major_info { + int major; + int index; + int usage; + struct xlbd_type_info *type; +}; + +struct xlbd_disk_info { + int xd_device; + struct xlbd_major_info *mi; +}; + +typedef struct xen_block { + int usage; +} xen_block_t; + +extern struct request_queue *xlbd_blk_queue; +extern spinlock_t blkif_io_lock; + +extern int blkif_open(struct inode *inode, struct file *filep); +extern int blkif_release(struct inode *inode, struct file *filep); +extern int blkif_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument); +extern int blkif_check(dev_t dev); +extern int blkif_revalidate(dev_t dev); +extern void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp); +extern void do_blkif_request (request_queue_t *rq); + +extern void xlvbd_update_vbds(void); + +/* Virtual block-device subsystem. */ +extern int xlvbd_init(void); +extern void xlvbd_cleanup(void); + +#endif /* __XEN_DRIVERS_BLOCK_H__ */ diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/blkfront/vbd.c tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkfront/vbd.c --- pristine-linux-2.6.10-rc3/drivers/xen/blkfront/vbd.c 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/blkfront/vbd.c 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,561 @@ +/****************************************************************************** + * vbd.c + * + * XenLinux virtual block-device driver (xvd). + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. 
Williamson are (c) Intel Research Cambridge + * Copyright (c) 2004, Christian Limpach + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "block.h" +#include + +/* + * For convenience we distinguish between ide, scsi and 'other' (i.e. + * potentially combinations of the two) in the naming scheme and in a few + * other places (like default readahead, etc). 
+ */ + +#define NUM_IDE_MAJORS 10 +#define NUM_SCSI_MAJORS 9 +#define NUM_VBD_MAJORS 1 + +static struct xlbd_type_info xlbd_ide_type = { + .partn_shift = 6, + .partn_per_major = 2, + // XXXcl todo blksize_size[major] = 1024; + .hardsect_size = 512, + .max_sectors = 128, /* 'hwif->rqsize' if we knew it */ + // XXXcl todo read_ahead[major] = 8; /* from drivers/ide/ide-probe.c */ + .name = "hd", +}; + +static struct xlbd_type_info xlbd_scsi_type = { + .partn_shift = 4, + .partn_per_major = 16, + // XXXcl todo blksize_size[major] = 1024; /* XXX 512; */ + .hardsect_size = 512, + .max_sectors = 128*8, /* XXX 128; */ + // XXXcl todo read_ahead[major] = 0; /* XXX 8; -- guessing */ + .name = "sd", +}; + +static struct xlbd_type_info xlbd_vbd_type = { + .partn_shift = 4, + .partn_per_major = 16, + // XXXcl todo blksize_size[major] = 512; + .hardsect_size = 512, + .max_sectors = 128, + // XXXcl todo read_ahead[major] = 8; + .name = "xvd", +}; + +/* XXXcl handle cciss after finding out why it's "hacked" in */ + +static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS + + NUM_VBD_MAJORS]; + +/* Information about our VBDs. 
*/ +#define MAX_VBDS 64 +static int nr_vbds; +static vdisk_t *vbd_info; + +struct request_queue *xlbd_blk_queue = NULL; + +#define MAJOR_XEN(dev) ((dev)>>8) +#define MINOR_XEN(dev) ((dev) & 0xff) + +static struct block_device_operations xlvbd_block_fops = +{ + .owner = THIS_MODULE, + .open = blkif_open, + .release = blkif_release, + .ioctl = blkif_ioctl, +#if 0 + check_media_change: blkif_check, + revalidate: blkif_revalidate, +#endif +}; + +spinlock_t blkif_io_lock = SPIN_LOCK_UNLOCKED; + +static int xlvbd_get_vbd_info(vdisk_t *disk_info) +{ + vdisk_t *buf = (vdisk_t *)__get_free_page(GFP_KERNEL); + blkif_request_t req; + blkif_response_t rsp; + int nr; + + memset(&req, 0, sizeof(req)); + req.operation = BLKIF_OP_PROBE; + req.nr_segments = 1; + req.frame_and_sects[0] = virt_to_machine(buf) | 7; + + blkif_control_send(&req, &rsp); + + if ( rsp.status <= 0 ) + { + printk(KERN_ALERT "Could not probe disks (%d)\n", rsp.status); + return -1; + } + + if ( (nr = rsp.status) > MAX_VBDS ) + nr = MAX_VBDS; + memcpy(disk_info, buf, nr * sizeof(vdisk_t)); + + free_page((unsigned long)buf); + + return nr; +} + +static struct xlbd_major_info *xlbd_get_major_info(int xd_device, int *minor) +{ + int mi_idx, new_major; + int xd_major = MAJOR_XEN(xd_device); + int xd_minor = MINOR_XEN(xd_device); + + *minor = xd_minor; + + switch (xd_major) { + case IDE0_MAJOR: mi_idx = 0; new_major = IDE0_MAJOR; break; + case IDE1_MAJOR: mi_idx = 1; new_major = IDE1_MAJOR; break; + case IDE2_MAJOR: mi_idx = 2; new_major = IDE2_MAJOR; break; + case IDE3_MAJOR: mi_idx = 3; new_major = IDE3_MAJOR; break; + case IDE4_MAJOR: mi_idx = 4; new_major = IDE4_MAJOR; break; + case IDE5_MAJOR: mi_idx = 5; new_major = IDE5_MAJOR; break; + case IDE6_MAJOR: mi_idx = 6; new_major = IDE6_MAJOR; break; + case IDE7_MAJOR: mi_idx = 7; new_major = IDE7_MAJOR; break; + case IDE8_MAJOR: mi_idx = 8; new_major = IDE8_MAJOR; break; + case IDE9_MAJOR: mi_idx = 9; new_major = IDE9_MAJOR; break; + case SCSI_DISK0_MAJOR: mi_idx 
= 10; new_major = SCSI_DISK0_MAJOR; break; + case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR: + mi_idx = 11 + xd_major - SCSI_DISK1_MAJOR; + new_major = SCSI_DISK1_MAJOR + xd_major - SCSI_DISK1_MAJOR; + break; + case SCSI_CDROM_MAJOR: mi_idx = 18; new_major = SCSI_CDROM_MAJOR; break; + default: mi_idx = 19; new_major = 0;/* XXXcl notyet */ break; + } + + if (major_info[mi_idx]) + return major_info[mi_idx]; + + major_info[mi_idx] = kmalloc(sizeof(struct xlbd_major_info), GFP_KERNEL); + if (major_info[mi_idx] == NULL) + return NULL; + + memset(major_info[mi_idx], 0, sizeof(struct xlbd_major_info)); + + switch (mi_idx) { + case 0 ... (NUM_IDE_MAJORS - 1): + major_info[mi_idx]->type = &xlbd_ide_type; + major_info[mi_idx]->index = mi_idx; + break; + case NUM_IDE_MAJORS ... (NUM_IDE_MAJORS + NUM_SCSI_MAJORS - 1): + major_info[mi_idx]->type = &xlbd_scsi_type; + major_info[mi_idx]->index = mi_idx - NUM_IDE_MAJORS; + break; + case (NUM_IDE_MAJORS + NUM_SCSI_MAJORS) ... + (NUM_IDE_MAJORS + NUM_SCSI_MAJORS + NUM_VBD_MAJORS - 1): + major_info[mi_idx]->type = &xlbd_vbd_type; + major_info[mi_idx]->index = mi_idx - + (NUM_IDE_MAJORS + NUM_SCSI_MAJORS); + break; + } + major_info[mi_idx]->major = new_major; + + if (register_blkdev(major_info[mi_idx]->major, major_info[mi_idx]->type->name)) { + printk(KERN_ALERT "XL VBD: can't get major %d with name %s\n", + major_info[mi_idx]->major, major_info[mi_idx]->type->name); + goto out; + } + + devfs_mk_dir(major_info[mi_idx]->type->name); + + return major_info[mi_idx]; + + out: + kfree(major_info[mi_idx]); + major_info[mi_idx] = NULL; + return NULL; +} + +static struct gendisk *xlvbd_get_gendisk(struct xlbd_major_info *mi, + int xd_minor, vdisk_t *xd) +{ + struct gendisk *gd; + struct xlbd_disk_info *di; + int device, partno; + + device = MKDEV(mi->major, xd_minor); + gd = get_gendisk(device, &partno); + if (gd) + return gd; + + di = kmalloc(sizeof(struct xlbd_disk_info), GFP_KERNEL); + if (di == NULL) + return NULL; + di->mi = mi; + 
di->xd_device = xd->device; + + /* Construct an appropriate gendisk structure. */ + gd = alloc_disk(1); + if (gd == NULL) + goto out; + + gd->major = mi->major; + gd->first_minor = xd_minor; + gd->fops = &xlvbd_block_fops; + gd->private_data = di; + sprintf(gd->disk_name, "%s%c%d", mi->type->name, + 'a' + mi->index * mi->type->partn_per_major + + (xd_minor >> mi->type->partn_shift), + xd_minor & ((1 << mi->type->partn_shift) - 1)); + /* sprintf(gd->devfs_name, "%s%s/disc%d", mi->type->name, , ); XXXdevfs */ + + set_capacity(gd, xd->capacity); + + if (xlbd_blk_queue == NULL) { + xlbd_blk_queue = blk_init_queue(do_blkif_request, + &blkif_io_lock); + if (xlbd_blk_queue == NULL) + goto out; + elevator_init(xlbd_blk_queue, "noop"); + + /* + * Turn off barking 'headactive' mode. We dequeue + * buffer heads as soon as we pass them to back-end + * driver. + */ + blk_queue_headactive(xlbd_blk_queue, 0); /* XXXcl: noop according to blkdev.h */ + + blk_queue_hardsect_size(xlbd_blk_queue, + mi->type->hardsect_size); + blk_queue_max_sectors(xlbd_blk_queue, mi->type->max_sectors); /* 'hwif->rqsize' if we knew it */ + + /* XXXcl: set mask to PAGE_SIZE for now, to improve either use + - blk_queue_merge_bvec to merge requests with adjacent ma's + - the tags infrastructure + - the dma infrastructure + */ + blk_queue_segment_boundary(xlbd_blk_queue, PAGE_SIZE - 1); + + blk_queue_max_phys_segments(xlbd_blk_queue, + BLKIF_MAX_SEGMENTS_PER_REQUEST); + blk_queue_max_hw_segments(xlbd_blk_queue, + BLKIF_MAX_SEGMENTS_PER_REQUEST); /* XXXcl not needed? */ + + + } + gd->queue = xlbd_blk_queue; + + add_disk(gd); + + return gd; + + out: + if (gd) + del_gendisk(gd); + kfree(di); + return NULL; +} + +/* + * xlvbd_init_device - initialise a VBD device + * @disk: a vdisk_t describing the VBD + * + * Takes a vdisk_t * that describes a VBD the domain has access to. + * Performs appropriate initialisation and registration of the device. 
+ * + * Care needs to be taken when making re-entrant calls to ensure that + * corruption does not occur. Also, devices that are in use should not have + * their details updated. This is the caller's responsibility. + */ +static int xlvbd_init_device(vdisk_t *xd) +{ + struct block_device *bd; + struct gendisk *gd; + struct xlbd_major_info *mi; + int device; + int minor; + + int err = -ENOMEM; + + mi = xlbd_get_major_info(xd->device, &minor); + if (mi == NULL) + return -EPERM; + + device = MKDEV(mi->major, minor); + + if ((bd = bdget(device)) == NULL) + return -EPERM; + + /* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. + */ + down(&bd->bd_sem); + + gd = xlvbd_get_gendisk(mi, minor, xd); + if (mi == NULL) { + err = -EPERM; + goto out; + } + + if (VDISK_READONLY(xd->info)) + set_disk_ro(gd, 1); + + /* Some final fix-ups depending on the device type */ + switch (VDISK_TYPE(xd->info)) { + case VDISK_TYPE_CDROM: + gd->flags |= GENHD_FL_REMOVABLE | GENHD_FL_CD; + /* FALLTHROUGH */ + case VDISK_TYPE_FLOPPY: + case VDISK_TYPE_TAPE: + gd->flags |= GENHD_FL_REMOVABLE; + break; + + case VDISK_TYPE_DISK: + break; + + default: + printk(KERN_ALERT "XenLinux: unknown device type %d\n", + VDISK_TYPE(xd->info)); + break; + } + + err = 0; + out: + up(&bd->bd_sem); + bdput(bd); + return err; +} + +#if 0 +/* + * xlvbd_remove_device - remove a device node if possible + * @device: numeric device ID + * + * Updates the gendisk structure and invalidates devices. + * + * This is OK for now but in future, should perhaps consider where this should + * deallocate gendisks / unregister devices. + */ +static int xlvbd_remove_device(int device) +{ + int i, rc = 0, minor = MINOR(device); + struct gendisk *gd; + struct block_device *bd; + xen_block_t *disk = NULL; + + if ( (bd = bdget(device)) == NULL ) + return -1; + + /* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. 
+ */ + down(&bd->bd_sem); + + if ( ((gd = get_gendisk(device)) == NULL) || + ((disk = xldev_to_xldisk(device)) == NULL) ) + BUG(); + + if ( disk->usage != 0 ) + { + printk(KERN_ALERT "VBD removal failed - in use [dev=%x]\n", device); + rc = -1; + goto out; + } + + if ( (minor & (gd->max_p-1)) != 0 ) + { + /* 1: The VBD is mapped to a partition rather than a whole unit. */ + invalidate_device(device, 1); + gd->part[minor].start_sect = 0; + gd->part[minor].nr_sects = 0; + gd->sizes[minor] = 0; + + /* Clear the consists-of-virtual-partitions flag if possible. */ + gd->flags[minor >> gd->minor_shift] &= ~GENHD_FL_VIRT_PARTNS; + for ( i = 1; i < gd->max_p; i++ ) + if ( gd->sizes[(minor & ~(gd->max_p-1)) + i] != 0 ) + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_VIRT_PARTNS; + + /* + * If all virtual partitions are now gone, and a 'whole unit' VBD is + * present, then we can try to grok the unit's real partition table. + */ + if ( !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) && + (gd->sizes[minor & ~(gd->max_p-1)] != 0) && + !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE) ) + { + register_disk(gd, + device&~(gd->max_p-1), + gd->max_p, + &xlvbd_block_fops, + gd->part[minor&~(gd->max_p-1)].nr_sects); + } + } + else + { + /* + * 2: The VBD is mapped to an entire 'unit'. Clear all partitions. + * NB. The partition entries are only cleared if there are no VBDs + * mapped to individual partitions on this unit. + */ + i = gd->max_p - 1; /* Default: clear subpartitions as well. */ + if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS ) + i = 0; /* 'Virtual' mode: only clear the 'whole unit' entry. */ + while ( i >= 0 ) + { + invalidate_device(device+i, 1); + gd->part[minor+i].start_sect = 0; + gd->part[minor+i].nr_sects = 0; + gd->sizes[minor+i] = 0; + i--; + } + } + + out: + up(&bd->bd_sem); + bdput(bd); + return rc; +} + +/* + * xlvbd_update_vbds - reprobes the VBD status and performs updates driver + * state. 
The VBDs need to be updated in this way when the domain is + * initialised and also each time we receive an XLBLK_UPDATE event. + */ +void xlvbd_update_vbds(void) +{ + int i, j, k, old_nr, new_nr; + vdisk_t *old_info, *new_info, *merged_info; + + old_info = vbd_info; + old_nr = nr_vbds; + + new_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL); + if ( unlikely(new_nr = xlvbd_get_vbd_info(new_info)) < 0 ) + { + kfree(new_info); + return; + } + + /* + * Final list maximum size is old list + new list. This occurs only when + * old list and new list do not overlap at all, and we cannot yet destroy + * VBDs in the old list because the usage counts are busy. + */ + merged_info = kmalloc((old_nr + new_nr) * sizeof(vdisk_t), GFP_KERNEL); + + /* @i tracks old list; @j tracks new list; @k tracks merged list. */ + i = j = k = 0; + + while ( (i < old_nr) && (j < new_nr) ) + { + if ( old_info[i].device < new_info[j].device ) + { + if ( xlvbd_remove_device(old_info[i].device) != 0 ) + memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); + i++; + } + else if ( old_info[i].device > new_info[j].device ) + { + if ( xlvbd_init_device(&new_info[j]) == 0 ) + memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); + j++; + } + else + { + if ( ((old_info[i].capacity == new_info[j].capacity) && + (old_info[i].info == new_info[j].info)) || + (xlvbd_remove_device(old_info[i].device) != 0) ) + memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); + else if ( xlvbd_init_device(&new_info[j]) == 0 ) + memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); + i++; j++; + } + } + + for ( ; i < old_nr; i++ ) + { + if ( xlvbd_remove_device(old_info[i].device) != 0 ) + memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); + } + + for ( ; j < new_nr; j++ ) + { + if ( xlvbd_init_device(&new_info[j]) == 0 ) + memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); + } + + vbd_info = merged_info; + nr_vbds = k; + + kfree(old_info); + kfree(new_info); +} +#endif + +/* + * Set up 
all the linux device goop for the virtual block devices + * (vbd's) that we know about. Note that although from the backend + * driver's p.o.v. VBDs are addressed simply an opaque 16-bit device + * number, the domain creation tools conventionally allocate these + * numbers to correspond to those used by 'real' linux -- this is just + * for convenience as it means e.g. that the same /etc/fstab can be + * used when booting with or without Xen. + */ +int xlvbd_init(void) +{ + int i; + + /* + * If compiled as a module, we don't support unloading yet. We + * therefore permanently increment the reference count to + * disallow it. + */ + /* MOD_INC_USE_COUNT; */ + + memset(major_info, 0, sizeof(major_info)); + + for (i = 0; i < sizeof(major_info) / sizeof(major_info[0]); i++) { + } + + vbd_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL); + nr_vbds = xlvbd_get_vbd_info(vbd_info); + + if (nr_vbds < 0) { + kfree(vbd_info); + vbd_info = NULL; + nr_vbds = 0; + } else { + for (i = 0; i < nr_vbds; i++) + xlvbd_init_device(&vbd_info[i]); + } + + return 0; +} diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/console/Makefile tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/console/Makefile --- pristine-linux-2.6.10-rc3/drivers/xen/console/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/console/Makefile 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,2 @@ + +obj-y := console.o diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/console/console.c tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/console/console.c --- pristine-linux-2.6.10-rc3/drivers/xen/console/console.c 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/console/console.c 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,790 @@ +/****************************************************************************** + * console.c + * + * Virtual console driver. + * + * Copyright (c) 2002-2004, K A Fraser. 
+ * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Modes: + * 'xencons=off' [XC_OFF]: Console is disabled. + * 'xencons=tty' [XC_TTY]: Console attached to '/dev/tty[0-9]+'. + * 'xencons=ttyS' [XC_SERIAL]: Console attached to '/dev/ttyS[0-9]+'. + * [XC_DEFAULT]: DOM0 -> XC_SERIAL ; all others -> XC_TTY. + * + * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses + * warnings from standard distro startup scripts. 
+ */ +static enum { XC_OFF, XC_DEFAULT, XC_TTY, XC_SERIAL } xc_mode = XC_DEFAULT; + +static int __init xencons_setup(char *str) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + if (str[0] == '=') + str++; +#endif + if ( !strcmp(str, "tty") ) + xc_mode = XC_TTY; + else if ( !strcmp(str, "ttyS") ) + xc_mode = XC_SERIAL; + else if ( !strcmp(str, "off") ) + xc_mode = XC_OFF; + return 1; +} +__setup("xencons", xencons_setup); + +/* The kernel and user-land drivers share a common transmit buffer. */ +#define WBUF_SIZE 4096 +#define WBUF_MASK(_i) ((_i)&(WBUF_SIZE-1)) +static char wbuf[WBUF_SIZE]; +static unsigned int wc, wp; /* write_cons, write_prod */ + +/* This lock protects accesses to the common transmit buffer. */ +static spinlock_t xencons_lock = SPIN_LOCK_UNLOCKED; + +/* Common transmit-kick routine. */ +static void __xencons_tx_flush(void); + +/* This task is used to defer sending console data until there is space. */ +static void xencons_tx_flush_task_routine(void *data); + +static DECLARE_TQUEUE(xencons_tx_flush_task, + xencons_tx_flush_task_routine, + NULL); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +static struct tty_driver *xencons_driver; +#else +static struct tty_driver xencons_driver; +#endif + + +/******************** Kernel console driver ********************************/ + +static void kcons_write( + struct console *c, const char *s, unsigned int count) +{ + int i; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + + for ( i = 0; i < count; i++ ) + { + if ( (wp - wc) >= (WBUF_SIZE - 1) ) + break; + if ( (wbuf[WBUF_MASK(wp++)] = s[i]) == '\n' ) + wbuf[WBUF_MASK(wp++)] = '\r'; + } + + __xencons_tx_flush(); + + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void kcons_write_dom0( + struct console *c, const char *s, unsigned int count) +{ + int rc; + + while ( count > 0 ) + { + if ( (rc = HYPERVISOR_console_io(CONSOLEIO_write, + count, (char *)s)) > 0 ) + { + count -= rc; + s += rc; + } + else + break; + } +} + 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +static struct tty_driver *kcons_device(struct console *c, int *index) +{ + *index = c->index; + return xencons_driver; +} +#else +static kdev_t kcons_device(struct console *c) +{ + return MKDEV(TTY_MAJOR, (xc_mode == XC_SERIAL) ? 64 : 1); +} +#endif + +static struct console kcons_info = { + device: kcons_device, + flags: CON_PRINTBUFFER, + index: -1 +}; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +#define __RETCODE 0 +static int __init xen_console_init(void) +#else +#define __RETCODE +void xen_console_init(void) +#endif +{ + if ( xen_start_info.flags & SIF_INITDOMAIN ) + { + if ( xc_mode == XC_DEFAULT ) + xc_mode = XC_SERIAL; + kcons_info.write = kcons_write_dom0; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + if ( xc_mode == XC_SERIAL ) + kcons_info.flags |= CON_ENABLED; +#endif + } + else + { + if ( xc_mode == XC_DEFAULT ) + xc_mode = XC_TTY; + kcons_info.write = kcons_write; + } + + if ( xc_mode == XC_OFF ) + return __RETCODE; + + if ( xc_mode == XC_SERIAL ) + strcpy(kcons_info.name, "ttyS"); + else + strcpy(kcons_info.name, "tty"); + + register_console(&kcons_info); + return __RETCODE; +} +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +console_initcall(xen_console_init); +#endif + +/*** Useful function for console debugging -- goes straight to Xen. ***/ +asmlinkage int xprintk(const char *fmt, ...) +{ + va_list args; + int printk_len; + static char printk_buf[1024]; + + /* Emit the output into the temporary buffer */ + va_start(args, fmt); + printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args); + va_end(args); + + /* Send the processed output directly to Xen. */ + kcons_write_dom0(NULL, printk_buf, printk_len); + + return 0; +} + +/*** Forcibly flush console data before dying. ***/ +void xencons_force_flush(void) +{ + ctrl_msg_t msg; + int sz; + + /* Emergency console is synchronous, so there's nothing to flush. 
*/ + if ( xen_start_info.flags & SIF_INITDOMAIN ) + return; + + /* + * We use dangerous control-interface functions that require a quiescent + * system and no interrupts. Try to ensure this with a global cli(). + */ + cli(); + + /* Spin until console data is flushed through to the domain controller. */ + while ( (wc != wp) && !ctrl_if_transmitter_empty() ) + { + /* Interrupts are disabled -- we must manually reap responses. */ + ctrl_if_discard_responses(); + + if ( (sz = wp - wc) == 0 ) + continue; + if ( sz > sizeof(msg.msg) ) + sz = sizeof(msg.msg); + if ( sz > (WBUF_SIZE - WBUF_MASK(wc)) ) + sz = WBUF_SIZE - WBUF_MASK(wc); + + msg.type = CMSG_CONSOLE; + msg.subtype = CMSG_CONSOLE_DATA; + msg.length = sz; + memcpy(msg.msg, &wbuf[WBUF_MASK(wc)], sz); + + if ( ctrl_if_send_message_noblock(&msg, NULL, 0) == 0 ) + wc += sz; + } +} + + +/******************** User-space console driver (/dev/console) ************/ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +#define DRV(_d) (_d) +#define TTY_INDEX(_tty) ((_tty)->index) +#else +static int xencons_refcount; +static struct tty_struct *xencons_table[MAX_NR_CONSOLES]; +#define DRV(_d) (&(_d)) +#define TTY_INDEX(_tty) (MINOR((_tty)->device) - xencons_driver.minor_start) +#endif + +static struct termios *xencons_termios[MAX_NR_CONSOLES]; +static struct termios *xencons_termios_locked[MAX_NR_CONSOLES]; +static struct tty_struct *xencons_tty; +static int xencons_priv_irq; +static char x_char; + +/* Non-privileged receive callback. */ +static void xencons_rx(ctrl_msg_t *msg, unsigned long id) +{ + int i; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + if ( xencons_tty != NULL ) + { + for ( i = 0; i < msg->length; i++ ) + tty_insert_flip_char(xencons_tty, msg->msg[i], 0); + tty_flip_buffer_push(xencons_tty); + } + spin_unlock_irqrestore(&xencons_lock, flags); + + msg->length = 0; + ctrl_if_send_response(msg); +} + +/* Privileged and non-privileged transmit worker. 
*/
+static void __xencons_tx_flush(void)
+{
+    int sz, work_done = 0;
+    ctrl_msg_t msg;
+    /*
+     * Drains the pending flow-control character (x_char) and then the
+     * shared transmit ring (wbuf, indices wc..wp). Every caller visible
+     * in this file takes xencons_lock before calling in.
+     */
+    if ( xen_start_info.flags & SIF_INITDOMAIN )
+    {
+        /* Initial domain: write directly through kcons_write_dom0(). */
+        if ( x_char )
+        {
+            kcons_write_dom0(NULL, &x_char, 1);
+            x_char = 0;
+            work_done = 1;
+        }
+
+        while ( wc != wp )
+        {
+            sz = wp - wc;
+            if ( sz > (WBUF_SIZE - WBUF_MASK(wc)) ) /* stop at the ring's wrap point */
+                sz = WBUF_SIZE - WBUF_MASK(wc);
+            kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz);
+            wc += sz; /* consumed unconditionally: kcons_write_dom0() reports no result */
+            work_done = 1;
+        }
+    }
+    else
+    {
+        /* Other domains: send the data as CMSG_CONSOLE_DATA control messages. */
+        while ( x_char )
+        {
+            msg.type = CMSG_CONSOLE;
+            msg.subtype = CMSG_CONSOLE_DATA;
+            msg.length = 1;
+            msg.msg[0] = x_char;
+
+            if ( ctrl_if_send_message_noblock(&msg, NULL, 0) == 0 )
+                x_char = 0;
+            else if ( ctrl_if_enqueue_space_callback(&xencons_tx_flush_task) ) /* presumably non-zero means the deferred retry was queued - confirm against ctrl_if */
+                break;
+
+            work_done = 1;
+        }
+
+        while ( wc != wp )
+        {
+            sz = wp - wc;
+            if ( sz > sizeof(msg.msg) ) /* at most one message payload per iteration */
+                sz = sizeof(msg.msg);
+            if ( sz > (WBUF_SIZE - WBUF_MASK(wc)) ) /* stop at the ring's wrap point */
+                sz = WBUF_SIZE - WBUF_MASK(wc);
+
+            msg.type = CMSG_CONSOLE;
+            msg.subtype = CMSG_CONSOLE_DATA;
+            msg.length = sz;
+            memcpy(msg.msg, &wbuf[WBUF_MASK(wc)], sz);
+
+            if ( ctrl_if_send_message_noblock(&msg, NULL, 0) == 0 )
+                wc += sz; /* data is consumed only once the message was accepted */
+            else if ( ctrl_if_enqueue_space_callback(&xencons_tx_flush_task) )
+                break;
+
+            work_done = 1;
+        }
+    }
+
+    /* Wake writers on the open tty if any iteration above ran to completion. */
+    if ( work_done && (xencons_tty != NULL) )
+    {
+        wake_up_interruptible(&xencons_tty->write_wait);
+        if ( (xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
+             (xencons_tty->ldisc.write_wakeup != NULL) )
+            (xencons_tty->ldisc.write_wakeup)(xencons_tty);
+    }
+}
+
+/* Non-privileged transmit kicker. Task body of xencons_tx_flush_task: retries the flush under xencons_lock. */
+static void xencons_tx_flush_task_routine(void *data)
+{
+    unsigned long flags;
+    spin_lock_irqsave(&xencons_lock, flags);
+    __xencons_tx_flush();
+    spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+/* Privileged receive callback and transmit kicker.
*/
+static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id,
+                                          struct pt_regs *regs)
+{
+    static char rbuf[16]; /* staging buffer; its size matches the 16 passed to CONSOLEIO_read below */
+    int i, l;
+    unsigned long flags;
+
+    spin_lock_irqsave(&xencons_lock, flags);
+
+    if ( xencons_tty != NULL )
+    {
+        /* Receive work. */
+        while ( (l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0 ) /* repeat until the hypercall returns <= 0 */
+            for ( i = 0; i < l; i++ )
+                tty_insert_flip_char(xencons_tty, rbuf[i], 0);
+        if ( xencons_tty->flip.count != 0 )
+            tty_flip_buffer_push(xencons_tty);
+    }
+
+    /* Transmit work. */
+    __xencons_tx_flush();
+
+    spin_unlock_irqrestore(&xencons_lock, flags);
+
+    return IRQ_HANDLED;
+}
+/* tty write_room hook: ring size minus the bytes currently queued (read without taking xencons_lock). */
+static int xencons_write_room(struct tty_struct *tty)
+{
+    return WBUF_SIZE - (wp - wc);
+}
+/* tty chars_in_buffer hook: bytes queued between the consumer (wc) and producer (wp) indices. */
+static int xencons_chars_in_buffer(struct tty_struct *tty)
+{
+    return wp - wc;
+}
+/* tty send_xchar hook: park @ch in x_char and kick the transmitter. Ignored for ttys other than index 0. */
+static void xencons_send_xchar(struct tty_struct *tty, char ch)
+{
+    unsigned long flags;
+
+    if ( TTY_INDEX(tty) != 0 )
+        return;
+
+    spin_lock_irqsave(&xencons_lock, flags);
+    x_char = ch;
+    __xencons_tx_flush();
+    spin_unlock_irqrestore(&xencons_lock, flags);
+}
+/* tty throttle hook: with IXOFF set, send the STOP character. */
+static void xencons_throttle(struct tty_struct *tty)
+{
+    if ( TTY_INDEX(tty) != 0 )
+        return;
+
+    if ( I_IXOFF(tty) )
+        xencons_send_xchar(tty, STOP_CHAR(tty));
+}
+/* tty unthrottle hook: with IXOFF set, cancel a still-pending x_char or else send the START character. */
+static void xencons_unthrottle(struct tty_struct *tty)
+{
+    if ( TTY_INDEX(tty) != 0 )
+        return;
+
+    if ( I_IXOFF(tty) )
+    {
+        if ( x_char != 0 ) /* NOTE(review): x_char is tested and cleared here without xencons_lock */
+            x_char = 0;
+        else
+            xencons_send_xchar(tty, START_CHAR(tty));
+    }
+}
+/* tty flush_buffer hook: discard everything queued by resetting both ring indices. */
+static void xencons_flush_buffer(struct tty_struct *tty)
+{
+    unsigned long flags;
+
+    if ( TTY_INDEX(tty) != 0 )
+        return;
+
+    spin_lock_irqsave(&xencons_lock, flags);
+    wc = wp = 0;
+    spin_unlock_irqrestore(&xencons_lock, flags);
+}
+/* Append one character to the ring; returns 1 if stored, 0 if the ring is full. Callers in this file hold xencons_lock. */
+static inline int __xencons_put_char(int ch)
+{
+    char _ch = (char)ch;
+    if ( (wp - wc) == WBUF_SIZE )
+        return 0;
+    wbuf[WBUF_MASK(wp++)] = _ch;
+    return 1;
+}
+/* tty write hook (one variant per kernel series): returns the number of characters queued; ttys other than index 0 report @count without queuing anything. */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+static int xencons_write(struct tty_struct *tty, const unsigned char *buf,
+                         int count)
+{
+    int i;
+    unsigned long flags;
+
+    if ( TTY_INDEX(tty) != 0 )
+        return count;
+
+    spin_lock_irqsave(&xencons_lock, flags);
+
+    for ( i = 0; i < count; i++ )
+        if ( !__xencons_put_char(buf[i]) ) /* ring full: stop and report a short write */
+            break;
+
+    if ( i != 0 )
+        __xencons_tx_flush();
+
+    spin_unlock_irqrestore(&xencons_lock, flags);
+
+    return i;
+}
+#else
+static int xencons_write(struct tty_struct *tty, int from_user,
+                         const u_char *buf, int count)
+{
+    int i;
+    unsigned long flags;
+
+    if ( from_user && verify_area(VERIFY_READ, buf, count) )
+        return -EINVAL;
+
+    if ( TTY_INDEX(tty) != 0 )
+        return count;
+
+    spin_lock_irqsave(&xencons_lock, flags);
+
+    for ( i = 0; i < count; i++ )
+    {
+        char ch;
+        if ( from_user )
+            __get_user(ch, buf + i); /* NOTE(review): user-space access while holding xencons_lock with interrupts off - confirm this cannot fault */
+        else
+            ch = buf[i];
+        if ( !__xencons_put_char(ch) )
+            break;
+    }
+
+    if ( i != 0 )
+        __xencons_tx_flush();
+
+    spin_unlock_irqrestore(&xencons_lock, flags);
+
+    return i;
+}
+#endif
+/* tty put_char hook: queue one character without kicking the transmitter; a full ring silently drops it. */
+static void xencons_put_char(struct tty_struct *tty, u_char ch)
+{
+    unsigned long flags;
+
+    if ( TTY_INDEX(tty) != 0 )
+        return;
+
+    spin_lock_irqsave(&xencons_lock, flags);
+    (void)__xencons_put_char(ch);
+    spin_unlock_irqrestore(&xencons_lock, flags);
+}
+/* tty flush_chars hook: kick the transmitter for characters queued by put_char. */
+static void xencons_flush_chars(struct tty_struct *tty)
+{
+    unsigned long flags;
+
+    if ( TTY_INDEX(tty) != 0 )
+        return;
+
+    spin_lock_irqsave(&xencons_lock, flags);
+    __xencons_tx_flush();
+    spin_unlock_irqrestore(&xencons_lock, flags);
+}
+/* tty wait_until_sent hook: sleep one tick at a time until the ring is empty, a signal is pending, or a non-zero @timeout (in jiffies) has elapsed. */
+static void xencons_wait_until_sent(struct tty_struct *tty, int timeout)
+{
+    unsigned long orig_jiffies = jiffies;
+
+    if ( TTY_INDEX(tty) != 0 )
+        return;
+
+    while ( DRV(tty->driver)->chars_in_buffer(tty) )
+    {
+        set_current_state(TASK_INTERRUPTIBLE);
+        schedule_timeout(1);
+        if ( signal_pending(current) )
+            break;
+        if ( (timeout != 0) && time_after(jiffies, orig_jiffies + timeout) ) /* timeout == 0 means wait indefinitely */
+            break;
+    }
+
+    set_current_state(TASK_RUNNING);
+}
+/* tty open hook: the first opener of index 0 becomes xencons_tty; any queued output is flushed. Always returns 0. */
+static int xencons_open(struct tty_struct *tty, struct file *filp)
+{
+    unsigned long flags;
+
+    if ( TTY_INDEX(tty) != 0 )
+        return 0;
+
+    spin_lock_irqsave(&xencons_lock, flags);
+    tty->driver_data = NULL;
+    if ( xencons_tty == NULL )
+        xencons_tty = tty;
+    __xencons_tx_flush();
+    spin_unlock_irqrestore(&xencons_lock, flags);
+
+    return 0;
+}
+/* tty close hook: on the last close of index 0, wait for output to drain, flush driver and line-discipline buffers, then detach xencons_tty. */
+static void xencons_close(struct tty_struct *tty, struct file *filp)
+{
+    unsigned long flags;
+
+    if ( TTY_INDEX(tty) != 0 )
+        return;
+
+    if ( tty->count == 1 )
+    {
+        tty->closing = 1;
+        tty_wait_until_sent(tty, 0);
+        if ( DRV(tty->driver)->flush_buffer != NULL )
+            DRV(tty->driver)->flush_buffer(tty);
+        if ( tty->ldisc.flush_buffer != NULL )
+            tty->ldisc.flush_buffer(tty);
+        tty->closing = 0;
+        spin_lock_irqsave(&xencons_lock, flags);
+        xencons_tty = NULL; /* cleared under the lock because the receive and flush paths test it there */
+        spin_unlock_irqrestore(&xencons_lock, flags);
+    }
+}
+/* 2.6 only: operations table handed to tty_set_operations(); 2.4 assigns the same hooks field by field. */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+static struct tty_operations xencons_ops = {
+    .open = xencons_open,
+    .close = xencons_close,
+    .write = xencons_write,
+    .write_room = xencons_write_room,
+    .put_char = xencons_put_char,
+    .flush_chars = xencons_flush_chars,
+    .chars_in_buffer = xencons_chars_in_buffer,
+    .send_xchar = xencons_send_xchar,
+    .flush_buffer = xencons_flush_buffer,
+    .throttle = xencons_throttle,
+    .unthrottle = xencons_unthrottle,
+    .wait_until_sent = xencons_wait_until_sent,
+};
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+static const char *xennullcon_startup(void) /* con_startup hook of the dummy console; returns NULL */
+{
+    return NULL;
+}
+
+static int xennullcon_dummy(void) /* shared no-op behind every DUMMY slot below; returns 0 */
+{
+    return 0;
+}
+
+#define DUMMY (void *)xennullcon_dummy
+
+/*
+ * The console `switch' structure for the dummy console
+ *
+ * Most of the operations are dummies.
+ */ + +const struct consw xennull_con = { + .owner = THIS_MODULE, + .con_startup = xennullcon_startup, + .con_init = DUMMY, + .con_deinit = DUMMY, + .con_clear = DUMMY, + .con_putc = DUMMY, + .con_putcs = DUMMY, + .con_cursor = DUMMY, + .con_scroll = DUMMY, + .con_bmove = DUMMY, + .con_switch = DUMMY, + .con_blank = DUMMY, + .con_font_set = DUMMY, + .con_font_get = DUMMY, + .con_font_default = DUMMY, + .con_font_copy = DUMMY, + .con_set_palette = DUMMY, + .con_scrolldelta = DUMMY, +}; +#endif +#endif + +static int __init xencons_init(void) +{ + int rc; + + if ( xc_mode == XC_OFF ) + return 0; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + xencons_driver = alloc_tty_driver((xc_mode == XC_SERIAL) ? + 1 : MAX_NR_CONSOLES); + if ( xencons_driver == NULL ) + return -ENOMEM; +#else + memset(&xencons_driver, 0, sizeof(struct tty_driver)); + xencons_driver.magic = TTY_DRIVER_MAGIC; + xencons_driver.refcount = &xencons_refcount; + xencons_driver.table = xencons_table; + xencons_driver.num = (xc_mode == XC_SERIAL) ? 
1 : MAX_NR_CONSOLES; +#endif + + DRV(xencons_driver)->major = TTY_MAJOR; + DRV(xencons_driver)->type = TTY_DRIVER_TYPE_SERIAL; + DRV(xencons_driver)->subtype = SERIAL_TYPE_NORMAL; + DRV(xencons_driver)->init_termios = tty_std_termios; + DRV(xencons_driver)->flags = + TTY_DRIVER_REAL_RAW | TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_NO_DEVFS; + DRV(xencons_driver)->termios = xencons_termios; + DRV(xencons_driver)->termios_locked = xencons_termios_locked; + + if ( xc_mode == XC_SERIAL ) + { + DRV(xencons_driver)->name = "ttyS"; + DRV(xencons_driver)->minor_start = 64; + DRV(xencons_driver)->name_base = 0; + } + else + { + DRV(xencons_driver)->name = "tty"; + DRV(xencons_driver)->minor_start = 1; + DRV(xencons_driver)->name_base = 1; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + tty_set_operations(xencons_driver, &xencons_ops); +#else + xencons_driver.open = xencons_open; + xencons_driver.close = xencons_close; + xencons_driver.write = xencons_write; + xencons_driver.write_room = xencons_write_room; + xencons_driver.put_char = xencons_put_char; + xencons_driver.flush_chars = xencons_flush_chars; + xencons_driver.chars_in_buffer = xencons_chars_in_buffer; + xencons_driver.send_xchar = xencons_send_xchar; + xencons_driver.flush_buffer = xencons_flush_buffer; + xencons_driver.throttle = xencons_throttle; + xencons_driver.unthrottle = xencons_unthrottle; + xencons_driver.wait_until_sent = xencons_wait_until_sent; +#endif + + if ( (rc = tty_register_driver(DRV(xencons_driver))) != 0 ) + { + printk("Couldn't register Xen virtual console driver as %s\n", + DRV(xencons_driver)->name); + return rc; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + tty_register_device(xencons_driver, 0, NULL); +#endif + + if ( xen_start_info.flags & SIF_INITDOMAIN ) + { + xencons_priv_irq = bind_virq_to_irq(VIRQ_CONSOLE); + (void)request_irq(xencons_priv_irq, + xencons_priv_interrupt, 0, "console", NULL); + } + else + { + (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx, 0); 
+ } + + printk("Xen virtual console successfully installed as %s\n", + DRV(xencons_driver)->name); + + return 0; +} + +static void __exit xencons_fini(void) +{ + int ret; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + tty_unregister_device(xencons_driver, 0); +#endif + + if ( (ret = tty_unregister_driver(DRV(xencons_driver))) != 0 ) + printk(KERN_ERR "Unable to unregister Xen console driver: %d\n", ret); + + if ( xen_start_info.flags & SIF_INITDOMAIN ) + { + free_irq(xencons_priv_irq, NULL); + unbind_virq_from_irq(VIRQ_CONSOLE); + } + else + { + ctrl_if_unregister_receiver(CMSG_CONSOLE, xencons_rx); + } +} + +module_init(xencons_init); +module_exit(xencons_fini); diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/evtchn/Makefile tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/evtchn/Makefile --- pristine-linux-2.6.10-rc3/drivers/xen/evtchn/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/evtchn/Makefile 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,2 @@ + +obj-y := evtchn.o diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/evtchn/evtchn.c tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/evtchn/evtchn.c --- pristine-linux-2.6.10-rc3/drivers/xen/evtchn/evtchn.c 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/evtchn/evtchn.c 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,429 @@ +/****************************************************************************** + * evtchn.c + * + * Xenolinux driver for receiving and demuxing event-channel signals. 
+ * + * Copyright (c) 2004, K A Fraser + * Multi-process extensions Copyright (c) 2004, Steven Smith + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) +#include +#define OLD_DEVFS +#else +#include +#endif + +#ifdef OLD_DEVFS +/* NB. This must be shared amongst drivers if more things go in /dev/xen */ +static devfs_handle_t xen_dev_dir; +#endif + +struct per_user_data { + /* Notification ring, accessed via /dev/xen/evtchn. 
*/ +# define RING_SIZE 2048 /* 2048 16-bit entries */ +# define RING_MASK(_i) ((_i)&(RING_SIZE-1)) + u16 *ring; + unsigned int ring_cons, ring_prod, ring_overflow; + + /* Processes wait on this queue when ring is empty. */ + wait_queue_head_t evtchn_wait; + struct fasync_struct *evtchn_async_queue; +}; + +/* Who's bound to each port? */ +static struct per_user_data *port_user[NR_EVENT_CHANNELS]; +static spinlock_t port_user_lock; +/* Mask and clear 'port', then queue it on the ring of the user bound to it (if any). */ +void evtchn_device_upcall(int port) +{ + struct per_user_data *u; + + spin_lock(&port_user_lock); + + mask_evtchn(port); + clear_evtchn(port); + + if ( (u = port_user[port]) != NULL ) + { + if ( (u->ring_prod - u->ring_cons) < RING_SIZE ) + { + u->ring[RING_MASK(u->ring_prod)] = (u16)port; + if ( u->ring_cons == u->ring_prod++ ) /* ring was empty: notify waiters */ + { + wake_up_interruptible(&u->evtchn_wait); + kill_fasync(&u->evtchn_async_queue, SIGIO, POLL_IN); + } + } + else + { + u->ring_overflow = 1; /* ring full: this port number is not queued */ + } + } + + spin_unlock(&port_user_lock); +} +/* Copy queued port numbers to 'buf'; waits for data unless O_NONBLOCK is set. */ +static ssize_t evtchn_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + int rc; + unsigned int c, p, bytes1 = 0, bytes2 = 0; + DECLARE_WAITQUEUE(wait, current); + struct per_user_data *u = file->private_data; + + add_wait_queue(&u->evtchn_wait, &wait); + + count &= ~1; /* even number of bytes */ + + if ( count == 0 ) + { + rc = 0; + goto out; + } + + if ( count > PAGE_SIZE ) + count = PAGE_SIZE; + + for ( ; ; ) + { + set_current_state(TASK_INTERRUPTIBLE); + + if ( (c = u->ring_cons) != (p = u->ring_prod) ) + break; + + if ( u->ring_overflow ) + { + rc = -EFBIG; + goto out; + } + + if ( file->f_flags & O_NONBLOCK ) + { + rc = -EAGAIN; + goto out; + } + + if ( signal_pending(current) ) + { + rc = -ERESTARTSYS; + goto out; + } + + schedule(); + } + + /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. 
*/ + if ( ((c ^ p) & RING_SIZE) != 0 ) + { + bytes1 = (RING_SIZE - RING_MASK(c)) * sizeof(u16); + bytes2 = RING_MASK(p) * sizeof(u16); + } + else + { + bytes1 = (p - c) * sizeof(u16); + bytes2 = 0; + } + + /* Truncate chunks according to caller's maximum byte count. */ + if ( bytes1 > count ) + { + bytes1 = count; + bytes2 = 0; + } + else if ( (bytes1 + bytes2) > count ) + { + bytes2 = count - bytes1; + } + + if ( copy_to_user(buf, &u->ring[RING_MASK(c)], bytes1) || + ((bytes2 != 0) && copy_to_user(&buf[bytes1], &u->ring[0], bytes2)) ) + { + rc = -EFAULT; + goto out; + } + + u->ring_cons += (bytes1 + bytes2) / sizeof(u16); + + rc = bytes1 + bytes2; + + out: + __set_current_state(TASK_RUNNING); + remove_wait_queue(&u->evtchn_wait, &wait); + return rc; +} + +static ssize_t evtchn_write(struct file *file, const char *buf, + size_t count, loff_t *ppos) +{ + int rc, i; + u16 *kbuf = (u16 *)__get_free_page(GFP_KERNEL); + struct per_user_data *u = file->private_data; + + if ( kbuf == NULL ) + return -ENOMEM; + + count &= ~1; /* even number of bytes */ + + if ( count == 0 ) + { + rc = 0; + goto out; + } + + if ( count > PAGE_SIZE ) + count = PAGE_SIZE; + + if ( copy_from_user(kbuf, buf, count) != 0 ) + { + rc = -EFAULT; + goto out; + } + + spin_lock_irq(&port_user_lock); + for ( i = 0; i < (count/2); i++ ) + if ( (kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u) ) + unmask_evtchn(kbuf[i]); + spin_unlock_irq(&port_user_lock); + + rc = count; + + out: + free_page((unsigned long)kbuf); + return rc; +} + +static int evtchn_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int rc = 0; + struct per_user_data *u = file->private_data; + + spin_lock_irq(&port_user_lock); + + switch ( cmd ) + { + case EVTCHN_RESET: + /* Initialise the ring to empty. Clear errors. 
*/ + u->ring_cons = u->ring_prod = u->ring_overflow = 0; + break; + + case EVTCHN_BIND: + if ( arg >= NR_EVENT_CHANNELS ) + { + rc = -EINVAL; + } + else if ( port_user[arg] != NULL ) + { + rc = -EISCONN; + } + else + { + port_user[arg] = u; + unmask_evtchn(arg); + } + break; + + case EVTCHN_UNBIND: + if ( arg >= NR_EVENT_CHANNELS ) + { + rc = -EINVAL; + } + else if ( port_user[arg] != u ) + { + rc = -ENOTCONN; + } + else + { + port_user[arg] = NULL; + mask_evtchn(arg); + } + break; + + default: + rc = -ENOSYS; + break; + } + + spin_unlock_irq(&port_user_lock); + + return rc; +} +/* Always writable; readable when the ring is non-empty; POLLERR once it has overflowed. */ +static unsigned int evtchn_poll(struct file *file, poll_table *wait) +{ + unsigned int mask = POLLOUT | POLLWRNORM; + struct per_user_data *u = file->private_data; + + poll_wait(file, &u->evtchn_wait, wait); + if ( u->ring_cons != u->ring_prod ) + mask |= POLLIN | POLLRDNORM; + if ( u->ring_overflow ) + mask = POLLERR; + return mask; +} +/* Add or remove this file on the per-user evtchn_async_queue (SIGIO) list. */ +static int evtchn_fasync(int fd, struct file *filp, int on) +{ + struct per_user_data *u = filp->private_data; + return fasync_helper(fd, filp, on, &u->evtchn_async_queue); +} +/* Allocate zeroed per-open state plus a one-page notification ring. */ +static int evtchn_open(struct inode *inode, struct file *filp) +{ + struct per_user_data *u; + + if ( (u = kmalloc(sizeof(*u), GFP_KERNEL)) == NULL ) + return -ENOMEM; + + memset(u, 0, sizeof(*u)); + init_waitqueue_head(&u->evtchn_wait); + + if ( (u->ring = (u16 *)__get_free_page(GFP_KERNEL)) == NULL ) + { + kfree(u); + return -ENOMEM; + } + + filp->private_data = u; + + return 0; +} +/* Undo evtchn_open(): leave the SIGIO list, unbind this user's ports, free ring and state. */ +static int evtchn_release(struct inode *inode, struct file *filp) +{ + int i; + struct per_user_data *u = filp->private_data; + (void)evtchn_fasync(-1, filp, 0); /* must precede kfree(u): the list head lives in *u */ + spin_lock_irq(&port_user_lock); + + free_page((unsigned long)u->ring); + + for ( i = 0; i < NR_EVENT_CHANNELS; i++ ) + { + if ( port_user[i] == u ) + { + port_user[i] = NULL; + mask_evtchn(i); + } + } + + spin_unlock_irq(&port_user_lock); + kfree(u); /* was leaked on every close; no port_user[] entry points at it now */ + return 0; +} + +static struct file_operations evtchn_fops = { + owner: THIS_MODULE, + read: evtchn_read, + write: evtchn_write, 
+ ioctl: evtchn_ioctl, + poll: evtchn_poll, + fasync: evtchn_fasync, + open: evtchn_open, + release: evtchn_release +}; + +static struct miscdevice evtchn_miscdev = { + .minor = EVTCHN_MINOR, + .name = "evtchn", + .fops = &evtchn_fops, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + .devfs_name = "misc/evtchn", +#endif +}; + +static int __init evtchn_init(void) +{ +#ifdef OLD_DEVFS + devfs_handle_t symlink_handle; + int pos; + char link_dest[64]; +#endif + int err; + + spin_lock_init(&port_user_lock); + memset(port_user, 0, sizeof(port_user)); + + /* (DEVFS) create '/dev/misc/evtchn'. */ + err = misc_register(&evtchn_miscdev); + if ( err != 0 ) + { + printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + return err; + } + +#ifdef OLD_DEVFS + /* (DEVFS) create directory '/dev/xen'. */ + xen_dev_dir = devfs_mk_dir(NULL, "xen", NULL); + + /* (DEVFS) &link_dest[pos] == '../misc/evtchn'. */ + pos = devfs_generate_path(evtchn_miscdev.devfs_handle, + &link_dest[3], + sizeof(link_dest) - 3); + if ( pos >= 0 ) + strncpy(&link_dest[pos], "../", 3); + + /* (DEVFS) symlink '/dev/xen/evtchn' -> '../misc/evtchn'. */ + (void)devfs_mk_symlink(xen_dev_dir, + "evtchn", + DEVFS_FL_DEFAULT, + &link_dest[pos], + &symlink_handle, + NULL); + + /* (DEVFS) automatically destroy the symlink with its destination. 
*/ + devfs_auto_unregister(evtchn_miscdev.devfs_handle, symlink_handle); +#endif + + printk("Event-channel device installed.\n"); + + return 0; +} + +static void evtchn_cleanup(void) +{ + misc_deregister(&evtchn_miscdev); +} + +module_init(evtchn_init); +module_exit(evtchn_cleanup); diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/netback/Makefile tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netback/Makefile --- pristine-linux-2.6.10-rc3/drivers/xen/netback/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netback/Makefile 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,2 @@ + +obj-y := netback.o control.o interface.o diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/netback/common.h tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netback/common.h --- pristine-linux-2.6.10-rc3/drivers/xen/netback/common.h 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netback/common.h 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,99 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/common.h + */ + +#ifndef __NETIF__BACKEND__COMMON_H__ +#define __NETIF__BACKEND__COMMON_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define ASSERT(_p) \ + if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) +#else +#define ASSERT(_p) ((void)0) +#define DPRINTK(_f, _a...) ((void)0) +#endif + +typedef struct netif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + + /* Physical parameters of the comms window. */ + unsigned long tx_shmem_frame; + unsigned long rx_shmem_frame; + unsigned int evtchn; + int irq; + + /* The shared rings and indexes. 
*/ + netif_tx_interface_t *tx; + netif_rx_interface_t *rx; + + /* Private indexes into shared ring. */ + NETIF_RING_IDX rx_req_cons; + NETIF_RING_IDX rx_resp_prod; /* private version of shared variable */ + NETIF_RING_IDX tx_req_cons; + NETIF_RING_IDX tx_resp_prod; /* private version of shared variable */ + + /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */ + unsigned long credit_bytes; + unsigned long credit_usec; + unsigned long remaining_credit; + struct timer_list credit_timeout; + + /* Miscellaneous private stuff. */ + enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; + /* + * DISCONNECT response is deferred until pending requests are ack'ed. + * We therefore need to store the id from the original request. + */ + u8 disconnect_rspid; + struct netif_st *hash_next; + struct list_head list; /* scheduling list */ + atomic_t refcnt; + spinlock_t rx_lock, tx_lock; + struct net_device *dev; + struct net_device_stats stats; + + struct work_struct work; +} netif_t; + +void netif_create(netif_be_create_t *create); +void netif_destroy(netif_be_destroy_t *destroy); +void netif_connect(netif_be_connect_t *connect); +int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id); +void netif_disconnect_complete(netif_t *netif); +netif_t *netif_find_by_handle(domid_t domid, unsigned int handle); +#define netif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define netif_put(_b) \ + do { \ + if ( atomic_dec_and_test(&(_b)->refcnt) ) \ + netif_disconnect_complete(_b); \ + } while (0) + +void netif_interface_init(void); +void netif_ctrlif_init(void); + +void netif_deschedule(netif_t *netif); + +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev); +struct net_device_stats *netif_be_get_stats(struct net_device *dev); +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs); + +#endif /* __NETIF__BACKEND__COMMON_H__ */ diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/netback/control.c 
tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netback/control.c --- pristine-linux-2.6.10-rc3/drivers/xen/netback/control.c 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netback/control.c 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,65 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/control.c + * + * Routines for interfacing with the control plane. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" +/* Length-check a CMSG_NETIF_BE message, dispatch on its subtype, and send the response. */ +static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + switch ( msg->subtype ) + { + case CMSG_NETIF_BE_CREATE: + if ( msg->length != sizeof(netif_be_create_t) ) + goto parse_error; + netif_create((netif_be_create_t *)&msg->msg[0]); + break; + case CMSG_NETIF_BE_DESTROY: + if ( msg->length != sizeof(netif_be_destroy_t) ) + goto parse_error; + netif_destroy((netif_be_destroy_t *)&msg->msg[0]); + break; + case CMSG_NETIF_BE_CONNECT: + if ( msg->length != sizeof(netif_be_connect_t) ) + goto parse_error; + netif_connect((netif_be_connect_t *)&msg->msg[0]); + break; + case CMSG_NETIF_BE_DISCONNECT: + if ( msg->length != sizeof(netif_be_disconnect_t) ) + goto parse_error; + if ( !netif_disconnect((netif_be_disconnect_t *)&msg->msg[0],msg->id) ) + return; /* Sending the response is deferred until later. */ + break; + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + DPRINTK("Parse error while reading message subtype %d, len %d\n", + msg->subtype, msg->length); + msg->length = 0; + ctrl_if_send_response(msg); +} +/* Register netif_ctrlif_rx for CMSG_NETIF_BE messages, then announce that the driver is up. */ +void netif_ctrlif_init(void) +{ + ctrl_msg_t cmsg; + netif_be_driver_status_t st; + + (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); + + /* Send a driver-UP notification to the domain controller. 
*/ + cmsg.type = CMSG_NETIF_BE; + cmsg.subtype = CMSG_NETIF_BE_DRIVER_STATUS; + cmsg.length = sizeof(netif_be_driver_status_t); + st.status = NETIF_DRIVER_STATUS_UP; + memcpy(cmsg.msg, &st, sizeof(st)); + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); +} diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/netback/interface.c tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netback/interface.c --- pristine-linux-2.6.10-rc3/drivers/xen/netback/interface.c 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netback/interface.c 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,297 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/interface.c + * + * Network-device interface management. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +#define VMALLOC_VMADDR(x) ((unsigned long)(x)) +#endif + +#define NETIF_HASHSZ 1024 +#define NETIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(NETIF_HASHSZ-1)) + +static netif_t *netif_hash[NETIF_HASHSZ]; + +netif_t *netif_find_by_handle(domid_t domid, unsigned int handle) +{ + netif_t *netif = netif_hash[NETIF_HASH(domid, handle)]; + while ( (netif != NULL) && + ((netif->domid != domid) || (netif->handle != handle)) ) + netif = netif->hash_next; + return netif; +} + +static void __netif_disconnect_complete(void *arg) +{ + netif_t *netif = (netif_t *)arg; + ctrl_msg_t cmsg; + netif_be_disconnect_t disc; + + /* + * These can't be done in netif_disconnect() because at that point there + * may be outstanding requests in the network stack whose asynchronous + * responses must still be notified to the remote driver. + */ + unbind_evtchn_from_irq(netif->evtchn); + vfree(netif->tx); /* Frees netif->rx as well. */ + rtnl_lock(); + (void)dev_close(netif->dev); + rtnl_unlock(); + + /* Construct the deferred response message. 
*/ + cmsg.type = CMSG_NETIF_BE; + cmsg.subtype = CMSG_NETIF_BE_DISCONNECT; + cmsg.id = netif->disconnect_rspid; + cmsg.length = sizeof(netif_be_disconnect_t); + disc.domid = netif->domid; + disc.netif_handle = netif->handle; + disc.status = NETIF_BE_STATUS_OKAY; + memcpy(cmsg.msg, &disc, sizeof(disc)); + + /* + * Make sure message is constructed /before/ status change, because + * after the status change the 'netif' structure could be deallocated at + * any time. Also make sure we send the response /after/ status change, + * as otherwise a subsequent CONNECT request could spuriously fail if + * another CPU doesn't see the status change yet. + */ + mb(); + if ( netif->status != DISCONNECTING ) + BUG(); + netif->status = DISCONNECTED; + mb(); + + /* Send the successful response. */ + ctrl_if_send_response(&cmsg); +} + +void netif_disconnect_complete(netif_t *netif) +{ + INIT_WORK(&netif->work, __netif_disconnect_complete, (void *)netif); + schedule_work(&netif->work); +} + +void netif_create(netif_be_create_t *create) +{ + int err = 0; + domid_t domid = create->domid; + unsigned int handle = create->netif_handle; + struct net_device *dev; + netif_t **pnetif, *netif; + char name[IFNAMSIZ]; + + snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle); + dev = alloc_netdev(sizeof(netif_t), name, ether_setup); + if ( dev == NULL ) + { + DPRINTK("Could not create netif: out of memory\n"); + create->status = NETIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + netif = dev->priv; + memset(netif, 0, sizeof(*netif)); + netif->domid = domid; + netif->handle = handle; + netif->status = DISCONNECTED; + spin_lock_init(&netif->rx_lock); + spin_lock_init(&netif->tx_lock); + atomic_set(&netif->refcnt, 0); + netif->dev = dev; + + netif->credit_bytes = netif->remaining_credit = ~0UL; + netif->credit_usec = 0UL; + /*init_ac_timer(&new_vif->credit_timeout);*/ + + pnetif = &netif_hash[NETIF_HASH(domid, handle)]; + while ( *pnetif != NULL ) + { + if ( ((*pnetif)->domid == domid) && 
((*pnetif)->handle == handle) ) + { + DPRINTK("Could not create netif: already exists\n"); + create->status = NETIF_BE_STATUS_INTERFACE_EXISTS; + free_netdev(dev); + return; + } + pnetif = &(*pnetif)->hash_next; + } + + dev->hard_start_xmit = netif_be_start_xmit; + dev->get_stats = netif_be_get_stats; + memcpy(dev->dev_addr, create->mac, ETH_ALEN); + + /* Disable queuing. */ + dev->tx_queue_len = 0; + + /* Force a different MAC from remote end. */ + dev->dev_addr[2] ^= 1; + + if ( (err = register_netdev(dev)) != 0 ) + { + DPRINTK("Could not register new net device %s: err=%d\n", + dev->name, err); + create->status = NETIF_BE_STATUS_OUT_OF_MEMORY; + free_netdev(dev); + return; + } + + netif->hash_next = *pnetif; + *pnetif = netif; + + DPRINTK("Successfully created netif\n"); + create->status = NETIF_BE_STATUS_OKAY; +} + +void netif_destroy(netif_be_destroy_t *destroy) +{ + domid_t domid = destroy->domid; + unsigned int handle = destroy->netif_handle; + netif_t **pnetif, *netif; + + pnetif = &netif_hash[NETIF_HASH(domid, handle)]; + while ( (netif = *pnetif) != NULL ) + { + if ( (netif->domid == domid) && (netif->handle == handle) ) + { + if ( netif->status != DISCONNECTED ) + goto still_connected; + goto destroy; + } + pnetif = &netif->hash_next; + } + + destroy->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + + still_connected: + destroy->status = NETIF_BE_STATUS_INTERFACE_CONNECTED; + return; + + destroy: + *pnetif = netif->hash_next; + unregister_netdev(netif->dev); + free_netdev(netif->dev); + destroy->status = NETIF_BE_STATUS_OKAY; +} + +void netif_connect(netif_be_connect_t *connect) +{ + domid_t domid = connect->domid; + unsigned int handle = connect->netif_handle; + unsigned int evtchn = connect->evtchn; + unsigned long tx_shmem_frame = connect->tx_shmem_frame; + unsigned long rx_shmem_frame = connect->rx_shmem_frame; + struct vm_struct *vma; + pgprot_t prot; + int error; + netif_t *netif; +#if 0 + struct net_device *eth0_dev; +#endif + + netif = 
netif_find_by_handle(domid, handle); + if ( unlikely(netif == NULL) ) + { + DPRINTK("netif_connect attempted for non-existent netif (%u,%u)\n", + connect->domid, connect->netif_handle); + connect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + if ( netif->status != DISCONNECTED ) + { + connect->status = NETIF_BE_STATUS_INTERFACE_CONNECTED; + return; + } + + if ( (vma = get_vm_area(2*PAGE_SIZE, VM_IOREMAP)) == NULL ) + { + connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED); + error = direct_remap_area_pages(&init_mm, + VMALLOC_VMADDR(vma->addr), + tx_shmem_frame<addr) + PAGE_SIZE, + rx_shmem_frame<status = NETIF_BE_STATUS_OUT_OF_MEMORY; + else if ( error == -EFAULT ) + connect->status = NETIF_BE_STATUS_MAPPING_ERROR; + else + connect->status = NETIF_BE_STATUS_ERROR; + vfree(vma->addr); + return; + } + + netif->evtchn = evtchn; + netif->irq = bind_evtchn_to_irq(evtchn); + netif->tx_shmem_frame = tx_shmem_frame; + netif->rx_shmem_frame = rx_shmem_frame; + netif->tx = + (netif_tx_interface_t *)vma->addr; + netif->rx = + (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE); + netif->status = CONNECTED; + netif_get(netif); + + netif->tx->resp_prod = netif->rx->resp_prod = 0; + + rtnl_lock(); + (void)dev_open(netif->dev); + rtnl_unlock(); + + (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif); + netif_start_queue(netif->dev); + + connect->status = NETIF_BE_STATUS_OKAY; +} + +int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id) +{ + domid_t domid = disconnect->domid; + unsigned int handle = disconnect->netif_handle; + netif_t *netif; + + netif = netif_find_by_handle(domid, handle); + if ( unlikely(netif == NULL) ) + { + DPRINTK("netif_disconnect attempted for non-existent netif" + " (%u,%u)\n", disconnect->domid, disconnect->netif_handle); + disconnect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; + return 1; /* Caller will send 
response error message. */ + } + + if ( netif->status == CONNECTED ) + { + netif->status = DISCONNECTING; + netif->disconnect_rspid = rsp_id; + wmb(); /* Let other CPUs see the status change. */ + netif_stop_queue(netif->dev); + free_irq(netif->irq, netif); + netif_deschedule(netif); + netif_put(netif); + return 0; /* Caller should not send response message. */ + } + + disconnect->status = NETIF_BE_STATUS_OKAY; + return 1; +} +/* Start with an empty interface hash table. */ +void netif_interface_init(void) +{ + memset(netif_hash, 0, sizeof(netif_hash)); +} diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/netback/netback.c tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netback/netback.c --- pristine-linux-2.6.10-rc3/drivers/xen/netback/netback.c 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netback/netback.c 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,838 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/main.c + * + * Back-end of the driver for virtual network devices. This portion of the + * driver exports a 'unified' network-device interface that can be accessed + * by any operating system that implements a compatible front end. 
A + * reference front-end implementation can be found in: + * arch/xen/drivers/netif/frontend + * + * Copyright (c) 2002-2004, K A Fraser + */ + +#include "common.h" +#include + +static void netif_page_release(struct page *page); +static void netif_skb_release(struct sk_buff *skb); +static void make_tx_response(netif_t *netif, + u16 id, + s8 st); +static int make_rx_response(netif_t *netif, + u16 id, + s8 st, + memory_t addr, + u16 size); + +static void net_tx_action(unsigned long unused); +static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0); + +static void net_rx_action(unsigned long unused); +static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0); + +static struct timer_list net_timer; + +static struct sk_buff_head rx_queue; +static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE*2]; +static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE*3]; +static unsigned char rx_notify[NR_EVENT_CHANNELS]; + +/* Don't currently gate addition of an interface to the tx scheduling list. */ +#define tx_work_exists(_if) (1) + +#define MAX_PENDING_REQS 256 +static unsigned long mmap_vstart; +#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE)) + +#define PKT_PROT_LEN 64 + +static struct { + netif_tx_request_t req; + netif_t *netif; +} pending_tx_info[MAX_PENDING_REQS]; +static u16 pending_ring[MAX_PENDING_REQS]; +typedef unsigned int PEND_RING_IDX; +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) +static PEND_RING_IDX pending_prod, pending_cons; +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + +/* Freed TX SKBs get batched on this ring before return to pending_ring. 
*/ +static u16 dealloc_ring[MAX_PENDING_REQS]; +static spinlock_t dealloc_lock = SPIN_LOCK_UNLOCKED; +static PEND_RING_IDX dealloc_prod, dealloc_cons; + +static struct sk_buff_head tx_queue; +static multicall_entry_t tx_mcl[MAX_PENDING_REQS]; + +static struct list_head net_schedule_list; +static spinlock_t net_schedule_list_lock; + +#define MAX_MFN_ALLOC 64 +static unsigned long mfn_list[MAX_MFN_ALLOC]; +static unsigned int alloc_index = 0; +static spinlock_t mfn_lock = SPIN_LOCK_UNLOCKED; +/* Pop a frame from the mfn_list cache, refilling the cache when it is empty; 0 means none. */ +static unsigned long alloc_mfn(void) +{ + unsigned long mfn = 0, flags; + spin_lock_irqsave(&mfn_lock, flags); + if ( unlikely(alloc_index == 0) ) + alloc_index = HYPERVISOR_dom_mem_op( + MEMOP_increase_reservation, mfn_list, MAX_MFN_ALLOC, 0); /* NOTE(review): a negative (error) return would wrap the unsigned index -- confirm */ + if ( alloc_index != 0 ) + mfn = mfn_list[--alloc_index]; + spin_unlock_irqrestore(&mfn_lock, flags); + return mfn; +} +/* Push 'mfn' back on the cache, or release it with MEMOP_decrease_reservation when full. */ +static void free_mfn(unsigned long mfn) +{ + unsigned long flags; + spin_lock_irqsave(&mfn_lock, flags); + if ( alloc_index != MAX_MFN_ALLOC ) + mfn_list[alloc_index++] = mfn; + else if ( HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, + &mfn, 1, 0) != 1 ) + BUG(); + spin_unlock_irqrestore(&mfn_lock, flags); +} +/* Schedule net_tx_tasklet when NR_PENDING_REQS is below half of MAX_PENDING_REQS and net_schedule_list is non-empty. */ +static inline void maybe_schedule_tx_action(void) +{ + smp_mb(); + if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && + !list_empty(&net_schedule_list) ) + tasklet_schedule(&net_tx_tasklet); +} + +/* + * A gross way of confirming the origin of an skb data page. The slab + * allocator abuses a field in the page struct to cache the kmem_cache_t ptr. 
+ */ +static inline int is_xen_skb(struct sk_buff *skb) +{ + extern kmem_cache_t *skbuff_cachep; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next; +#else + kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->list.next; +#endif + return (cp == skbuff_cachep); +} +/* Queue 'skb' on rx_queue for net_rx_action (copying it if needed); drops count as tx_dropped. Always returns 0. */ +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + netif_t *netif = (netif_t *)dev->priv; + + ASSERT(skb->dev == dev); + + /* Drop the packet if the target domain has no receive buffers. */ + if ( (netif->rx_req_cons == netif->rx->req_prod) || + ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE) ) + goto drop; + + /* + * We do not copy the packet unless: + * 1. The data is shared; or + * 2. The data is not allocated from our special cache. + * NB. We also couldn't cope with fragmented packets, but we won't get + * any because we do not advertise the NETIF_F_SG feature. + */ + if ( skb_shared(skb) || skb_cloned(skb) || !is_xen_skb(skb) ) + { + int hlen = skb->data - skb->head; + struct sk_buff *nskb = dev_alloc_skb(hlen + skb->len); + if ( unlikely(nskb == NULL) ) + goto drop; + skb_reserve(nskb, hlen); + __skb_put(nskb, skb->len); + (void)skb_copy_bits(skb, -hlen, nskb->data - hlen, skb->len + hlen); + nskb->dev = skb->dev; + dev_kfree_skb(skb); + skb = nskb; + } + + netif->rx_req_cons++; + + skb_queue_tail(&rx_queue, skb); + tasklet_schedule(&net_rx_tasklet); + + return 0; + + drop: + netif->stats.tx_dropped++; + dev_kfree_skb(skb); + return 0; +} + +#if 0 +static void xen_network_done_notify(void) +{ + static struct net_device *eth0_dev = NULL; + if ( unlikely(eth0_dev == NULL) ) + eth0_dev = __dev_get_by_name("eth0"); + netif_rx_schedule(eth0_dev); +} +/* + * Add following to poll() function in NAPI driver (Tigon3 is example): + * if ( xen_network_done() ) + * tg3_enable_ints(tp); + */ +int xen_network_done(void) +{ + return skb_queue_empty(&rx_queue); +} +#endif + +static void 
net_rx_action(unsigned long unused) +{ + netif_t *netif; + s8 status; + u16 size, id, evtchn; + mmu_update_t *mmu; + multicall_entry_t *mcl; + unsigned long vdata, mdata, new_mfn; + struct sk_buff_head rxq; + struct sk_buff *skb; + u16 notify_list[NETIF_RX_RING_SIZE]; + int notify_nr = 0; + + skb_queue_head_init(&rxq); + + mcl = rx_mcl; + mmu = rx_mmu; + while ( (skb = skb_dequeue(&rx_queue)) != NULL ) + { + netif = (netif_t *)skb->dev->priv; + vdata = (unsigned long)skb->data; + mdata = virt_to_machine(vdata); + + /* Memory squeeze? Back off for an arbitrary while. */ + if ( (new_mfn = alloc_mfn()) == 0 ) + { + if ( net_ratelimit() ) + printk(KERN_WARNING "Memory squeeze in netback driver.\n"); + mod_timer(&net_timer, jiffies + HZ); + break; + } + + /* + * Set the new P2M table entry before reassigning the old data page. + * Heed the comment in pgtable-2level.h:pte_page(). :-) + */ + phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = new_mfn; + + mmu[0].ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + mmu[0].val = __pa(vdata) >> PAGE_SHIFT; + mmu[1].ptr = MMU_EXTENDED_COMMAND; + mmu[1].val = MMUEXT_SET_FOREIGNDOM; + mmu[1].val |= (unsigned long)netif->domid << 16; + mmu[2].ptr = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND; + mmu[2].val = MMUEXT_REASSIGN_PAGE; + + mcl[0].op = __HYPERVISOR_update_va_mapping; + mcl[0].args[0] = vdata >> PAGE_SHIFT; + mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL; + mcl[0].args[2] = 0; + mcl[1].op = __HYPERVISOR_mmu_update; + mcl[1].args[0] = (unsigned long)mmu; + mcl[1].args[1] = 3; + mcl[1].args[2] = 0; + + mcl += 2; + mmu += 3; + + __skb_queue_tail(&rxq, skb); + + /* Filled the batch queue? 
*/ + if ( (mcl - rx_mcl) == ARRAY_SIZE(rx_mcl) ) + break; + } + + if ( mcl == rx_mcl ) + return; + + mcl[-2].args[2] = UVMF_FLUSH_TLB; + if ( unlikely(HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl) != 0) ) + BUG(); + + mcl = rx_mcl; + mmu = rx_mmu; + while ( (skb = __skb_dequeue(&rxq)) != NULL ) + { + netif = (netif_t *)skb->dev->priv; + size = skb->tail - skb->data; + + /* Rederive the machine addresses. */ + new_mfn = mcl[0].args[1] >> PAGE_SHIFT; + mdata = ((mmu[2].ptr & PAGE_MASK) | + ((unsigned long)skb->data & ~PAGE_MASK)); + + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; + + netif->stats.tx_bytes += size; + netif->stats.tx_packets++; + + /* The update_va_mapping() must not fail. */ + if ( unlikely(mcl[0].args[5] != 0) ) + BUG(); + + /* Check the reassignment error code. */ + status = NETIF_RSP_OKAY; + if ( unlikely(mcl[1].args[5] != 0) ) + { + DPRINTK("Failed MMU update transferring to DOM%u\n", netif->domid); + free_mfn(mdata >> PAGE_SHIFT); + status = NETIF_RSP_ERROR; + } + + evtchn = netif->evtchn; + id = netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_resp_prod)].req.id; + if ( make_rx_response(netif, id, status, mdata, size) && + (rx_notify[evtchn] == 0) ) + { + rx_notify[evtchn] = 1; + notify_list[notify_nr++] = evtchn; + } + + dev_kfree_skb(skb); + + mcl += 2; + mmu += 3; + } + + while ( notify_nr != 0 ) + { + evtchn = notify_list[--notify_nr]; + rx_notify[evtchn] = 0; + notify_via_evtchn(evtchn); + } + + /* More work to do? 
*/ + if ( !skb_queue_empty(&rx_queue) && !timer_pending(&net_timer) ) + tasklet_schedule(&net_rx_tasklet); +#if 0 + else + xen_network_done_notify(); +#endif +} + +/* net_timer callback: retry the receive path by rescheduling its tasklet. */ +static void net_alarm(unsigned long unused) +{ + tasklet_schedule(&net_rx_tasklet); +} + +struct net_device_stats *netif_be_get_stats(struct net_device *dev) +{ + netif_t *netif = dev->priv; + return &netif->stats; +} + +/* + * An interface is on net_schedule_list iff its list.next is non-NULL; + * remove_from_net_schedule_list() resets the pointer to NULL on removal. + */ +static int __on_net_schedule_list(netif_t *netif) +{ + return netif->list.next != NULL; +} + +/* + * Unlink 'netif' from net_schedule_list (if present) and drop the reference + * the list held on it. Uses irqsave/irqrestore so that the caller's interrupt + * state is preserved rather than interrupts being unconditionally re-enabled. + */ +static void remove_from_net_schedule_list(netif_t *netif) +{ + unsigned long flags; + + spin_lock_irqsave(&net_schedule_list_lock, flags); + if ( likely(__on_net_schedule_list(netif)) ) + { + list_del(&netif->list); + netif->list.next = NULL; + netif_put(netif); + } + spin_unlock_irqrestore(&net_schedule_list_lock, flags); +} + +/* + * Append 'netif' to net_schedule_list, taking a reference on it, unless it is + * already queued or is not CONNECTED. This is reached from the netif_be_int() + * interrupt handler as well as from the tx tasklet, so the lock is taken with + * irqsave/irqrestore instead of blindly re-enabling interrupts on unlock. + */ +static void add_to_net_schedule_list_tail(netif_t *netif) +{ + unsigned long flags; + + /* Cheap unlocked test first; it is repeated under the lock below. */ + if ( __on_net_schedule_list(netif) ) + return; + + spin_lock_irqsave(&net_schedule_list_lock, flags); + if ( !__on_net_schedule_list(netif) && (netif->status == CONNECTED) ) + { + list_add_tail(&netif->list, &net_schedule_list); + netif_get(netif); + } + spin_unlock_irqrestore(&net_schedule_list_lock, flags); +} + +/* + * If the interface has unconsumed transmit requests and its ring is not full + * of unanswered ones, queue it for service and possibly kick the tx tasklet. + */ +static inline void netif_schedule_work(netif_t *netif) +{ + if ( (netif->tx_req_cons != netif->tx->req_prod) && + ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) + { + add_to_net_schedule_list_tail(netif); + maybe_schedule_tx_action(); + } +} + +void netif_deschedule(netif_t *netif) +{ + remove_from_net_schedule_list(netif); +} + +#if 0 +static void tx_credit_callback(unsigned long data) +{ + netif_t *netif = (netif_t *)data; + netif->remaining_credit = netif->credit_bytes; + netif_schedule_work(netif); +} +#endif + +static void net_tx_action(unsigned long unused) +{ + struct list_head *ent; + struct sk_buff *skb; + netif_t *netif; + netif_tx_request_t txreq; + u16 pending_idx; + NETIF_RING_IDX i; + multicall_entry_t *mcl; + PEND_RING_IDX dc, dp; + unsigned int data_len; + + if ( (dc = dealloc_cons) == (dp = dealloc_prod) ) + goto skip_dealloc; + + mcl
= tx_mcl; + while ( dc != dp ) + { + pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; + mcl[0].op = __HYPERVISOR_update_va_mapping; + mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT; + mcl[0].args[1] = 0; + mcl[0].args[2] = 0; + mcl++; + } + + mcl[-1].args[2] = UVMF_FLUSH_TLB; + if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) ) + BUG(); + + mcl = tx_mcl; + while ( dealloc_cons != dp ) + { + /* The update_va_mapping() must not fail. */ + if ( unlikely(mcl[0].args[5] != 0) ) + BUG(); + + pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)]; + + netif = pending_tx_info[pending_idx].netif; + + spin_lock(&netif->tx_lock); + make_tx_response(netif, pending_tx_info[pending_idx].req.id, + NETIF_RSP_OKAY); + spin_unlock(&netif->tx_lock); + + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + + /* + * Scheduling checks must happen after the above response is posted. + * This avoids a possible race with a guest OS on another CPU if that + * guest is testing against 'resp_prod' when deciding whether to notify + * us when it queues additional packets. + */ + mb(); + if ( (netif->tx_req_cons != netif->tx->req_prod) && + ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) + add_to_net_schedule_list_tail(netif); + + netif_put(netif); + + mcl++; + } + + skip_dealloc: + mcl = tx_mcl; + while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && + !list_empty(&net_schedule_list) ) + { + /* Get a netif from the list with work to do. */ + ent = net_schedule_list.next; + netif = list_entry(ent, netif_t, list); + netif_get(netif); + remove_from_net_schedule_list(netif); + + /* Work to do? */ + i = netif->tx_req_cons; + if ( (i == netif->tx->req_prod) || + ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) ) + { + netif_put(netif); + continue; + } + + netif->tx->req_cons = ++netif->tx_req_cons; + + /* + * 1. Ensure that we see the request when we copy it. + * 2. Ensure that frontend sees updated req_cons before we check + * for more work to schedule. 
+ */ + mb(); + + memcpy(&txreq, &netif->tx->ring[MASK_NETIF_TX_IDX(i)].req, + sizeof(txreq)); + +#if 0 + /* Credit-based scheduling. */ + if ( tx.size > netif->remaining_credit ) + { + s_time_t now = NOW(), next_credit = + netif->credit_timeout.expires + MICROSECS(netif->credit_usec); + if ( next_credit <= now ) + { + netif->credit_timeout.expires = now; + netif->remaining_credit = netif->credit_bytes; + } + else + { + netif->remaining_credit = 0; + netif->credit_timeout.expires = next_credit; + netif->credit_timeout.data = (unsigned long)netif; + netif->credit_timeout.function = tx_credit_callback; + netif->credit_timeout.cpu = smp_processor_id(); + add_ac_timer(&netif->credit_timeout); + break; + } + } + netif->remaining_credit -= tx.size; +#endif + + netif_schedule_work(netif); + + if ( unlikely(txreq.size < ETH_HLEN) || + unlikely(txreq.size > ETH_FRAME_LEN) ) + { + DPRINTK("Bad packet size: %d\n", txreq.size); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + continue; + } + + /* No crossing a page boundary as the payload mustn't fragment. */ + if ( unlikely(((txreq.addr & ~PAGE_MASK) + txreq.size) >= PAGE_SIZE) ) + { + DPRINTK("txreq.addr: %lx, size: %u, end: %lu\n", + txreq.addr, txreq.size, + (txreq.addr &~PAGE_MASK) + txreq.size); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + continue; + } + + pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + + data_len = txreq.size > PKT_PROT_LEN ? PKT_PROT_LEN : txreq.size; + + if ( unlikely((skb = alloc_skb(data_len+16, GFP_ATOMIC)) == NULL) ) + { + DPRINTK("Can't allocate a skb in start_xmit.\n"); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + break; + } + + /* Packets passed to netif_rx() must have some headroom. 
*/ + skb_reserve(skb, 16); + + mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain; + mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT; + mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL; + mcl[0].args[2] = 0; + mcl[0].args[3] = netif->domid; + mcl++; + + memcpy(&pending_tx_info[pending_idx].req, &txreq, sizeof(txreq)); + pending_tx_info[pending_idx].netif = netif; + *((u16 *)skb->data) = pending_idx; + + __skb_queue_tail(&tx_queue, skb); + + pending_cons++; + + /* Filled the batch queue? */ + if ( (mcl - tx_mcl) == ARRAY_SIZE(tx_mcl) ) + break; + } + + if ( mcl == tx_mcl ) + return; + + if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) ) + BUG(); + + mcl = tx_mcl; + while ( (skb = __skb_dequeue(&tx_queue)) != NULL ) + { + pending_idx = *((u16 *)skb->data); + netif = pending_tx_info[pending_idx].netif; + memcpy(&txreq, &pending_tx_info[pending_idx].req, sizeof(txreq)); + + /* Check the remap error code. */ + if ( unlikely(mcl[0].args[5] != 0) ) + { + DPRINTK("Bad page frame\n"); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + kfree_skb(skb); + mcl++; + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + continue; + } + + phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] = + FOREIGN_FRAME(txreq.addr >> PAGE_SHIFT); + + data_len = txreq.size > PKT_PROT_LEN ? PKT_PROT_LEN : txreq.size; + + __skb_put(skb, data_len); + memcpy(skb->data, + (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)), + data_len); + + if (data_len < txreq.size) { + /* Append the packet payload as a fragment. 
*/ + skb_shinfo(skb)->frags[0].page = + virt_to_page(MMAP_VADDR(pending_idx)); + skb_shinfo(skb)->frags[0].size = txreq.size - data_len; + skb_shinfo(skb)->frags[0].page_offset = + (txreq.addr + data_len) & ~PAGE_MASK; + skb_shinfo(skb)->nr_frags = 1; + } else { + /* + * Whole packet was copied, so no fragment is attached. frags[0].page is + * only a stash from which netif_skb_release() recovers pending_idx. + */ + skb_shinfo(skb)->frags[0].page = + virt_to_page(MMAP_VADDR(pending_idx)); + skb->destructor = netif_skb_release; + } + + skb->data_len = txreq.size - data_len; + skb->len += skb->data_len; + + skb->dev = netif->dev; + skb->protocol = eth_type_trans(skb, skb->dev); + + netif->stats.rx_bytes += txreq.size; + netif->stats.rx_packets++; + + netif_rx(skb); + netif->dev->last_rx = jiffies; + + mcl++; + } +} + +/* + * Queue 'pending_idx' on dealloc_ring (under dealloc_lock) and schedule the tx + * tasklet, whose dealloc pass unmaps the slot and posts the tx response. + */ +static void netif_idx_release(u16 pending_idx) +{ + unsigned long flags; + + spin_lock_irqsave(&dealloc_lock, flags); + dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx; + spin_unlock_irqrestore(&dealloc_lock, flags); + + tasklet_schedule(&net_tx_tasklet); +} + +/* + * Release callback for a foreign page (registered with SetPageForeign() in + * netback_init()). The pending index is the page's offset from mmap_vstart. + */ +static void netif_page_release(struct page *page) +{ + u16 pending_idx = page - virt_to_page(mmap_vstart); + + /* Ready for next use. */ + set_page_count(page, 1); + + netif_idx_release(pending_idx); +} + +/* + * skb destructor used when the packet was copied in full: recover the pending + * index from the page stashed in frags[0].page and release it. + */ +static void netif_skb_release(struct sk_buff *skb) +{ + struct page *page = skb_shinfo(skb)->frags[0].page; + u16 pending_idx = page - virt_to_page(mmap_vstart); + + netif_idx_release(pending_idx); +} + +#if 0 +long flush_bufs_for_netif(netif_t *netif) +{ + NETIF_RING_IDX i; + + /* Return any outstanding receive buffers to the guest OS. */ + spin_lock(&netif->rx_lock); + for ( i = netif->rx_req_cons; + (i != netif->rx->req_prod) && + ((i-netif->rx_resp_prod) != NETIF_RX_RING_SIZE); + i++ ) + { + make_rx_response(netif, + netif->rx->ring[MASK_NETIF_RX_IDX(i)].req.id, + NETIF_RSP_DROPPED, 0, 0); + } + netif->rx_req_cons = i; + spin_unlock(&netif->rx_lock); + + /* + * Flush pending transmit buffers. The guest may still have to wait for + * buffers that are queued at a physical NIC.
+ */ + spin_lock(&netif->tx_lock); + for ( i = netif->tx_req_cons; + (i != netif->tx->req_prod) && + ((i-netif->tx_resp_prod) != NETIF_TX_RING_SIZE); + i++ ) + { + make_tx_response(netif, + netif->tx->ring[MASK_NETIF_TX_IDX(i)].req.id, + NETIF_RSP_DROPPED); + } + netif->tx_req_cons = i; + spin_unlock(&netif->tx_lock); + + return 0; +} +#endif + +/* + * Interrupt handler for an interface ('dev_id' is the netif_t): if the guest + * has transmit work outstanding, queue the interface and maybe kick the tx + * tasklet. + */ +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + netif_t *netif = dev_id; + if ( tx_work_exists(netif) ) + { + add_to_net_schedule_list_tail(netif); + maybe_schedule_tx_action(); + } + return IRQ_HANDLED; +} + +/* + * Post a transmit response (id, status) in the next response slot of the tx + * ring, publish the new resp_prod, and notify the guest over its event channel + * if the new producer index equals the guest's requested 'event' index. + */ +static void make_tx_response(netif_t *netif, + u16 id, + s8 st) +{ + NETIF_RING_IDX i = netif->tx_resp_prod; + netif_tx_response_t *resp; + + resp = &netif->tx->ring[MASK_NETIF_TX_IDX(i)].resp; + resp->id = id; + resp->status = st; + wmb(); /* Response contents must be visible before the producer index. */ + netif->tx->resp_prod = netif->tx_resp_prod = ++i; + + mb(); /* Update producer before checking event threshold. */ + if ( i == netif->tx->event ) + notify_via_evtchn(netif->evtchn); +} + +/* + * Post a receive response in the next response slot of the rx ring. The + * 'status' field carries the packet size, or 'st' itself when 'st' is + * negative. Unlike make_tx_response() this does not notify: it returns + * nonzero when the new producer index equals the guest's 'event' index, and + * leaves the notification to the caller. + */ +static int make_rx_response(netif_t *netif, + u16 id, + s8 st, + memory_t addr, + u16 size) +{ + NETIF_RING_IDX i = netif->rx_resp_prod; + netif_rx_response_t *resp; + + resp = &netif->rx->ring[MASK_NETIF_RX_IDX(i)].resp; + resp->addr = addr; + resp->id = id; + resp->status = (s16)size; + if ( st < 0 ) + resp->status = (s16)st; + wmb(); /* Response contents must be visible before the producer index. */ + netif->rx->resp_prod = netif->rx_resp_prod = ++i; + + mb(); /* Update producer before checking event threshold.
*/ + return (i == netif->rx->event); +} + +/* + * Debug interrupt handler (bound to VIRQ_DEBUG in netback_init()): dump the + * private and shared ring indexes of every interface on net_schedule_list. + * The list lock is taken with irqsave/irqrestore because this runs in + * interrupt context and must not unconditionally re-enable interrupts. + */ +static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs) +{ + struct list_head *ent; + netif_t *netif; + unsigned long flags; + int i = 0; + + printk(KERN_ALERT "netif_schedule_list:\n"); + spin_lock_irqsave(&net_schedule_list_lock, flags); + + list_for_each ( ent, &net_schedule_list ) + { + netif = list_entry(ent, netif_t, list); + printk(KERN_ALERT " %d: private(rx_req_cons=%08x rx_resp_prod=%08x\n", + i, netif->rx_req_cons, netif->rx_resp_prod); + printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n", + netif->tx_req_cons, netif->tx_resp_prod); + printk(KERN_ALERT " shared(rx_req_prod=%08x rx_resp_prod=%08x\n", + netif->rx->req_prod, netif->rx->resp_prod); + printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n", + netif->rx->event, netif->tx->req_prod); + printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n", + netif->tx->resp_prod, netif->tx->event); + i++; + } + + spin_unlock_irqrestore(&net_schedule_list_lock, flags); + printk(KERN_ALERT " ** End of netif_schedule_list **\n"); + + return IRQ_HANDLED; +} + +static int __init netback_init(void) +{ + int i; + struct page *page; + + if ( !(xen_start_info.flags & SIF_NET_BE_DOMAIN) && + !(xen_start_info.flags & SIF_INITDOMAIN) ) + return 0; + + printk("Initialising Xen netif backend\n"); + + /* We can increase reservation by this much in net_rx_action().
*/ + balloon_update_driver_allowance(NETIF_RX_RING_SIZE); + + skb_queue_head_init(&rx_queue); + skb_queue_head_init(&tx_queue); + + init_timer(&net_timer); + net_timer.data = 0; + net_timer.function = net_alarm; + + netif_interface_init(); + + if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 ) + BUG(); + + for ( i = 0; i < MAX_PENDING_REQS; i++ ) + { + page = virt_to_page(MMAP_VADDR(i)); + set_page_count(page, 1); + SetPageForeign(page, netif_page_release); + } + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + for ( i = 0; i < MAX_PENDING_REQS; i++ ) + pending_ring[i] = i; + + spin_lock_init(&net_schedule_list_lock); + INIT_LIST_HEAD(&net_schedule_list); + + netif_ctrlif_init(); + + (void)request_irq(bind_virq_to_irq(VIRQ_DEBUG), + netif_be_dbg, SA_SHIRQ, + "net-be-dbg", &netif_be_dbg); + + return 0; +} + +static void netback_cleanup(void) +{ + BUG(); +} + +module_init(netback_init); +module_exit(netback_cleanup); diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/netfront/Kconfig tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netfront/Kconfig --- pristine-linux-2.6.10-rc3/drivers/xen/netfront/Kconfig 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netfront/Kconfig 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,6 @@ + +config XENNET + tristate "Xen network driver" + depends on NETDEVICES && ARCH_XEN + help + Network driver for Xen diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/netfront/Makefile tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netfront/Makefile --- pristine-linux-2.6.10-rc3/drivers/xen/netfront/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netfront/Makefile 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,2 @@ + +obj-y := netfront.o diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/netfront/netfront.c tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netfront/netfront.c --- pristine-linux-2.6.10-rc3/drivers/xen/netfront/netfront.c 1970-01-01 
01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/netfront/netfront.c 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,1345 @@ +/****************************************************************************** + * Virtual network driver for conversing with remote driver backends. + * + * Copyright (c) 2002-2004, K A Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define DEBUG 0 + +#ifndef __GFP_NOWARN +#define __GFP_NOWARN 0 +#endif +#define alloc_xen_skb(_l) __dev_alloc_skb((_l), GFP_ATOMIC|__GFP_NOWARN) + +#define init_skb_shinfo(_skb) \ + do { \ + atomic_set(&(skb_shinfo(_skb)->dataref), 1); \ + skb_shinfo(_skb)->nr_frags = 0; \ + skb_shinfo(_skb)->frag_list = NULL; \ + } while ( 0 ) + +/* Allow headroom on each rx pkt for Ethernet header, alignment padding, ... */ +#define RX_HEADROOM 200 + +/* + * If the backend driver is pipelining transmit requests then we can be very + * aggressive in avoiding new-packet notifications -- only need to send a + * notification if there are no outstanding unreceived responses. + * If the backend may be buffering our transmit buffers for any reason then we + * are rather more conservative. + */ +#ifdef CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER +#define TX_TEST_IDX resp_prod /* aggressive: any outstanding responses? */ +#else +#define TX_TEST_IDX req_cons /* conservative: not seen all our requests? 
*/ +#endif + +static void network_tx_buf_gc(struct net_device *dev); +static void network_alloc_rx_buffers(struct net_device *dev); + +/* + * Scratch arrays for batched hypercalls on the receive path. rx_mcl has one + * entry more than the ring size because each batch of per-buffer + * update_va_mapping calls is followed by a single extra call (dom_mem_op when + * giving pages away, mmu_update when remapping received pages). + */ +static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE]; +static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1]; +static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE]; + +static struct list_head dev_list; + +/* Per-interface private state, reached through dev->priv. */ +struct net_private +{ + struct list_head list; + struct net_device *dev; + + struct net_device_stats stats; + NETIF_RING_IDX rx_resp_cons, tx_resp_cons; + unsigned int tx_full; + + netif_tx_interface_t *tx; + netif_rx_interface_t *rx; + + spinlock_t tx_lock; + spinlock_t rx_lock; + + unsigned int handle; + unsigned int evtchn; + unsigned int irq; + + /* What is the status of our connection to the remote backend? */ +#define BEST_CLOSED 0 +#define BEST_DISCONNECTED 1 +#define BEST_CONNECTED 2 + unsigned int backend_state; + + /* Is this interface open or closed (up or down)? */ +#define UST_CLOSED 0 +#define UST_OPEN 1 + unsigned int user_state; + + /* Receive-ring batched refills. */ +#define RX_MIN_TARGET 8 +#define RX_MAX_TARGET NETIF_RX_RING_SIZE + int rx_target; + struct sk_buff_head rx_batch; + + /* + * {tx,rx}_skbs store outstanding skbuffs. The first entry in each + * array is an index into a chain of free entries. + */ + struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1]; + struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1]; +}; + +/* Access macros for acquiring/freeing slots in {tx,rx}_skbs[].
*/ +#define ADD_ID_TO_FREELIST(_list, _id) \ + (_list)[(_id)] = (_list)[0]; \ + (_list)[0] = (void *)(unsigned long)(_id); +#define GET_ID_FROM_FREELIST(_list) \ + ({ unsigned long _id = (unsigned long)(_list)[0]; \ + (_list)[0] = (_list)[_id]; \ + (unsigned short)_id; }) + +static char *status_name[] = { + [NETIF_INTERFACE_STATUS_CLOSED] = "closed", + [NETIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected", + [NETIF_INTERFACE_STATUS_CONNECTED] = "connected", + [NETIF_INTERFACE_STATUS_CHANGED] = "changed", +}; + +static char *be_state_name[] = { + [BEST_CLOSED] = "closed", + [BEST_DISCONNECTED] = "disconnected", + [BEST_CONNECTED] = "connected", +}; + +#if DEBUG +#define DPRINTK(fmt, args...) \ + printk(KERN_ALERT "xen_net (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args) +#else +#define DPRINTK(fmt, args...) ((void)0) +#endif +#define IPRINTK(fmt, args...) \ + printk(KERN_INFO "xen_net: " fmt, ##args) +#define WPRINTK(fmt, args...) \ + printk(KERN_WARNING "xen_net: " fmt, ##args) + +static struct net_device *find_dev_by_handle(unsigned int handle) +{ + struct list_head *ent; + struct net_private *np; + list_for_each ( ent, &dev_list ) + { + np = list_entry(ent, struct net_private, list); + if ( np->handle == handle ) + return np->dev; + } + return NULL; +} + +/** Network interface info. */ +struct netif_ctrl { + /** Number of interfaces. */ + int interface_n; + /** Number of connected interfaces. */ + int connected_n; + /** Error code. */ + int err; + int up; +}; + +static struct netif_ctrl netctrl; + +static void netctrl_init(void) +{ + memset(&netctrl, 0, sizeof(netctrl)); + netctrl.up = NETIF_DRIVER_STATUS_DOWN; +} + +/** Get or set a network interface error. + */ +static int netctrl_err(int err) +{ + if ( (err < 0) && !netctrl.err ) + netctrl.err = err; + return netctrl.err; +} + +/** Test if all network interfaces are connected. 
+ * + * @return 1 if all connected, 0 if not, negative error code otherwise + */ +static int netctrl_connected(void) +{ + int ok; + + if ( netctrl.err ) + ok = netctrl.err; + else if ( netctrl.up == NETIF_DRIVER_STATUS_UP ) + ok = (netctrl.connected_n == netctrl.interface_n); + else + ok = 0; + + return ok; +} + +/** Count the connected network interfaces and cache the result in + * netctrl.connected_n. + * + * @return connected count + */ +static int netctrl_connected_count(void) +{ + + struct list_head *ent; + struct net_private *np; + unsigned int connected; + + connected = 0; + + list_for_each(ent, &dev_list) { + np = list_entry(ent, struct net_private, list); + if (np->backend_state == BEST_CONNECTED) + connected++; + } + + netctrl.connected_n = connected; + DPRINTK("> connected_n=%d interface_n=%d\n", + netctrl.connected_n, netctrl.interface_n); + return connected; +} + +/** Send a packet on a net device to encourage switches to learn the + * MAC. We send a fake ARP reply (ARPOP_REPLY) addressed to the broadcast + * IP address. + * + * @param dev device + * @return -ENOMEM if the packet cannot be created, otherwise the result of + * dev_queue_xmit() + */ +static int vif_wake(struct net_device *dev) +{ + struct sk_buff *skb; + u32 src_ip, dst_ip; + + dst_ip = INADDR_BROADCAST; + src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK); + + skb = arp_create(ARPOP_REPLY, ETH_P_ARP, + dst_ip, dev, src_ip, + /*dst_hw*/ NULL, /*src_hw*/ NULL, + /*target_hw*/ dev->dev_addr); + if ( skb == NULL ) + return -ENOMEM; + + return dev_queue_xmit(skb); +} + +/* + * Open the interface: reset statistics, mark it UST_OPEN, post receive + * buffers, ask for an event on the next rx response, and start the tx queue. + */ +static int network_open(struct net_device *dev) +{ + struct net_private *np = dev->priv; + + memset(&np->stats, 0, sizeof(np->stats)); + + np->user_state = UST_OPEN; + + network_alloc_rx_buffers(dev); + np->rx->event = np->rx_resp_cons + 1; + + netif_start_queue(dev); + + return 0; +} + +/* + * Free the skbs of all transmit requests the backend has responded to, and + * return their ids to the tx_skbs free list. Does nothing unless the backend + * is connected. + */ +static void network_tx_buf_gc(struct net_device *dev) +{ + NETIF_RING_IDX i, prod; + unsigned short id; + struct net_private *np = dev->priv; + struct sk_buff *skb; + + if ( np->backend_state != BEST_CONNECTED ) + return; + + do { + prod = np->tx->resp_prod; + rmb(); /* Ensure we see
responses up to 'prod'. */ + + for ( i = np->tx_resp_cons; i != prod; i++ ) + { + id = np->tx->ring[MASK_NETIF_TX_IDX(i)].resp.id; + skb = np->tx_skbs[id]; + ADD_ID_TO_FREELIST(np->tx_skbs, id); + dev_kfree_skb_irq(skb); + } + + np->tx_resp_cons = prod; + + /* + * Set a new event, then check for race with update of resp_prod. Note + * that it is essential to schedule a callback, no matter how few + * buffers are pending. Even if there is space in the transmit ring, + * higher layers may be blocked because too much data is outstanding: + * in such cases notification from Xen is likely to be the only kick + * that we'll get. + * + * The event index is placed just past the midpoint of the requests + * that are still awaiting a response. + */ + np->tx->event = + prod + ((np->tx->req_prod - prod) >> 1) + 1; + mb(); + } + while ( prod != np->tx->resp_prod ); + + /* Restart a stopped queue once the ring is no longer completely full. */ + if ( np->tx_full && + ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) ) + { + np->tx_full = 0; + if ( np->user_state == UST_OPEN ) + netif_wake_queue(dev); + } +} + + +/* + * Top up the receive ring. skbs are accumulated on np->rx_batch and only + * pushed to the backend once at least rx_target/2 of them are queued. Does + * nothing unless the backend is connected. + */ +static void network_alloc_rx_buffers(struct net_device *dev) +{ + unsigned short id; + struct net_private *np = dev->priv; + struct sk_buff *skb; + int i, batch_target; + NETIF_RING_IDX req_prod = np->rx->req_prod; + + if ( unlikely(np->backend_state != BEST_CONNECTED) ) + return; + + /* + * Allocate skbuffs greedily, even though we batch updates to the + * receive ring. This creates a less bursty demand on the memory allocator, + * so should reduce the chance of failed allocation requests both for + * ourselves and for other kernel subsystems. + */ + batch_target = np->rx_target - (req_prod - np->rx_resp_cons); + for ( i = skb_queue_len(&np->rx_batch); i < batch_target; i++ ) + { + if ( unlikely((skb = alloc_xen_skb(dev->mtu + RX_HEADROOM)) == NULL) ) + break; + __skb_queue_tail(&np->rx_batch, skb); + } + + /* Is the batch large enough to be worthwhile?
*/ + if ( i < (np->rx_target/2) ) + return; + + /* Drain the whole batch; on exit 'i' is the number of buffers posted. */ + for ( i = 0; ; i++ ) + { + if ( (skb = __skb_dequeue(&np->rx_batch)) == NULL ) + break; + + skb->dev = dev; + + id = GET_ID_FROM_FREELIST(np->rx_skbs); + + np->rx_skbs[id] = skb; + + np->rx->ring[MASK_NETIF_RX_IDX(req_prod + i)].req.id = id; + + rx_pfn_array[i] = virt_to_machine(skb->head) >> PAGE_SHIFT; + + /* Remove this page from pseudo phys map before passing back to Xen. */ + phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] + = INVALID_P2M_ENTRY; + + /* Queue a call that clears the PTE mapping this buffer's page. */ + rx_mcl[i].op = __HYPERVISOR_update_va_mapping; + rx_mcl[i].args[0] = (unsigned long)skb->head >> PAGE_SHIFT; + rx_mcl[i].args[1] = 0; + rx_mcl[i].args[2] = 0; + } + + /* + * We may have allocated buffers which have entries outstanding in the page + * update queue -- make sure we flush those first! + */ + flush_page_update_queue(); + + /* After all PTEs have been zapped we blow away stale TLB entries. */ + rx_mcl[i-1].args[2] = UVMF_FLUSH_TLB; + + /* Give away a batch of pages (this uses the extra, final rx_mcl slot). */ + rx_mcl[i].op = __HYPERVISOR_dom_mem_op; + rx_mcl[i].args[0] = MEMOP_decrease_reservation; + rx_mcl[i].args[1] = (unsigned long)rx_pfn_array; + rx_mcl[i].args[2] = (unsigned long)i; + rx_mcl[i].args[3] = 0; + rx_mcl[i].args[4] = DOMID_SELF; + + /* Tell the balloon driver what is going on. */ + balloon_update_driver_allowance(i); + + /* Zap PTEs and give away pages in one big multicall. */ + (void)HYPERVISOR_multicall(rx_mcl, i+1); + + /* Check return status of HYPERVISOR_dom_mem_op(). */ + if ( unlikely(rx_mcl[i].args[5] != i) ) + panic("Unable to reduce memory reservation\n"); + + /* Above is a suitable barrier to ensure backend will see requests. */ + np->rx->req_prod = req_prod + i; + + /* Adjust our floating fill target if we risked running out of buffers.
*/ + if ( ((req_prod - np->rx->resp_prod) < (np->rx_target / 4)) && + ((np->rx_target *= 2) > RX_MAX_TARGET) ) + np->rx_target = RX_MAX_TARGET; +} + + +static int network_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + unsigned short id; + struct net_private *np = (struct net_private *)dev->priv; + netif_tx_request_t *tx; + NETIF_RING_IDX i; + + if ( unlikely(np->tx_full) ) + { + printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name); + netif_stop_queue(dev); + goto drop; + } + + if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >= + PAGE_SIZE) ) + { + struct sk_buff *nskb; + if ( unlikely((nskb = alloc_xen_skb(skb->len)) == NULL) ) + goto drop; + skb_put(nskb, skb->len); + memcpy(nskb->data, skb->data, skb->len); + nskb->dev = skb->dev; + dev_kfree_skb(skb); + skb = nskb; + } + + spin_lock_irq(&np->tx_lock); + + if ( np->backend_state != BEST_CONNECTED ) + { + spin_unlock_irq(&np->tx_lock); + goto drop; + } + + i = np->tx->req_prod; + + id = GET_ID_FROM_FREELIST(np->tx_skbs); + np->tx_skbs[id] = skb; + + tx = &np->tx->ring[MASK_NETIF_TX_IDX(i)].req; + + tx->id = id; + tx->addr = virt_to_machine(skb->data); + tx->size = skb->len; + + wmb(); /* Ensure that backend will see the request. */ + np->tx->req_prod = i + 1; + + network_tx_buf_gc(dev); + + if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) ) + { + np->tx_full = 1; + netif_stop_queue(dev); + } + + spin_unlock_irq(&np->tx_lock); + + np->stats.tx_bytes += skb->len; + np->stats.tx_packets++; + + /* Only notify Xen if we really have to. 
*/ + mb(); + if ( np->tx->TX_TEST_IDX == i ) + notify_via_evtchn(np->evtchn); + + return 0; + + drop: + np->stats.tx_dropped++; + dev_kfree_skb(skb); + return 0; +} + + +static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + struct net_device *dev = dev_id; + struct net_private *np = dev->priv; + unsigned long flags; + + spin_lock_irqsave(&np->tx_lock, flags); + network_tx_buf_gc(dev); + spin_unlock_irqrestore(&np->tx_lock, flags); + + if ( (np->rx_resp_cons != np->rx->resp_prod) && + (np->user_state == UST_OPEN) ) + netif_rx_schedule(dev); + + return IRQ_HANDLED; +} + + +static int netif_poll(struct net_device *dev, int *pbudget) +{ + struct net_private *np = dev->priv; + struct sk_buff *skb, *nskb; + netif_rx_response_t *rx; + NETIF_RING_IDX i, rp; + mmu_update_t *mmu = rx_mmu; + multicall_entry_t *mcl = rx_mcl; + int work_done, budget, more_to_do = 1; + struct sk_buff_head rxq; + unsigned long flags; + + spin_lock(&np->rx_lock); + + if ( np->backend_state != BEST_CONNECTED ) + { + spin_unlock(&np->rx_lock); + return 0; + } + + skb_queue_head_init(&rxq); + + if ( (budget = *pbudget) > dev->quota ) + budget = dev->quota; + + rp = np->rx->resp_prod; + rmb(); /* Ensure we see queued responses up to 'rp'. */ + + for ( i = np->rx_resp_cons, work_done = 0; + (i != rp) && (work_done < budget); + i++, work_done++ ) + { + rx = &np->rx->ring[MASK_NETIF_RX_IDX(i)].resp; + + /* + * An error here is very odd. Usually indicates a backend bug, + * low-memory condition, or that we didn't have reservation headroom. + */ + if ( unlikely(rx->status <= 0) ) + { + if ( net_ratelimit() ) + printk(KERN_WARNING "Bad rx buffer (memory squeeze?).\n"); + np->rx->ring[MASK_NETIF_RX_IDX(np->rx->req_prod)].req.id = rx->id; + wmb(); + np->rx->req_prod++; + work_done--; + continue; + } + + skb = np->rx_skbs[rx->id]; + ADD_ID_TO_FREELIST(np->rx_skbs, rx->id); + + /* NB. We handle skb overflow later. 
*/ + skb->data = skb->head + (rx->addr & ~PAGE_MASK); + skb->len = rx->status; + skb->tail = skb->data + skb->len; + + np->stats.rx_packets++; + np->stats.rx_bytes += rx->status; + + /* Remap the page. */ + mmu->ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE; + mmu->val = __pa(skb->head) >> PAGE_SHIFT; + mmu++; + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT; + mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL; + mcl->args[2] = 0; + mcl++; + + phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = + rx->addr >> PAGE_SHIFT; + + __skb_queue_tail(&rxq, skb); + } + + /* Some pages are no longer absent... */ + balloon_update_driver_allowance(-work_done); + + /* Do all the remapping work, and M->P updates, in one big hypercall. */ + if ( likely((mcl - rx_mcl) != 0) ) + { + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)rx_mmu; + mcl->args[1] = mmu - rx_mmu; + mcl->args[2] = 0; + mcl++; + (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl); + } + + while ( (skb = __skb_dequeue(&rxq)) != NULL ) + { + /* + * Enough room in skbuff for the data we were passed? Also, Linux + * expects at least 16 bytes headroom in each receive buffer. + */ + if ( unlikely(skb->tail > skb->end) || + unlikely((skb->data - skb->head) < 16) ) + { + nskb = NULL; + + /* Only copy the packet if it fits in the current MTU. */ + if ( skb->len <= (dev->mtu + ETH_HLEN) ) + { + if ( (skb->tail > skb->end) && net_ratelimit() ) + printk(KERN_INFO "Received packet needs %d bytes more " + "headroom.\n", skb->tail - skb->end); + + if ( (nskb = alloc_xen_skb(skb->len + 2)) != NULL ) + { + skb_reserve(nskb, 2); + skb_put(nskb, skb->len); + memcpy(nskb->data, skb->data, skb->len); + nskb->dev = skb->dev; + } + } + else if ( net_ratelimit() ) + printk(KERN_INFO "Received packet too big for MTU " + "(%d > %d)\n", skb->len - ETH_HLEN, dev->mtu); + + /* Reinitialise and then destroy the old skbuff. 
*/ + skb->len = 0; + skb->tail = skb->data; + init_skb_shinfo(skb); + dev_kfree_skb(skb); + + /* Switch old for new, if we copied the buffer. */ + if ( (skb = nskb) == NULL ) + continue; + } + + /* Set the shared-info area, which is hidden behind the real data. */ + init_skb_shinfo(skb); + + /* Ethernet-specific work. Delayed to here as it peeks the header. */ + skb->protocol = eth_type_trans(skb, dev); + + /* Pass it up. */ + netif_receive_skb(skb); + dev->last_rx = jiffies; + } + + np->rx_resp_cons = i; + + /* If we get a callback with very few responses, reduce fill target. */ + /* NB. Note exponential increase, linear decrease. */ + if ( ((np->rx->req_prod - np->rx->resp_prod) > ((3*np->rx_target) / 4)) && + (--np->rx_target < RX_MIN_TARGET) ) + np->rx_target = RX_MIN_TARGET; + + network_alloc_rx_buffers(dev); + + *pbudget -= work_done; + dev->quota -= work_done; + + if ( work_done < budget ) + { + local_irq_save(flags); + + np->rx->event = i + 1; + + /* Deal with hypervisor racing our resetting of rx_event. */ + mb(); + if ( np->rx->resp_prod == i ) + { + __netif_rx_complete(dev); + more_to_do = 0; + } + + local_irq_restore(flags); + } + + spin_unlock(&np->rx_lock); + + return more_to_do; +} + + +static int network_close(struct net_device *dev) +{ + struct net_private *np = dev->priv; + np->user_state = UST_CLOSED; + netif_stop_queue(np->dev); + return 0; +} + + +static struct net_device_stats *network_get_stats(struct net_device *dev) +{ + struct net_private *np = (struct net_private *)dev->priv; + return &np->stats; +} + + +static void network_connect(struct net_device *dev, + netif_fe_interface_status_t *status) +{ + struct net_private *np; + int i, requeue_idx; + netif_tx_request_t *tx; + + np = dev->priv; + spin_lock_irq(&np->tx_lock); + spin_lock(&np->rx_lock); + + /* Recovery procedure: */ + + /* Step 1: Reinitialise variables. 
*/ + np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0; + np->rx->event = np->tx->event = 1; + + /* Step 2: Rebuild the RX and TX ring contents. + * NB. We could just free the queued TX packets now but we hope + * that sending them out might do some good. We have to rebuild + * the RX ring because some of our pages are currently flipped out + * so we can't just free the RX skbs. + * NB2. Freelist index entries are always going to be less than + * __PAGE_OFFSET, whereas pointers to skbs will always be equal or + * greater than __PAGE_OFFSET: we use this property to distinguish + * them. + */ + + /* Rebuild the TX buffer freelist and the TX ring itself. + * NB. This reorders packets. We could keep more private state + * to avoid this but maybe it doesn't matter so much given the + * interface has been down. + */ + for ( requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++ ) + { + if ( (unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET ) + { + struct sk_buff *skb = np->tx_skbs[i]; + + tx = &np->tx->ring[requeue_idx++].req; + + tx->id = i; + tx->addr = virt_to_machine(skb->data); + tx->size = skb->len; + + np->stats.tx_bytes += skb->len; + np->stats.tx_packets++; + } + } + wmb(); + np->tx->req_prod = requeue_idx; + + /* Rebuild the RX buffer freelist and the RX ring itself. */ + for ( requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++ ) + if ( (unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET ) + np->rx->ring[requeue_idx++].req.id = i; + wmb(); + np->rx->req_prod = requeue_idx; + + /* Step 3: All public and private state should now be sane. Get + * ready to start sending and receiving packets and give the driver + * domain a kick because we've probably just requeued some + * packets. 
+ */ + np->backend_state = BEST_CONNECTED; + wmb(); + notify_via_evtchn(status->evtchn); + network_tx_buf_gc(dev); + + if ( np->user_state == UST_OPEN ) + netif_start_queue(dev); + + spin_unlock(&np->rx_lock); + spin_unlock_irq(&np->tx_lock); +} + +static void vif_show(struct net_private *np) +{ +#if DEBUG + if (np) { + IPRINTK("\n", + np->handle, + be_state_name[np->backend_state], + np->user_state ? "open" : "closed", + np->evtchn, + np->irq, + np->tx, + np->rx); + } else { + IPRINTK("\n"); + } +#endif +} + +/* Send a connect message to xend to tell it to bring up the interface. */ +static void send_interface_connect(struct net_private *np) +{ + ctrl_msg_t cmsg = { + .type = CMSG_NETIF_FE, + .subtype = CMSG_NETIF_FE_INTERFACE_CONNECT, + .length = sizeof(netif_fe_interface_connect_t), + }; + netif_fe_interface_connect_t *msg = (void*)cmsg.msg; + + DPRINTK(">\n"); vif_show(np); + msg->handle = np->handle; + msg->tx_shmem_frame = (virt_to_machine(np->tx) >> PAGE_SHIFT); + msg->rx_shmem_frame = (virt_to_machine(np->rx) >> PAGE_SHIFT); + + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); + DPRINTK("<\n"); +} + +/* Send a driver status notification to the domain controller. */ +static int send_driver_status(int ok) +{ + int err = 0; + ctrl_msg_t cmsg = { + .type = CMSG_NETIF_FE, + .subtype = CMSG_NETIF_FE_DRIVER_STATUS, + .length = sizeof(netif_fe_driver_status_t), + }; + netif_fe_driver_status_t *msg = (void*)cmsg.msg; + + msg->status = (ok ? NETIF_DRIVER_STATUS_UP : NETIF_DRIVER_STATUS_DOWN); + err = ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); + return err; +} + +/* Stop network device and free tx/rx queues and irq. + */ +static void vif_release(struct net_private *np) +{ + /* Stop old i/f to prevent errors whilst we rebuild the state. 
*/ + spin_lock_irq(&np->tx_lock); + spin_lock(&np->rx_lock); + netif_stop_queue(np->dev); + /* np->backend_state = BEST_DISCONNECTED; */ + spin_unlock(&np->rx_lock); + spin_unlock_irq(&np->tx_lock); + + /* Free resources. */ + if(np->tx != NULL){ + free_irq(np->irq, np->dev); + unbind_evtchn_from_irq(np->evtchn); + free_page((unsigned long)np->tx); + free_page((unsigned long)np->rx); + np->irq = 0; + np->evtchn = 0; + np->tx = NULL; + np->rx = NULL; + } +} + +/* Release vif resources and close it down completely. + */ +static void vif_close(struct net_private *np) +{ + DPRINTK(">\n"); vif_show(np); + WPRINTK("Unexpected netif-CLOSED message in state %s\n", + be_state_name[np->backend_state]); + vif_release(np); + np->backend_state = BEST_CLOSED; + /* todo: take dev down and free. */ + vif_show(np); DPRINTK("<\n"); +} + +/* Move the vif into disconnected state. + * Allocates tx/rx pages. + * Sends connect message to xend. + */ +static void vif_disconnect(struct net_private *np){ + DPRINTK(">\n"); + if(np->tx) free_page((unsigned long)np->tx); + if(np->rx) free_page((unsigned long)np->rx); + // Before this np->tx and np->rx had better be null. + np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL); + np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL); + memset(np->tx, 0, PAGE_SIZE); + memset(np->rx, 0, PAGE_SIZE); + np->backend_state = BEST_DISCONNECTED; + send_interface_connect(np); + vif_show(np); DPRINTK("<\n"); +} + +/* Begin interface recovery. + * + * NB. Whilst we're recovering, we turn the carrier state off. We + * take measures to ensure that this device isn't used for + * anything. We also stop the queue for this device. Various + * different approaches (e.g. continuing to buffer packets) have + * been tested but don't appear to improve the overall impact on + * TCP connections. 
+ * + * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery + * is initiated by a special "RESET" message - disconnect could + * just mean we're not allowed to use this interface any more. + */ +static void +vif_reset( + struct net_private *np) +{ + DPRINTK(">\n"); + IPRINTK("Attempting to reconnect network interface: handle=%u\n", + np->handle); + vif_release(np); + vif_disconnect(np); + vif_show(np); DPRINTK("<\n"); +} + +/* Move the vif into connected state. + * Sets the mac and event channel from the message. + * Binds the irq to the event channel. + */ +static void +vif_connect( + struct net_private *np, netif_fe_interface_status_t *status) +{ + struct net_device *dev = np->dev; + DPRINTK(">\n"); + memcpy(dev->dev_addr, status->mac, ETH_ALEN); + network_connect(dev, status); + np->evtchn = status->evtchn; + np->irq = bind_evtchn_to_irq(np->evtchn); + (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, dev->name, dev); + netctrl_connected_count(); + vif_wake(dev); + vif_show(np); DPRINTK("<\n"); +} + + +/** Create a network device. + * @param handle device handle + * @param val return parameter for created device + * @return 0 on success, error code otherwise + */ +static int create_netdev(int handle, struct net_device **val) +{ + int i, err = 0; + struct net_device *dev = NULL; + struct net_private *np = NULL; + + if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL ) + { + printk(KERN_WARNING "%s> alloc_etherdev failed.\n", __FUNCTION__); + err = -ENOMEM; + goto exit; + } + + np = dev->priv; + np->backend_state = BEST_CLOSED; + np->user_state = UST_CLOSED; + np->handle = handle; + + spin_lock_init(&np->tx_lock); + spin_lock_init(&np->rx_lock); + + skb_queue_head_init(&np->rx_batch); + np->rx_target = RX_MIN_TARGET; + + /* Initialise {tx,rx}_skbs to be a free chain containing every entry. 
*/ + for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ ) + np->tx_skbs[i] = (void *)(i+1); + for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ ) + np->rx_skbs[i] = (void *)(i+1); + + dev->open = network_open; + dev->hard_start_xmit = network_start_xmit; + dev->stop = network_close; + dev->get_stats = network_get_stats; + dev->poll = netif_poll; + dev->weight = 64; + + if ( (err = register_netdev(dev)) != 0 ) + { + printk(KERN_WARNING "%s> register_netdev err=%d\n", __FUNCTION__, err); + goto exit; + } + np->dev = dev; + list_add(&np->list, &dev_list); + + exit: + if ( (err != 0) && (dev != NULL ) ) + kfree(dev); + else if ( val != NULL ) + *val = dev; + return err; +} + +/* Get the target interface for a status message. + * Creates the interface when it makes sense. + * The returned interface may be null when there is no error. + * + * @param status status message + * @param np return parameter for interface state + * @return 0 on success, error code otherwise + */ +static int +target_vif( + netif_fe_interface_status_t *status, struct net_private **np) +{ + int err = 0; + struct net_device *dev; + + DPRINTK("> handle=%d\n", status->handle); + if ( status->handle < 0 ) + { + err = -EINVAL; + goto exit; + } + + if ( (dev = find_dev_by_handle(status->handle)) != NULL ) + goto exit; + + if ( status->status == NETIF_INTERFACE_STATUS_CLOSED ) + goto exit; + if ( status->status == NETIF_INTERFACE_STATUS_CHANGED ) + goto exit; + + /* It's a new interface in a good state - create it. */ + DPRINTK("> create device...\n"); + if ( (err = create_netdev(status->handle, &dev)) != 0 ) + goto exit; + + netctrl.interface_n++; + + exit: + if ( np != NULL ) + *np = ((dev && !err) ? dev->priv : NULL); + DPRINTK("< err=%d\n", err); + return err; +} + +/* Handle an interface status message. 
*/ +static void netif_interface_status(netif_fe_interface_status_t *status) +{ + int err = 0; + struct net_private *np = NULL; + + DPRINTK(">\n"); + DPRINTK("> status=%s handle=%d\n", + status_name[status->status], status->handle); + + if ( (err = target_vif(status, &np)) != 0 ) + { + WPRINTK("Invalid netif: handle=%u\n", status->handle); + return; + } + + if ( np == NULL ) + { + DPRINTK("> no vif\n"); + return; + } + + DPRINTK(">\n"); vif_show(np); + + switch ( status->status ) + { + case NETIF_INTERFACE_STATUS_CLOSED: + switch ( np->backend_state ) + { + case BEST_CLOSED: + case BEST_DISCONNECTED: + case BEST_CONNECTED: + vif_close(np); + break; + } + break; + + case NETIF_INTERFACE_STATUS_DISCONNECTED: + switch ( np->backend_state ) + { + case BEST_CLOSED: + vif_disconnect(np); + break; + case BEST_DISCONNECTED: + case BEST_CONNECTED: + vif_reset(np); + break; + } + break; + + case NETIF_INTERFACE_STATUS_CONNECTED: + switch ( np->backend_state ) + { + case BEST_CLOSED: + WPRINTK("Unexpected netif status %s in state %s\n", + status_name[status->status], + be_state_name[np->backend_state]); + vif_disconnect(np); + vif_connect(np, status); + break; + case BEST_DISCONNECTED: + vif_connect(np, status); + break; + } + break; + + case NETIF_INTERFACE_STATUS_CHANGED: + /* + * The domain controller is notifying us that a device has been + * added or removed. + */ + break; + + default: + WPRINTK("Invalid netif status code %d\n", status->status); + break; + } + vif_show(np); + DPRINTK("<\n"); +} + +/* + * Initialize the network control interface. + */ +static void netif_driver_status(netif_fe_driver_status_t *status) +{ + DPRINTK("> status=%d\n", status->status); + netctrl.up = status->status; + //netctrl.interface_n = status->max_handle; + //netctrl.connected_n = 0; + netctrl_connected_count(); +} + +/* Receive handler for control messages. 
*/ +static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + + switch ( msg->subtype ) + { + case CMSG_NETIF_FE_INTERFACE_STATUS: + if ( msg->length != sizeof(netif_fe_interface_status_t) ) + goto error; + netif_interface_status((netif_fe_interface_status_t *) + &msg->msg[0]); + break; + + case CMSG_NETIF_FE_DRIVER_STATUS: + if ( msg->length != sizeof(netif_fe_driver_status_t) ) + goto error; + netif_driver_status((netif_fe_driver_status_t *) + &msg->msg[0]); + break; + + error: + default: + msg->length = 0; + break; + } + + ctrl_if_send_response(msg); +} + + +#if 1 +/* Wait for all interfaces to be connected. + * + * This works OK, but we'd like to use the probing mode (see below). + */ +static int probe_interfaces(void) +{ + int err = 0, conn = 0; + int wait_i, wait_n = 100; + + DPRINTK(">\n"); + + for ( wait_i = 0; wait_i < wait_n; wait_i++) + { + DPRINTK("> wait_i=%d\n", wait_i); + conn = netctrl_connected(); + if(conn) break; + DPRINTK("> schedule_timeout...\n"); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(10); + } + + DPRINTK("> wait finished...\n"); + if ( conn <= 0 ) + { + err = netctrl_err(-ENETDOWN); + WPRINTK("Failed to connect all virtual interfaces: err=%d\n", err); + } + + DPRINTK("< err=%d\n", err); + + return err; +} +#else +/* Probe for interfaces until no more are found. + * + * This is the mode we'd like to use, but at the moment it panics the kernel. 
+*/ +static int probe_interfaces(void) +{ + int err = 0; + int wait_i, wait_n = 100; + ctrl_msg_t cmsg = { + .type = CMSG_NETIF_FE, + .subtype = CMSG_NETIF_FE_INTERFACE_STATUS, + .length = sizeof(netif_fe_interface_status_t), + }; + netif_fe_interface_status_t msg = {}; + ctrl_msg_t rmsg = {}; + netif_fe_interface_status_t *reply = (void*)rmsg.msg; + int state = TASK_UNINTERRUPTIBLE; + u32 query = -1; + + DPRINTK(">\n"); + + netctrl.interface_n = 0; + for ( wait_i = 0; wait_i < wait_n; wait_i++ ) + { + DPRINTK("> wait_i=%d query=%d\n", wait_i, query); + msg.handle = query; + memcpy(cmsg.msg, &msg, sizeof(msg)); + DPRINTK("> set_current_state...\n"); + set_current_state(state); + DPRINTK("> rmsg=%p msg=%p, reply=%p\n", &rmsg, rmsg.msg, reply); + DPRINTK("> sending...\n"); + err = ctrl_if_send_message_and_get_response(&cmsg, &rmsg, state); + DPRINTK("> err=%d\n", err); + if(err) goto exit; + DPRINTK("> rmsg=%p msg=%p, reply=%p\n", &rmsg, rmsg.msg, reply); + if((int)reply->handle < 0){ + // No more interfaces. + break; + } + query = -reply->handle - 2; + DPRINTK(">netif_interface_status ...\n"); + netif_interface_status(reply); + } + + exit: + if ( err ) + { + err = netctrl_err(-ENETDOWN); + WPRINTK("Connecting virtual network interfaces failed: err=%d\n", err); + } + + DPRINTK("< err=%d\n", err); + return err; +} + +#endif + +static int __init netif_init(void) +{ + int err = 0; + + if ( (xen_start_info.flags & SIF_INITDOMAIN) || + (xen_start_info.flags & SIF_NET_BE_DOMAIN) ) + return 0; + + IPRINTK("Initialising virtual ethernet driver.\n"); + INIT_LIST_HEAD(&dev_list); + netctrl_init(); + (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); + send_driver_status(1); + err = probe_interfaces(); + if ( err ) + ctrl_if_unregister_receiver(CMSG_NETIF_FE, netif_ctrlif_rx); + + DPRINTK("< err=%d\n", err); + return err; +} + +static void vif_suspend(struct net_private *np) +{ + // Avoid having tx/rx stuff happen until we're ready. 
+ DPRINTK(">\n"); + free_irq(np->irq, np->dev); + unbind_evtchn_from_irq(np->evtchn); + DPRINTK("<\n"); +} + +static void vif_resume(struct net_private *np) +{ + // Connect regardless of whether IFF_UP flag set. + // Stop bad things from happening until we're back up. + DPRINTK(">\n"); + np->backend_state = BEST_DISCONNECTED; + memset(np->tx, 0, PAGE_SIZE); + memset(np->rx, 0, PAGE_SIZE); + + send_interface_connect(np); + DPRINTK("<\n"); +} + +void netif_suspend(void) +{ +#if 1 /* XXX THIS IS TEMPORARY */ + struct list_head *ent; + struct net_private *np; + + DPRINTK(">\n"); + list_for_each(ent, &dev_list){ + np = list_entry(ent, struct net_private, list); + vif_suspend(np); + } + DPRINTK("<\n"); +#endif +} + +void netif_resume(void) +{ +#if 1 + /* XXX THIS IS TEMPORARY */ + struct list_head *ent; + struct net_private *np; + + DPRINTK(">\n"); + list_for_each ( ent, &dev_list ) + { + np = list_entry(ent, struct net_private, list); + vif_resume(np); + } + DPRINTK("<\n"); +#endif +} + + +__initcall(netif_init); diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/privcmd/Makefile tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/privcmd/Makefile --- pristine-linux-2.6.10-rc3/drivers/xen/privcmd/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/privcmd/Makefile 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,2 @@ + +obj-y := privcmd.o diff -Nurp pristine-linux-2.6.10-rc3/drivers/xen/privcmd/privcmd.c tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/privcmd/privcmd.c --- pristine-linux-2.6.10-rc3/drivers/xen/privcmd/privcmd.c 1970-01-01 01:00:00.000000000 +0100 +++ tmp-linux-2.6.10-rc3-xen.patch/drivers/xen/privcmd/privcmd.c 2004-12-08 00:52:40.000000000 +0000 @@ -0,0 +1,220 @@ +/****************************************************************************** + * privcmd.c + * + * Interface to privileged domain-0 commands. 
+ *
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static struct proc_dir_entry *privcmd_intf;
+
+/*
+ * ioctl handler for the privcmd proc entry.
+ *
+ * IOCTL_PRIVCMD_HYPERCALL: copy a privcmd_hypercall_t from user space and
+ *     issue it; the hypercall's result is returned.
+ * IOCTL_PRIVCMD_INITDOMAIN_EVTCHN: return initdom_ctrlif_domcontroller_port.
+ * IOCTL_PRIVCMD_MMAP / IOCTL_PRIVCMD_MMAPBATCH (privileged guests only):
+ *     map foreign machine frames into an existing vma of the caller.
+ * IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN: store mfn_to_pfn_start at *data.
+ *
+ * Returns 0 or a command-specific value on success, -EFAULT/-EINVAL on bad
+ * arguments, and -EINVAL for an unknown command.
+ */
+static int privcmd_ioctl(struct inode *inode, struct file *file,
+                         unsigned int cmd, unsigned long data)
+{
+    int ret = -ENOSYS;
+
+    switch ( cmd )
+    {
+    case IOCTL_PRIVCMD_HYPERCALL:
+    {
+        privcmd_hypercall_t hypercall;
+
+        if ( copy_from_user(&hypercall, (void *)data, sizeof(hypercall)) )
+            return -EFAULT;
+
+        /*
+         * Load the words at offsets 4..20 of the structure into
+         * ebx/ecx/edx/esi/edi and the word at offset 0 into eax, then trap.
+         */
+        __asm__ __volatile__ (
+            "pushl %%ebx; pushl %%ecx; pushl %%edx; pushl %%esi; pushl %%edi; "
+            "movl  4(%%eax),%%ebx ;"
+            "movl  8(%%eax),%%ecx ;"
+            "movl 12(%%eax),%%edx ;"
+            "movl 16(%%eax),%%esi ;"
+            "movl 20(%%eax),%%edi ;"
+            "movl   (%%eax),%%eax ;"
+            TRAP_INSTR "; "
+            "popl %%edi; popl %%esi; popl %%edx; popl %%ecx; popl %%ebx"
+            : "=a" (ret) : "0" (&hypercall) : "memory" );
+
+    }
+    break;
+
+    case IOCTL_PRIVCMD_INITDOMAIN_EVTCHN:
+    {
+        extern int initdom_ctrlif_domcontroller_port;
+        ret = initdom_ctrlif_domcontroller_port;
+    }
+    break;
+
+#if defined(CONFIG_XEN_PRIVILEGED_GUEST)
+    case IOCTL_PRIVCMD_MMAP:
+    {
+#define PRIVCMD_MMAP_SZ 32
+        privcmd_mmap_t mmapcmd;
+        privcmd_mmap_entry_t msg[PRIVCMD_MMAP_SZ], *p;
+        int i, rc;
+
+        if ( copy_from_user(&mmapcmd, (void *)data, sizeof(mmapcmd)) )
+            return -EFAULT;
+
+        p = mmapcmd.entry;
+
+        /* Process the user's entry array in chunks of PRIVCMD_MMAP_SZ. */
+        for (i=0; i<mmapcmd.num; i+=PRIVCMD_MMAP_SZ, p+=PRIVCMD_MMAP_SZ)
+        {
+            int j, n = ((mmapcmd.num-i)>PRIVCMD_MMAP_SZ)?
+                PRIVCMD_MMAP_SZ:(mmapcmd.num-i);
+            if ( copy_from_user(&msg, p, n*sizeof(privcmd_mmap_entry_t)) )
+                return -EFAULT;
+
+            for ( j = 0; j < n; j++ )
+            {
+                struct vm_area_struct *vma =
+                    find_vma( current->mm, msg[j].va );
+
+                if ( !vma )
+                    return -EINVAL;
+
+                if ( msg[j].va > PAGE_OFFSET )
+                    return -EINVAL;
+
+                /* The requested range must end inside the vma. */
+                if ( (msg[j].va + (msg[j].npages<<PAGE_SHIFT)) > vma->vm_end )
+                    return -EINVAL;
+
+                if ( (rc = direct_remap_area_pages(vma->vm_mm,
+                                                   msg[j].va&PAGE_MASK,
+                                                   msg[j].mfn<<PAGE_SHIFT,
+                                                   msg[j].npages<<PAGE_SHIFT,
+                                                   vma->vm_page_prot,
+                                                   mmapcmd.dom)) < 0 )
+                    return rc;
+            }
+        }
+        ret = 0;
+    }
+    break;
+
+    case IOCTL_PRIVCMD_MMAPBATCH:
+    {
+#define MAX_DIRECTMAP_MMU_QUEUE 130
+        mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *w, *v;
+        privcmd_mmapbatch_t m;
+        struct vm_area_struct *vma = NULL;
+        unsigned long *p, addr;
+        unsigned long mfn;
+        int i;
+
+        if ( copy_from_user(&m, (void *)data, sizeof(m)) )
+        { ret = -EFAULT; goto batch_err; }
+
+        vma = find_vma( current->mm, m.addr );
+
+        if ( !vma )
+        { ret = -EINVAL; goto batch_err; }
+
+        if ( m.addr > PAGE_OFFSET )
+        { ret = -EFAULT; goto batch_err; }
+
+        if ( (m.addr + (m.num<<PAGE_SHIFT)) > vma->vm_end )
+        { ret = -EFAULT; goto batch_err; }
+
+        /* u[0] selects the foreign domain; u[1] is reused for each page. */
+        u[0].ptr  = MMU_EXTENDED_COMMAND;
+        u[0].val  = MMUEXT_SET_FOREIGNDOM;
+        u[0].val |= (unsigned long)m.dom << 16;
+        v = w = &u[1];
+
+        p = m.arr;
+        addr = m.addr;
+        for ( i = 0; i < m.num; i++, addr += PAGE_SIZE, p++ )
+        {
+            if ( get_user(mfn, p) )
+                return -EFAULT;
+
+            v->val = (mfn << PAGE_SHIFT) | pgprot_val(vma->vm_page_prot);
+
+            __direct_remap_area_pages(vma->vm_mm,
+                                      addr,
+                                      PAGE_SIZE,
+                                      v);
+
+            /* Report a failed frame back to the caller by tagging it. */
+            if ( unlikely(HYPERVISOR_mmu_update(u, v - u + 1, NULL) < 0) )
+                put_user( 0xF0000000 | mfn, p );
+
+            v = w;
+        }
+        ret = 0;
+        break;
+
+    batch_err:
+        /*
+         * vma is still NULL when the copy from user space or find_vma()
+         * failed, so its bounds may only be read when it is set.
+         */
+        printk("batch_err ret=%d vma=%p addr=%lx num=%d arr=%p %lx-%lx\n",
+               ret, vma, m.addr, m.num, m.arr,
+               vma ? vma->vm_start : 0UL, vma ? vma->vm_end : 0UL);
+        break;
+    }
+    break;
+#endif
+
+    case IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN:
+    {
+        unsigned long m2p_start_mfn =
+            HYPERVISOR_shared_info->arch.mfn_to_pfn_start;
+
+        if( put_user( m2p_start_mfn, (unsigned long *) data ) )
+            ret = -EFAULT;
+        else
+            ret = 0;
+    }
+    break;
+
+    default:
+        ret = -EINVAL;
+        break;
+    }
+    return ret;
+}
+
+/* mmap handler: only marks the vma; pages are installed later via ioctl. */
+static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
+{
+    /* DONTCOPY is essential for Xen as copy_page_range is broken. */
+    vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
+
+    return 0;
+}
+
+static struct file_operations privcmd_file_ops = {
+    .ioctl = privcmd_ioctl,
+    .mmap  = privcmd_mmap
+};
+
+
+/*
+ * Create the "privcmd" proc entry (mode 0400) and attach the file
+ * operations. Does nothing unless SIF_PRIVILEGED is set in
+ * xen_start_info.flags. Always returns 0.
+ */
+static int __init privcmd_init(void)
+{
+    if ( !(xen_start_info.flags & SIF_PRIVILEGED) )
+        return 0;
+
+    privcmd_intf = create_xen_proc_entry("privcmd", 0400);
+    if ( privcmd_intf != NULL )
+        privcmd_intf->proc_fops = &privcmd_file_ops;
+
+    return 0;
+}
+
+__initcall(privcmd_init);