/*
 *  FreeMWare: run multiple x86 operating systems concurrently
 *  Copyright (C) 1999  Kevin P. Lawton
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */


#include "freemware.h"
#include "monitor.h"

#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/wrapper.h>
#include <linux/version.h>
#include <asm/irq.h>

#ifndef VERSION_CODE
#  define VERSION_CODE(vers,rel,seq) ( ((vers)<<16) | ((rel)<<8) | (seq) )
#endif

#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,0)
#  include <asm/uaccess.h>
#endif

#include <asm/io.h>


/************************************************************************/
/* Compatibility macros for older kernels                               */
/************************************************************************/

#ifndef EXPORT_NO_SYMBOLS
#  define EXPORT_NO_SYMBOLS register_symtab(NULL)
#endif

#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,29)
#  define proc_register_dynamic proc_register
#endif

#if LINUX_VERSION_CODE < VERSION_CODE(2,2,0)
#define NEED_RESCHED need_resched
#else
#define NEED_RESCHED current->need_resched
#endif

#if LINUX_VERSION_CODE < VERSION_CODE(2,1,0)
/*
 * Stand-in for copy_from_user() on kernels older than 2.1.0.
 *
 * Copies n bytes from the user-space address 'from' to the kernel buffer
 * 'to'.  Follows the contract of the function it replaces: the return
 * value is the number of bytes that could NOT be copied, i.e. 0 on success
 * and n when the user range fails verify_area().  (Returning the negative
 * errno from verify_area() through an unsigned long, as was done before,
 * produced a meaningless huge byte count.)
 */
static inline unsigned long copy_from_user(void *to, const void *from, unsigned long n)
{
    if (verify_area(VERIFY_READ, from, n) != 0)
        return n;               /* nothing was copied */
    memcpy_fromfs(to, from, n);
    return 0;
}
/*
 * Stand-in for copy_to_user() on kernels older than 2.1.0.
 *
 * Copies n bytes from the kernel buffer 'from' to the user-space address
 * 'to'.  Follows the contract of the function it replaces: the return
 * value is the number of bytes that could NOT be copied, i.e. 0 on success
 * and n when the user range fails verify_area().  (Returning the negative
 * errno from verify_area() through an unsigned long, as was done before,
 * produced a meaningless huge byte count.)
 */
static inline unsigned long copy_to_user(void *to, const void *from, unsigned long n)
{
    if (verify_area(VERIFY_WRITE, to, n) != 0)
        return n;               /* nothing was copied */
    memcpy_tofs(to, from, n);
    return 0;
}
#endif

#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,18) && !defined(THIS_MODULE)
/* Starting with version 2.1.18, the __this_module symbol is present,
   but the THIS_MODULE #define was introduced much later ... */
#define THIS_MODULE (&__this_module)
#endif


/************************************************************************/
/* Declarations                                                         */
/************************************************************************/

// Use this major # (experimental range) for now
#define FMW_MAJOR 63

// The kernel segment base
#if LINUX_VERSION_CODE < VERSION_CODE(2,1,0)
#  define KERNEL_OFFSET 0xc0000000
#else
#  define KERNEL_OFFSET 0x00000000
#endif


// File operations
static int fmw_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
static int fmw_open(struct inode *, struct file *);

#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,31)
    static int fmw_release(struct inode *, struct file *);
#else
    static void fmw_release(struct inode *, struct file *);
#endif

#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,0)
    static int fmw_mmap(struct file * file, struct vm_area_struct * vma);
#else
    static int fmw_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma);
#endif


// Freemware linux host functions
static void unalloc_vm_pages(vm_t *vm);
static int alloc_vm_pages(vm_t *vm, unsigned nmegs);
static void retrieve_monitor_pages(void);
static int run_guest_loop(vm_t *vm);



/************************************************************************/
/* Structures / Variables                                               */
/************************************************************************/

monitor_pages_t monitor_pages;
static unsigned redir_cnt[256];
static struct file_operations fmw_fops;

// For the /proc/freemware entry
int fmw_read_procmem(char *, char **, off_t, int, int);

#if LINUX_VERSION_CODE < VERSION_CODE(2,3,25)
// Statically initialised descriptor for the "freemware" /proc entry.  It is
// handed to proc_register_dynamic() in init_module() and removed again via
// its low_ino field in cleanup_module().  Only compiled for kernels older
// than 2.3.25.  The initialiser is positional, so the field order must match
// the kernel's struct proc_dir_entry.
static struct proc_dir_entry fmw_proc_entry = {
    0,                  // dynamic inode
    9, "freemware",     // len, name (9 == strlen("freemware"))
    S_IFREG | S_IRUGO,  // mode: regular file, readable by everyone
    1, 0, 0,            // presumably nlink, uid, gid -- confirm against struct proc_dir_entry
    0,                  // presumably size -- confirm against struct proc_dir_entry
    NULL,               // presumably inode operations (none) -- confirm
    &fmw_read_procmem,  // read function
};
#endif


/************************************************************************/
/* Main kernel module code                                              */
/************************************************************************/

/*
 * Module entry point.
 *
 * Clears the module-wide state, fills in the file operations table,
 * registers the character device under FMW_MAJOR, registers the
 * /proc/freemware entry (when the kernel has procfs) and finally records
 * the physical pages backing this module via retrieve_monitor_pages().
 *
 * Returns 0 on success, or the negative error from register_chrdev().
 * A failure to register the /proc entry is only logged; it is not fatal.
 */
int
init_module(void)
{
    int result;

    // we are here !
    //printk(KERN_WARNING "freemware: initialising kernel module\n");

    // clear uninitialised structures
    memset(redir_cnt, 0, sizeof(redir_cnt));
    memset(&monitor_pages, 0, sizeof(monitor_pages));

    // fill in the file operation entries we support
    fmw_fops.mmap    = fmw_mmap;
    fmw_fops.ioctl   = fmw_ioctl;
    fmw_fops.open    = fmw_open;
    fmw_fops.release = fmw_release;

    // register the device with the kernel
    result = register_chrdev(FMW_MAJOR, "freemware", &fmw_fops);
    if (result < 0) {
        printk(KERN_WARNING "freemware: can't get major %d\n", FMW_MAJOR);
        return(result);
    }

    // register the /proc entry
#ifdef CONFIG_PROC_FS
#if LINUX_VERSION_CODE >= VERSION_CODE(2,3,25)
    if (!create_proc_info_entry("freemware", 0, NULL, fmw_read_procmem))
        printk(KERN_ERR "freemware: registering /proc/freemware failed\n");
#else
    proc_register_dynamic(&proc_root, &fmw_proc_entry);
#endif
#endif

    // retrieve the monitor physical pages
    retrieve_monitor_pages();

    // success
    EXPORT_NO_SYMBOLS;
    return(0);
}

/*
 * Module exit point: undoes the registrations made by init_module().
 * The character device goes first, then the /proc/freemware entry
 * (only present when the kernel was built with procfs support).
 */
void
cleanup_module(void)
{
    //printk(KERN_WARNING "freemware: cleaning up kernel module\n");

    /* Give the major number back to the kernel. */
    unregister_chrdev(FMW_MAJOR, "freemware");

#ifdef CONFIG_PROC_FS
    /* Drop the /proc entry, using whichever API this kernel provides. */
#  if LINUX_VERSION_CODE < VERSION_CODE(2,3,25)
    proc_unregister(&proc_root, fmw_proc_entry.low_ino);
#  else
    remove_proc_entry("freemware", NULL);
#  endif
#endif
}



/************************************************************************/
/* Open / Release a VM                                                  */
/************************************************************************/

int
fmw_open(struct inode *inode, struct file *filp)
{
    vm_t *vm;
    MOD_INC_USE_COUNT;

    // allocate a VM structure
    if ( (vm = kmalloc(sizeof(*vm), GFP_KERNEL)) == NULL )
        return -ENOMEM;
    
    memset( vm, 0, sizeof(*vm) );
    filp->private_data = vm;

    return(0);
}


#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,31)
int
#else
void
#endif
fmw_release(struct inode *inode, struct file *filp)
{
    vm_t *vm = (vm_t *)filp->private_data;
    filp->private_data = NULL;

    // free the virtual memory
    unalloc_vm_pages( vm );

    // free the VM structure
    memset( vm, 0, sizeof(*vm) );
    kfree( vm );

    MOD_DEC_USE_COUNT;
#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,31)
    return(0);
#endif
}



/************************************************************************/
/* VM operations:  ioctl() and mmap()                                   */
/************************************************************************/

/*
 * ioctl() handler: control operations on the VM attached to this file.
 *
 *   FMWALLOCVPHYS  arg = guest memory size in megabytes (4..FMW_MAX_PHY_MEGS,
 *                  multiple of 4).  Allocates the VM's memory and calls
 *                  init_monitor().  -EBUSY if memory is already allocated,
 *                  -EINVAL for a bad size or a failed init_monitor(), or the
 *                  error from alloc_vm_pages().
 *   FMWALLOCINT    arg = interrupt vector (0..255) to set in the nexus
 *                  host_fwd_ints bitmap.
 *   FMWRELEASEINT  arg = interrupt vector (0..255) to clear in that bitmap.
 *                  Both return -EINVAL for a bad vector and -EPERM when the
 *                  VM has no memory (and hence no nexus page) yet.
 *   FMWRESET       debugging aid: forces the module use count back to 1.
 *   FMWRUNGUEST    arg = user pointer to a guest_context_t.  Loads it, runs
 *                  the guest via run_guest_loop(), and writes the resulting
 *                  context back.  -EPERM if the VM is not set up, -EFAULT if
 *                  the user buffer cannot be accessed.
 *   FMWTEARDOWN    frees the VM's memory; -EBUSY while it is still mapped.
 *
 * Any other command returns -EINVAL.
 */
int
fmw_ioctl(struct inode *inode, struct file *filp,
              unsigned int cmd, unsigned long arg)
{
    vm_t *vm = (vm_t *)filp->private_data;
    guest_context_t context;
    int ret;

    switch (cmd) {
        /*
         *  Allocate unpaged memory for the VM.
         *  arg is the number of megabytes to allocate
         *  Memory returned must not be pageable by the
         *  host OS, since the VM monitor will run in this
         *  memory as well.  Perhaps later, we can let
         *  the guest OS run in paged memory and reflect
         *  the page faults back to the host OS.
         */
    case FMWALLOCVPHYS:
        //printk(KERN_WARNING "freemware: allocating %luM of memory\n", arg);

        // Do not allow duplicate allocation;
        if (vm->pages.guest_n_megs != 0) {
            printk(KERN_WARNING "freemware: VM already in use\n");
            return (-EBUSY);
        }

        // Check that the amount of memory we allocate is reasonable
        if (arg > FMW_MAX_PHY_MEGS) {
            printk(KERN_WARNING "freemware: request of too much phymem (%luMegs)\n", arg);
            return (-EINVAL);
        }
        if (arg < 4) {
            printk(KERN_WARNING "freemware: request of too little phymem (%luMegs)\n", arg);
            return (-EINVAL);
        }

        // Check that the amount of memory we allocate is aligned correctly
        if ( (arg & 0x3) != 0 ) {
            printk(KERN_WARNING "freemware: mem request of "
                   "%luMegs not multiple of 4\n", arg);
            return (-EINVAL);
        }

        // ask linux for the memory;
        ret = alloc_vm_pages(vm, arg);
        if (ret) {
            return(ret);
        }

        // now we can initialize the monitor
        if ( init_monitor(vm) != 0 ) {
            // Give the memory back so the VM does not stay half set up
            // (otherwise every retry would fail with -EBUSY).
            unalloc_vm_pages(vm);
            return(-EINVAL); // +++ what error code to use?
        }

        // success;
        return(0);

    case FMWALLOCINT:
        // allocate an interrupt for forwarding to the user monitor
        // check that we allocate a valid interrupt (vectors are 0..255)
        if (arg >= 256) {
            printk(KERN_WARNING "freemware: request forwarding of non-existant interrupt (%lu)\n", arg);
            return (-EINVAL);
        }

        // the bitmap lives in the nexus page, which only exists once
        // FMWALLOCVPHYS has succeeded
        if (!vm->addr.nexus) {
            return (-EPERM);
        }

        // allocate the interrupt itself
        BMAP_SET(vm->addr.nexus->host_fwd_ints,arg);

        // success;
        return(0);

    case FMWRELEASEINT:
        // release an interrupt for forwarding to the user monitor
        // check that we release a valid interrupt (vectors are 0..255)
        if (arg >= 256) {
            printk(KERN_WARNING "freemware: request release of non-existant interrupt (%lu)\n", arg);
            return (-EINVAL);
        }

        // the bitmap lives in the nexus page, which only exists once
        // FMWALLOCVPHYS has succeeded
        if (!vm->addr.nexus) {
            return (-EPERM);
        }

        // release the interrupt itself
        BMAP_CLR(vm->addr.nexus->host_fwd_ints,arg);

        // success;
        return(0);

    case FMWRESET:
        // for debugging, when the module gets hosed, this is a way
        // to reset the in-use count, so we can rmmod it.
        while (MOD_IN_USE) {
            MOD_DEC_USE_COUNT;
        }
        MOD_INC_USE_COUNT; // bump back to 1 so release can decrement
        return(0);

        // (re)start guest context and run it until it hits an event
        // that needs to be handled by the host monitor code
    case FMWRUNGUEST:
        if (!vm->pages.guest_n_megs || !vm->initialized) {
            return(-EPERM);
        }

        if (copy_from_user( &context, (void *)arg, sizeof(context) ))
            return -EFAULT;

        set_guest_context( vm, &context );
        ret = run_guest_loop( vm );
        get_guest_context( vm, &context );

        if (copy_to_user( (void *)arg, &context, sizeof(context) ))
            return -EFAULT;

        return ret;

    case FMWTEARDOWN: // tear down VM environment

        // Do *not* free pages that are still mapped to user space!
        if (inode->i_mmap != NULL) {
            printk(KERN_WARNING "freemware: guest memory is still mapped!\n");
            return -EBUSY;
        }

        unalloc_vm_pages( vm );
        return(0);

    default:
        printk(KERN_WARNING "freemware: unknown ioctl(%d) called\n", cmd);
        return (-EINVAL);
    }
}


/*
 * mmap() handler: maps a range of the VM's guest physical pages into the
 * calling process.
 *
 * The file offset selects the first guest page and the VMA length the
 * number of pages.  Only shared mappings of an already allocated VM are
 * accepted, and the requested window must lie completely inside the guest
 * memory.
 *
 * Returns 0 on success, -EACCES if no guest memory has been allocated,
 * -EINVAL for a private mapping, an unaligned offset or an out-of-range
 * window, and -EAGAIN if remap_page_range() fails.
 */
int
#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,0)
fmw_mmap(struct file * file, struct vm_area_struct * vma)
#else
fmw_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma)
#endif
{
    vm_t *vm = (vm_t *)file->private_data;
    unsigned long first_page, n_pages, total_pages, i;

    /* Must have memory allocated */
    if (!vm->pages.guest_n_pages) {
        printk(KERN_WARNING "freemware: device not initialized\n");
        return -EACCES;
    }

    /* Private mappings make no sense ... */
    if ( !(vma->vm_flags & VM_SHARED) ) {
        printk(KERN_WARNING "freemware: private mapping\n");
        return -EINVAL;
    }

#if LINUX_VERSION_CODE >= VERSION_CODE(2,3,25)
    first_page = vma->vm_pgoff;
#else
    /* To simplify things, allow only page-aligned offsets */
    if ( vma->vm_offset & (PAGE_SIZE - 1) ) {
        printk(KERN_WARNING "freemware: unaligned offset %08lx\n", vma->vm_offset);
        return -EINVAL;
    }
    first_page = vma->vm_offset / PAGE_SIZE;
#endif
    n_pages     = (vma->vm_end - vma->vm_start) / PAGE_SIZE;
    total_pages = vm->pages.guest_n_pages;

    /*
     * Sanity check.  Written as two comparisons against total_pages so
     * that a huge offset cannot wrap "first_page + n_pages" around zero
     * and slip past the test, which would index outside pages.guest[].
     */
    if ( first_page > total_pages || n_pages > total_pages - first_page )
    {
#if LINUX_VERSION_CODE >= VERSION_CODE(2,3,25)
        printk(KERN_WARNING "freemware: offset page %08lx out of range\n", vma->vm_pgoff);
#else
        printk(KERN_WARNING "freemware: offset %08lx out of range\n", vma->vm_offset);
#endif
        return -EINVAL;
    }

    /* Map all requested guest pages in ... */
    for ( i = 0; i < n_pages; i++ ) {
        /* pages.guest[] holds page frame numbers; << 12 gives the address */
        if ( remap_page_range( vma->vm_start + i*PAGE_SIZE,
                               vm->pages.guest[first_page+i] << 12,
                               PAGE_SIZE,
                               vma->vm_page_prot ) )
            return -EAGAIN;
    }

#if LINUX_VERSION_CODE < VERSION_CODE(2,1,0)
    /* Enter our inode into the VMA; no need to change the default ops */
    vma->vm_inode = inode;
    inode->i_count++;
#endif
    return 0;
}



/************************************************************************/
/* Status reporting:  /proc code                                        */
/************************************************************************/

/*
 * Read handler for /proc/freemware.
 *
 * Writes a header line followed by one line per interrupt vector whose
 * redirection counter in redir_cnt[] is non-zero, and returns the number
 * of bytes written to buf.  The start, offset and incoming len arguments
 * are not used.
 *
 * NOTE(review): assumes buf is the single PAGE_SIZE buffer that the /proc
 * layer passes to get_info-style handlers -- confirm for the target kernel.
 * The header plus 256 full lines would not fit into 4 KB, so output stops
 * before a line could run past the end of that page.
 */
int
fmw_read_procmem(char *buf, char **start, off_t offset,
                 int len, int unused)
{
    /* Longest line: "  0x" + 2 hex digits + ':' + 10 decimal digits + '\n' */
    enum { FMW_PROC_LINE_MAX = 18 };
    unsigned i;

    len = sprintf(buf, "monitor-->host interrupt reflection counts\n");
    for (i=0; i<256; i++) {
        if (!redir_cnt[i])
            continue;

        /* keep room for one more full line and sprintf's trailing NUL */
        if ((unsigned long)len + FMW_PROC_LINE_MAX + 1 > PAGE_SIZE)
            break;

        len += sprintf(buf+len, "  0x%2x:%10u\n", i, redir_cnt[i]);
    }
    return(len);
}


/************************************************************************/
/* VM main loop                                                         */
/************************************************************************/

/*
 * Run the guest until something other than an interrupt redirection
 * needs attention.
 *
 * Each iteration saves the host EFLAGS, clears NT/IF/TF, and calls
 * host2guest(vm).  When that call returns, the reason is read from
 * vm->guest_context->event_info: the low byte is a vector number, the
 * next byte an event code.
 *
 *  - RET_BECAUSE_REDIR: the vector is passed to soft_int(), counted in
 *    redir_cnt[] (reported by fmw_read_procmem), the scheduler is given a
 *    chance to run, and the guest is restarted unless
 *    current_got_fatal_signal() reports true.
 *  - RET_BECAUSE_EMERR: a message describing the monitor's emulation
 *    error is logged from the nexus debug_msg fields.
 *  - anything else: falls straight through to the return.
 *
 * Always returns 0.
 *
 * NOTE(review): host2guest() and soft_int() are defined elsewhere;
 * presumably the former switches into the monitor/guest and the latter
 * raises the given vector as a software interrupt on the host -- confirm.
 */
static int
run_guest_loop( vm_t *vm )
{
    unsigned vector, event;

    for (;;)
    {
        unsigned long eflags;

        // Remember the host flags so they can be put back after the switch
        save_flags(eflags);
        restore_flags(eflags & ~0x00004300); // clear NT/IF/TF

        host2guest(vm);

        // Decode why control came back: bits 0-7 vector, bits 8-15 event
        vector = vm->guest_context->event_info & 0xff;
        event  = (vm->guest_context->event_info >> 8) & 0xff;

        if ( event == RET_BECAUSE_REDIR )
        {
            // 0x200 is the IF bit: everything else goes back to the
            // saved state before the vector is handed to soft_int()
            restore_flags(eflags & ~0x00000200); // restore all but IF
            soft_int(vector);
            redir_cnt[vector]++;

            // Check whether we have any more CPU time
            if (NEED_RESCHED)
                schedule();

            // Unless we need to return to handle pending signals, restart VM
            if (!current_got_fatal_signal())
                continue;
        }
        else
        {
            // Any other event: simply restore the complete saved flags
            restore_flags(eflags);
        }


        // Display monitor emulation error messages
        if ( event == RET_BECAUSE_EMERR )
            switch (vm->addr.nexus->debug_msg.msg_code) 
            {
            case EMU_CLI_MSG:
              printk(KERN_WARNING "freemware: emulation of cli without success\n");
              break;
            case EMU_STI_MSG:
              printk(KERN_WARNING "freemware: emulation of sti without success\n");
              break;
            case EMU_CLTS_MSG:
              printk(KERN_WARNING "freemware: emulation of clts without success\n");
              break;
            case EMU_LOAD_SEGREG_MSG:
              // para2 is printed as the 16-bit value, para1 as the
              // letter that precedes 'S' in the segment register name
              printk(KERN_WARNING "freemware: trying to load %04x into %cS failed\n",
                     vm->addr.nexus->debug_msg.para2, vm->addr.nexus->debug_msg.para1);
              break;

            default:
              printk(KERN_WARNING "freemware: unknown emulation error (%d) [0x%x : 0x%x]\n",
                     vm->addr.nexus->debug_msg.msg_code,
                     vm->addr.nexus->debug_msg.para1, vm->addr.nexus->debug_msg.para2);
              break;
            }

        // Hand control back to the ioctl caller
        return 0;
    }
}

/************************************************************************/
/* Paging management                                                    */
/************************************************************************/

/*
 * Look up the physical page frames behind a kernel virtual memory area.
 *
 * There seems to be no exported way to get at the pages underlying a
 * vmalloc()'ed region, so the host page tables are walked by hand,
 * starting from the page directory that CR3 currently points to.
 *
 *   page       receives one page frame number per page of the area
 *   max_pages  capacity of page[]
 *   addr/size  the area; addr is rounded down to a page boundary
 *
 * Returns the number of entries stored.  The walk ends early at the first
 * page that is not present, which is taken as the end of the area.  If
 * page[] turns out to be too small, a warning is logged and 0 is returned.
 */
int
retrieve_vm_pages(Bit32u *page, int max_pages, void *addr, unsigned size)
{
    pageEntry_t *page_directory;
    u32 cr3_value;
    u32 area_base = (u32)addr & ~(PAGESIZE-1);
    int wanted = ((u32)addr + size - area_base + PAGESIZE-1) >> 12;
    int idx;

    /* Locate the host page directory through CR3. */
    asm volatile("movl %%cr3, %0" : "=r" (cr3_value));
    page_directory = (pageEntry_t *)(phys_to_virt(cr3_value & ~0xfff));

    idx = 0;
    while (idx < wanted)
    {
        u32 linear = area_base + idx*PAGESIZE + KERNEL_OFFSET;
        pageEntry_t *dir_entry = page_directory + (linear >> 22);
        pageEntry_t *tbl_entry = (pageEntry_t *)phys_to_virt(dir_entry->base << 12)
                               + ((linear >> 12) & 0x3ff);

        /* A hole in the mapping marks the end of the area. */
        if ( !(dir_entry->P && tbl_entry->P) )
            return idx;

        /* The caller's list cannot hold any more entries: give up. */
        if (idx >= max_pages)
        {
            printk(KERN_WARNING "freemware: page list is too small!\n");
            return 0;
        }

        page[idx] = tbl_entry->base;
        idx++;
    }

    return wanted;
}

void
retrieve_monitor_pages(void)
{
    /* 
     * Retrieve start address and size of this module.
     *
     * Note that with old kernels, we cannot access the module info (size),
     * hence we rely on the fact that Linux lets at least one page of 
     * virtual address space unused after the end of the module.
     */
#ifdef THIS_MODULE
    void *start_addr = THIS_MODULE;
    unsigned size    = THIS_MODULE->size;
#else
    void *start_addr = &mod_use_count_;
    unsigned size    = 0x10000000;  /* Actual size determined below */
#endif

    int n_pages = retrieve_vm_pages(monitor_pages.page, FMW_MAX_MONITOR_PAGES,
                                    start_addr, size);

    monitor_pages.start_addr = (u32)start_addr;
    monitor_pages.n_pages    = n_pages;
}

/*
 * Free every piece of memory that alloc_vm_pages() obtained for a VM.
 *
 * Each pointer in vm->addr is tested before it is freed, so this is safe
 * to call on a partially set up or never allocated VM.  Afterwards both
 * vm->pages and vm->addr are zeroed, which marks the VM as unallocated.
 */
void
unalloc_vm_pages( vm_t *vm )
{
    vm_addr_t  *addr  = &vm->addr;
    vm_pages_t *pages = &vm->pages;
    unsigned idx;

    //printk(KERN_WARNING "freemware: unallocating VM pages\n");

    /* Guest physical memory: drop PG_reserved again, then free the area */
    if (addr->guest) {
        idx = 0;
        while (idx < pages->guest_n_pages) {
            mem_map_unreserve(pages->guest[idx]);
            idx++;
        }
        vfree(addr->guest);
    }

    /* Single pages obtained with get_free_page() */
    if (addr->page_dir) {                       /* monitor page directory */
        free_page((u32)addr->page_dir);
    }
    for (idx = 0; idx < MON_PAGE_TABLES; idx++) {   /* monitor page tables */
        if (addr->page_tbl[idx]) {
            free_page((u32)addr->page_tbl[idx]);
        }
    }
    if (addr->nexus_page_tbl) {                 /* nexus page table */
        free_page((u32)addr->nexus_page_tbl);
    }
    if (addr->transition_PT) {                  /* transition page table */
        free_page((u32)addr->transition_PT);
    }
    if (addr->nexus) {                          /* nexus page */
        free_page((u32)addr->nexus);
    }

    /* Areas obtained with vmalloc() */
    if (addr->idt) {                            /* monitor IDT */
        vfree(addr->idt);
    }
    if (addr->gdt) {                            /* monitor GDT */
        vfree(addr->gdt);
    }
    if (addr->ldt) {                            /* monitor LDT */
        vfree(addr->ldt);
    }
    if (addr->tss) {                            /* monitor TSS */
        vfree(addr->tss);
    }
    if (addr->idt_stubs) {                      /* monitor IDT stubs */
        vfree(addr->idt_stubs);
    }

    /* Forget everything: the VM now counts as unallocated */
    memset(pages, 0, sizeof(*pages));
    memset(addr, 0, sizeof(*addr));
}


/*
 * Allocate all host memory needed for one VM.
 *
 * Allocates nmegs megabytes of guest physical memory plus the monitor's
 * page directory, page tables, nexus page table, transition page table,
 * nexus page, IDT, GDT, LDT, TSS and IDT stubs.  Kernel addresses are
 * stored in vm->addr, the matching physical page numbers in vm->pages.
 *
 * Returns 0 on success.  If any allocation fails, or the physical pages
 * behind a vmalloc()'ed area cannot all be looked up, everything obtained
 * so far is released again and -ENOMEM is returned.
 */
int
alloc_vm_pages( vm_t *vm, unsigned nmegs )
{
    vm_pages_t *pg = &vm->pages;
    vm_addr_t  *ad = &vm->addr;
    unsigned p;
    int found;

    // clear out allocated pages lists
    memset(pg, 0, sizeof(*pg));
    memset(ad, 0, sizeof(*ad));


    // Guest physical memory pages
    pg->guest_n_megs  = nmegs;
    pg->guest_n_pages = nmegs * 256;

    ad->guest = vmalloc(pg->guest_n_megs * 1024 * 1024);
    if (!ad->guest) goto error;
    found = retrieve_vm_pages(pg->guest, MON_GUEST_PAGES,
                              ad->guest, pg->guest_n_megs * 1024 * 1024);
    if (found < 0 || (unsigned)found != pg->guest_n_pages) {
        /*
         * The page list is incomplete.  Going on would reserve (and later
         * map to user space) whatever frame numbers the unfilled entries
         * hold.  Nothing has been reserved yet, so make sure the cleanup
         * code does not clear PG_reserved on any page either.
         */
        pg->guest_n_pages = 0;
        goto error;
    }

    /*
     * As we want to map these pages to user space, we need to mark
     * them as 'reserved' pages by setting the PG_reserved bit.
     *
     * This has the effect that:
     *  - remap_page_range accepts them as candidates for remapping
     *  - the swapper does *not* try to swap these pages out, even
     *    after they are mapped to user space
     */
    for (p = 0; p < pg->guest_n_pages; p++)
        mem_map_reserve(pg->guest[p]);


    // Monitor page directory
    ad->page_dir = (pageEntry_t *)get_free_page(GFP_KERNEL);
    if (!ad->page_dir) goto error;
    pg->page_dir = MAP_NR(ad->page_dir);

    // Monitor page tables
    for (p = 0; p < MON_PAGE_TABLES; p++)
    {
        ad->page_tbl[p] = (pageEntry_t *)get_free_page(GFP_KERNEL);
        if (!ad->page_tbl[p]) goto error;
        pg->page_tbl[p] = MAP_NR(ad->page_tbl[p]);
    }

    // Nexus page table
    ad->nexus_page_tbl = (pageEntry_t *)get_free_page(GFP_KERNEL);
    if (!ad->nexus_page_tbl) goto error;
    pg->nexus_page_tbl = MAP_NR(ad->nexus_page_tbl);

    // Transition page table
    ad->transition_PT = (pageEntry_t *)get_free_page(GFP_KERNEL);
    if (!ad->transition_PT) goto error;
    pg->transition_PT = MAP_NR(ad->transition_PT);

    // Nexus page
    ad->nexus = (nexus_t *)get_free_page(GFP_KERNEL);
    if (!ad->nexus) goto error;
    pg->nexus = MAP_NR(ad->nexus);

    /*
     * For the small vmalloc()'ed areas below, a result of 0 from
     * retrieve_vm_pages() means no usable page list was produced
     * (list too small, or first page not present).
     */

    // Monitor IDT
    ad->idt = vmalloc(MON_IDT_SIZE);
    if (!ad->idt) goto error;
    if (retrieve_vm_pages(pg->idt, MON_IDT_PAGES, ad->idt, MON_IDT_SIZE) <= 0)
        goto error;

    // Monitor GDT
    ad->gdt = vmalloc(MON_GDT_SIZE);
    if (!ad->gdt) goto error;
    if (retrieve_vm_pages(pg->gdt, MON_GDT_PAGES, ad->gdt, MON_GDT_SIZE) <= 0)
        goto error;

    // Monitor LDT
    ad->ldt = vmalloc(MON_LDT_SIZE);
    if (!ad->ldt) goto error;
    if (retrieve_vm_pages(pg->ldt, MON_LDT_PAGES, ad->ldt, MON_LDT_SIZE) <= 0)
        goto error;

    // Monitor TSS
    ad->tss = vmalloc(MON_TSS_SIZE);
    if (!ad->tss) goto error;
    if (retrieve_vm_pages(pg->tss, MON_TSS_PAGES, ad->tss, MON_TSS_SIZE) <= 0)
        goto error;

    // Monitor IDT stubs
    ad->idt_stubs = vmalloc(MON_IDT_STUBS_SIZE);
    if (!ad->idt_stubs) goto error;
    if (retrieve_vm_pages(pg->idt_stubs, MON_IDT_STUBS_PAGES,
                          ad->idt_stubs, MON_IDT_STUBS_SIZE) <= 0)
        goto error;

    // Fill in kernel segment offset
    vm->kernel_offset = KERNEL_OFFSET;
    return 0;

 error:
    // releases whatever was obtained and zeroes vm->pages / vm->addr
    unalloc_vm_pages( vm );
    return -ENOMEM;
}

/* The end */
