/* The aim here is to create a device, /dev/sk98, which you can open
 * and then do an ioctl.  The arguments to the ioctl include a range
 * of your address space, which is then made available as a spool
 * space for incoming packets.  It's a good idea to mlock() and then
 * touch the pages before doing so, though. */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/version.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/netdevice.h>
#include <asm/uaccess.h>
#include <asm/io.h>

#include "uspace_if.h"

/* Kernels newer than 2.6.10 are treated as having the extra "pud"
   page-table level; HAVE_PUD selects the pud-aware page-table walk
   in virt_to_page_full(). */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,10)
#define HAVE_PUD
#endif

/* Wait queue that poll_method() registers on; complete_token() wakes
   it each time it publishes an entry on the k2u ring. */
static DECLARE_WAIT_QUEUE_HEAD(waitq);
/* in_use_lock guards device_in_use, which is set by open_method()
   and cleared by close_method(). */
static spinlock_t in_use_lock = SPIN_LOCK_UNLOCKED;
static int device_in_use;

/* queue_lock is taken (irqsave) around the teardown in close_method()
   and around the whole of NP_dev_alloc_skb().  free_skb_lock guards
   the skb_pool_head list. */
static spinlock_t queue_lock = SPIN_LOCK_UNLOCKED;
static spinlock_t free_skb_lock = SPIN_LOCK_UNLOCKED;

/* Non-zero once SK98_IOCTL_MAP has completed; reset to zero by
   close_method(). */
static int device_is_mapped;

/* We don't release skbs back in normal operation, but instead keep
   them in a pool.  The pool is a singly-linked list threaded through
   skb->next; it is only freed in sk98_uspace_if_cleanup(). */
static struct sk_buff *skb_pool_head;

/* Only built when REDHAT_IS_WEIRD is defined: provides local
   definitions of copy_from_user()/copy_to_user() layered on
   get_user_size()/put_user_size(), which are declared here but
   defined outside this file. */
#ifdef REDHAT_IS_WEIRD
int put_user_size(unsigned int size, const void *val, void *ptr);
int get_user_size(unsigned int size, void *val, const void *ptr);

/* Forwards to get_user_size(n, to, from) and returns its result.
   NOTE(review): callers treat any non-zero return as failure;
   confirm get_user_size() returns 0 on success. */
unsigned long copy_from_user(void *to, const __user void *from, unsigned long n)
{
	return get_user_size(n, to, from);
}
/* Forwards to put_user_size(n, to, from) and returns its result.
   NOTE(review): same return-value assumption as copy_from_user()
   above -- TODO confirm. */
unsigned long copy_to_user(void *to, const __kernel void *from, unsigned long n)
{
	return put_user_size(n, to, from);
}

#endif

/* Ring control structure: holds the u2k token ring, the k2u
   completion ring and the drop counter.  It is 4096-byte aligned,
   and nopage_method() hands its backing pages out to whoever mmap()s
   the device. */
static struct sk98_map_area_header map_header __attribute__((aligned (4096)));
/* Largest token value get_token() and NP_skb_page_off() will accept;
   computed from the mapping length in ioctl_method(). */
static unsigned max_expected_token;

/* Pages which the user has given us to deliver into.  We hold a
   reference to every page.  We also take a reference every time we
   start DMAing into a page, and release it when we're done. */
static unsigned nr_pages;
static struct page **user_pages;

/* The offset which the user wants us to use when delivering packets.
   The idea is to avoid any headers the user wants to add to the
   packets, so that e.g. you can do a single sector-aligned write when
   adding a packet to a pcap dump file.  ioctl_method() rejects values
   of 256 or more. */
static unsigned packet_delivery_offset;

/* Byte offset, from the start of the user's spool area, of the
   buffer named by token x: buffers are SK98_EXPECTED_PACKET_SIZE
   bytes apart. */
#define TOKEN_TO_OFF(x) ((x) * SK98_EXPECTED_PACKET_SIZE)
/* NOTE(review): BAD_TOKEN is not referenced anywhere in this file;
   presumably a sentinel used elsewhere -- confirm. */
#define BAD_TOKEN ((unsigned)-1)

/* Per-skb bookkeeping, stored in the skb's cb[] scratch area (see
   the SKB_* accessors below). */
struct skb_private_data {
	unsigned token;		/* buffer token taken from the u2k ring */
	unsigned tstamp;	/* set via NP_set_skb_timestamp() */
	struct page *p;		/* user page backing the token; get_token()
				   takes a reference on it */
};

/* Accessors for the skb_private_data overlaid on s->cb; each expands
   to an lvalue. */
#define SKB_TOKEN(s) (((struct skb_private_data *)(s)->cb)->token)
#define SKB_TSTAMP(s) (((struct skb_private_data *)(s)->cb)->tstamp)
#define SKB_PAGE(s) (((struct skb_private_data *)(s)->cb)->p)

/* Hard assertion: a failed check calls BUG().  The commented-out
   alternative below compiles the checks away. */
#define ASSERT(x) do { if (!(x)) BUG(); } while (0)
//#define ASSERT(x)

/* open() handler.  The device is single-opener: the first open sets
 * device_in_use and every later open fails until close_method()
 * clears it again.
 *
 * Returns 0 on success, or -EBUSY if the device is already open. */
static int open_method(struct inode *inode, struct file *file)
{
	int ret = 0;

	spin_lock(&in_use_lock);
	if (device_in_use)
		ret = -EBUSY;
	else
		device_in_use = 1;
	/* The lock must be dropped on the busy path too; returning
	   with it held would deadlock the next open() or close(). */
	spin_unlock(&in_use_lock);
	return ret;
}

/* release() handler.  If a mapping is active, marks the device as
 * unmapped, drops the reference held on every user page and frees
 * the page array, all under queue_lock.  Finally marks the device as
 * no longer open.  Always returns 0. */
static int close_method(struct inode *inode, struct file *file)
{
	unsigned long irq_state;
	int idx;

	if (device_is_mapped) {
		spin_lock_irqsave(&queue_lock, irq_state);
		/* Clear the flag first so NP_dev_alloc_skb(), which
		   checks it under the same lock, stops handing out
		   buffers. */
		device_is_mapped = 0;
		idx = 0;
		while (idx < nr_pages) {
			put_page(user_pages[idx]);
			user_pages[idx] = NULL;
			idx++;
		}
		kfree(user_pages);
		user_pages = NULL;
		spin_unlock_irqrestore(&queue_lock, irq_state);
	}

	/* Let the next open() succeed. */
	spin_lock(&in_use_lock);
	device_in_use = 0;
	spin_unlock(&in_use_lock);

	return 0;
}

/* Take the next buffer token off the u2k ring in map_header and
 * attach it to skb: the token and its backing user page are stored in
 * the skb's private data, and an extra reference is taken on the page.
 *
 * Must be called with interrupts disabled (asserted).
 *
 * Returns 0 on success.  Returns -1 if the ring is empty, in which
 * case drop_counter is incremented; the accompanying warning is
 * printed at most once per 10000 successful calls.
 *
 * NOTE(review): the token value comes from memory that userspace can
 * write, and an out-of-range value trips ASSERT(), i.e. BUG(). */
static int get_token(struct sk_buff *skb)
{
	unsigned res;
	static unsigned message_rlimit;

	if (map_header.u2k_cons == map_header.u2k_prod) {
		if (message_rlimit == 0) {
			printk("<0>Warning: dropping packets due to lack of tokens.\n");
			message_rlimit = 10000;
		}
		map_header.drop_counter++;
		return -1;
	}
	if (message_rlimit > 0)
		message_rlimit--;
	ASSERT(irqs_disabled());
	/* The producer index was read above; make sure the slot it
	   covers is read only after that, otherwise we could pick up
	   a stale token on a weakly-ordered machine. */
	rmb();
	res = map_header.u2k_tokens[map_header.u2k_cons % SK98_RING_SIZE];
	/* res is unsigned, so only the upper bound needs checking. */
	ASSERT(res <= max_expected_token);
	/* Finish reading the slot before the consumer index moves on
	   and the slot becomes free for reuse.  This orders a load
	   against a later store, so it needs a full barrier. */
	mb();
	map_header.u2k_cons++;
	SKB_PAGE(skb) = user_pages[TOKEN_TO_OFF(res) / PAGE_SIZE];
	get_page(SKB_PAGE(skb));
	SKB_TOKEN(skb) = res;
	return 0;
}

/* Publish a completed buffer on the k2u ring and wake any poller.
 *
 * token     - buffer token being handed back to userspace
 * ts        - timestamp to report
 * len       - packet length to report
 * interface - interface index to report
 *
 * Nothing is published if the ring is full (a warning is printed the
 * first time only) or if the token is one of the two poison values
 * that NP_dev_kfree_skb()/NP_netif_rx() write into a released skb. */
static void complete_token(unsigned long token,
			   unsigned long ts,
			   unsigned len,
			   unsigned interface)
{
	static unsigned done_message;
	unsigned slot;

	/* Full when the producer is a whole ring ahead of the consumer. */
	if (map_header.k2u_prod ==
	    atomic_read((atomic_t *)&map_header.k2u_cons) + SK98_RING_SIZE) {
		if (done_message == 0)
			printk(KERN_WARNING
			       "sk98lin: overflowed k2u ring\n");
		done_message = 1;
		return;
	}

	switch (token) {
	case 0xff160304:
	case 0xff170304:
		printk(KERN_WARNING "sk98lin: returning poisoned token %lx\n",
		       token);
		return;
	default:
		break;
	}

	slot = map_header.k2u_prod % SK98_RING_SIZE;
	map_header.k2u_pipe[slot].token = token;
	map_header.k2u_pipe[slot].len = len;
	map_header.k2u_pipe[slot].tstamp = ts;
	map_header.k2u_pipe[slot].interface = interface;
	/* The entry must be visible before the index that exposes it. */
	wmb();
	map_header.k2u_prod++;
	wake_up(&waitq);
}

/* Obtain an skb: pop the head of the free pool if there is one,
 * otherwise allocate a fresh one with kmalloc(GFP_ATOMIC).  Returns
 * NULL only if the pool is empty and the allocation fails.  The
 * returned skb is not initialised here. */
static struct sk_buff *get_pool_skb(void)
{
	unsigned long irq_state;
	struct sk_buff *skb;

	spin_lock_irqsave(&free_skb_lock, irq_state);
	skb = skb_pool_head;
	if (skb != NULL)
		skb_pool_head = skb->next;
	else
		skb = kmalloc(sizeof(*skb), GFP_ATOMIC);
	spin_unlock_irqrestore(&free_skb_lock, irq_state);

	return skb;
}

/* Return skb to the free pool by pushing it on the front of the
 * skb_pool_head list.  The skb is not freed. */
static void release_skb(struct sk_buff *skb)
{
	unsigned long irq_state;

	spin_lock_irqsave(&free_skb_lock, irq_state);

	skb->next = skb_pool_head;
	skb_pool_head = skb;

	spin_unlock_irqrestore(&free_skb_lock, irq_state);
}

void NP_set_skb_timestamp(struct sk_buff *skb,
					unsigned long tstamp)
{
	SKB_TSTAMP(skb) = tstamp;
}

/* Allocate an skb whose packet buffer lives in the user's spool area.
 *
 * size - requested data size; size + packet_delivery_offset must fit
 *        within SK98_EXPECTED_PACKET_SIZE (asserted).
 *
 * Returns a pooled skb bound to a buffer token, or NULL when the
 * device is not mapped, no skb could be obtained, or no token is
 * available.  The returned skb has no kernel data buffer: head, data
 * and tail are NULL and end holds SK98_EXPECTED_PACKET_SIZE as a
 * marker that NP_dev_kfree_skb()/NP_netif_rx() assert on.
 *
 * The whole operation runs under queue_lock with interrupts off. */
struct sk_buff *NP_dev_alloc_skb(unsigned size)
{
	static int suppress_message;
	struct sk_buff *skb = NULL;
	unsigned long irq_state;

	ASSERT(size + packet_delivery_offset <= SK98_EXPECTED_PACKET_SIZE);

	spin_lock_irqsave(&queue_lock, irq_state);
	if (device_is_mapped) {
		mb();
		skb = get_pool_skb();
		if (skb == NULL) {
			/* Uh oh... */
			if (!suppress_message)
				printk("<0>Out of skbs?\n");
			suppress_message = 1000;
			map_header.drop_counter++;
		} else {
			if (suppress_message > 0)
				suppress_message--;
			size = SKB_DATA_ALIGN(size);
			/* Clear every field that precedes truesize. */
			memset(skb, 0, offsetof(struct sk_buff, truesize));
			if (get_token(skb) < 0) {
				/* About to drop a packet; get_token()
				   has already produced the warning. */
				release_skb(skb);
				skb = NULL;
			} else {
				skb->truesize = size + sizeof(skb[0]);
				atomic_set(&skb->users, 1);
				skb->head = NULL;
				skb->data = NULL;
				skb->tail = NULL;
				skb->end = (void *)SK98_EXPECTED_PACKET_SIZE;
			}
		}
	}
	spin_unlock_irqrestore(&queue_lock, irq_state);

	return skb;
}

void NP_dev_kfree_skb(struct sk_buff *skb)
{
	ASSERT(skb->head == NULL);
	ASSERT(skb->end == (void *)SK98_EXPECTED_PACKET_SIZE);
	complete_token(SKB_TOKEN(skb), 0, 0, -1);
	put_page(SKB_PAGE(skb));
	SKB_PAGE(skb) = (void *)0x00120304;
	SKB_TOKEN(skb) = 0xff160304;
	release_skb(skb);
}

void NP_netif_rx(struct sk_buff *skb)
{
	ASSERT(skb->head == NULL);
	ASSERT(skb->end == (void *)SK98_EXPECTED_PACKET_SIZE);
	complete_token(SKB_TOKEN(skb),
		       SKB_TSTAMP(skb),
		       skb->len,
		       skb->dev->ifindex);
	put_page(SKB_PAGE(skb));
	SKB_PAGE(skb) = (void *)0x00150204;
	SKB_TOKEN(skb) = 0xff170304;
	release_skb(skb);
}

struct page *NP_skb_page(struct sk_buff *skb)
{
	return SKB_PAGE(skb);
}

/* Return the offset, within the page given by NP_skb_page(), at which
 * packet data for this skb is placed: the buffer's offset within its
 * page, plus 8, plus the user-requested packet_delivery_offset.
 *
 * Asserts that the device is mapped and that the token is in range. */
unsigned long NP_skb_page_off(struct sk_buff *skb)
{
	unsigned token = SKB_TOKEN(skb);
	unsigned in_page;

	ASSERT(device_is_mapped);
	/* token is unsigned, so only the upper bound can fail. */
	ASSERT(token <= max_expected_token);
	ASSERT(TOKEN_TO_OFF(token) % SK98_EXPECTED_PACKET_SIZE == 0);

	in_page = TOKEN_TO_OFF(token) % PAGE_SIZE;
	return in_page + 8 + packet_delivery_offset;
}

static int ioctl_method(struct inode *ino, struct file *filp,
			unsigned command, unsigned long data)
{
	int r, x;

	switch (command) {
	case SK98_IOCTL_MAP:
	{
		struct sk98_ioctl_map m;
		if (copy_from_user(&m, (const void *)data, sizeof(m))) {
			return -EFAULT;
		}
		if (m.version != SK98_CURRENT_VERSION) {
			printk(KERN_DEBUG "Userspace expected sk98 version %d, but we only support version %d.\n",
			       m.version,
			       SK98_CURRENT_VERSION);
			return -EINVAL;
		}
		if (m.offset >= 256) {
			printk("<0>Attempt to set implausible packet delivery offset %x.\n",
			       m.offset);
			return -EINVAL;
		}
		packet_delivery_offset = m.offset;
		m.len &= ~(PAGE_SIZE-1);
		nr_pages = m.len / PAGE_SIZE;
		max_expected_token = m.len / SK98_EXPECTED_PACKET_SIZE - 1;
		user_pages = kmalloc(sizeof(*user_pages) * nr_pages,
				     GFP_KERNEL);
		if (user_pages == NULL) {
			return -ENOMEM;
		}
		printk("user_pages at %p, nr_pages %x, m.len %lx.\n", user_pages,
		       nr_pages, m.len);
		r = get_user_pages(current,
				   current->mm,
				   (unsigned long)m.start_addr,
				   nr_pages,
				   1,
				   0,
				   user_pages,
				   NULL);
		if (r < 0) {
			/* Tear down what we've got and return the error */
			for (x = 0; x < nr_pages; x++)
				if (user_pages[x] != NULL)
					page_cache_release(user_pages[x]);
			kfree(user_pages);
			return r;
		}

		/* Okay, so we've now got the pages ready for
		   delivery.  This should mean that there's no way for
		   us to fail from this point on, so set up the ring
		   control structures. */
		memset(&map_header, 0, sizeof(map_header));
		for (x = 0;
		     x < nr_pages * PAGE_SIZE / SK98_EXPECTED_PACKET_SIZE;
		     x++) {
			if (map_header.u2k_prod >= SK98_RING_SIZE - 1) {
				printk("can't use all tokens provided by userspace because there were too many");
				break;
			}
			map_header.u2k_tokens[map_header.u2k_prod++] = x;
		}
		mb();
		device_is_mapped = 1;
		return 0;
	}
	default:
		return -EINVAL;
	}
}

/* Urk */
/* Translate a virtual address to its struct page by walking the page
 * tables of current->mm by hand (pgd, then pud when HAVE_PUD is
 * defined, then pmd, then pte).
 *
 * Panics if any level is empty, if the pmd is a large-page entry, or
 * if the pte cannot be mapped.  The pte itself is not checked for
 * presence before being converted with pte_page().
 *
 * NOTE(review): the only caller in this file passes the address of
 * the kernel-side map_header, yet the walk starts from current->mm
 * rather than the kernel page tables.  Presumably this relies on
 * kernel mappings being present in every process's pgd -- confirm. */
static struct page *virt_to_page_full(unsigned long address)
{
        pgd_t *pgd = pgd_offset(current->mm, address);
        pmd_t *pmd;
	pte_t *pte;
	pte_t res;
	struct page *page;
#ifdef HAVE_PUD
	pud_t *pud;
#endif

        if (pgd_none(*pgd))
                panic("pgd is none?\n");
#ifdef HAVE_PUD
	/* Newer kernels: there is a pud level between pgd and pmd. */
	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		panic("pud is none?\n");
        pmd = pmd_offset(pud, address);
#else
	/* Older kernels: the pmd hangs directly off the pgd. */
	pmd = pmd_offset(pgd, address);
#endif
        if (pmd_none(*pmd))
                panic("pmd is none?\n");
        if (pmd_large(*pmd))
		panic("Unexpected superpage\n");
	pte = pte_offset_map(pmd, address);
	if (!pte)
		panic("Map failed?\n");
	/* Copy the entry out so the temporary pte mapping can be
	   dropped before it is used. */
	res = *pte;
	pte_unmap(pte);
        page = pte_page(res);
	return page;
}

static struct page *nopage_method(struct vm_area_struct *vma,
				  unsigned long address,
				  int *type)
{
	struct page *p;

	p = virt_to_page_full((unsigned long)&map_header + address - vma->vm_start);
	get_page(p); /* We really never want this page to go free, so
			just get it twice. */
	get_page(p);
	if (type)
		*type = VM_FAULT_MINOR;
	return p;
}

/* VM operations for mappings of the device: only page faults are
   handled, by nopage_method(). */
static struct vm_operations_struct vm_ops = {
	.nopage = nopage_method,
};

/* mmap() handler.  Installs vm_ops on the new mapping, so its pages
 * are supplied lazily by nopage_method(), and marks the file as
 * accessed.  Nothing is mapped up front.  Always returns 0. */
static int mmap_method(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &vm_ops;
	file_accessed(file);

	return 0;
}

/* poll() handler.  Registers the caller on waitq and reports POLLIN
 * when the k2u ring's producer and consumer indices differ, i.e. when
 * there are completed entries userspace has not yet consumed.
 * Returns 0 otherwise. */
static unsigned int poll_method(struct file *filp,
				struct poll_table_struct *pts)
{
	poll_wait(filp, &waitq, pts);

	if (atomic_read((atomic_t *)&map_header.k2u_prod) ==
	    map_header.k2u_cons)
		return 0;

	return POLLIN;
}

/* File operations implemented by the device. */
struct file_operations file_ops = {
	.open    = open_method,
	.release = close_method,
	.ioctl   = ioctl_method,
	.mmap    = mmap_method,
	.poll    = poll_method,
};

/* Misc-device registration record: device name "sk98", dynamically
   assigned minor number, operations taken from file_ops. */
static struct miscdevice miscdev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name  = "sk98",
	.fops  = &file_ops,
};

/* Register the sk98 misc device.  Returns whatever misc_register()
 * returns. */
int sk98_uspace_if_init(void)
{
	int rc = misc_register(&miscdev);

	return rc;
}

void sk98_uspace_if_cleanup(void)
{
	unsigned long flags;
	spin_lock_irqsave(&free_skb_lock, flags);
	while (skb_pool_head) {
		struct sk_buff *n;
		n = skb_pool_head->next;
		kfree(skb_pool_head);
		skb_pool_head = n;
	}
	spin_unlock_irqrestore(&free_skb_lock, flags);
	misc_deregister(&miscdev);
}
