/*
**  pth_sched.c -- Pth thread scheduler
**
**  Copyright (c) 1999 Ralf S. Engelschall <rse@engelschall.com>
**
**  This file is part of GNU Pth, a non-preemptive thread scheduling
**  library which can be found at http://www.gnu.org/software/pth/.
**
**  This library is free software; you can redistribute it and/or
**  modify it under the terms of the GNU Lesser General Public
**  License as published by the Free Software Foundation; either
**  version 2 of the License, or (at your option) any later version.
**
**  This library is distributed in the hope that it will be useful,
**  but WITHOUT ANY WARRANTY; without even the implied warranty of
**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
**  Lesser General Public License for more details.
**
**  You should have received a copy of the GNU Lesser General Public
**  License along with this library; if not, write to the Free Software
**  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
**  USA, or contact Ralf S. Engelschall <rse@engelschall.com>.
*/

#include "pth_p.h"

intern pth_t        pth_main;       /* the main thread                       */
intern pth_t        pth_sched;      /* the permanent scheduler thread        */
intern pth_t        pth_current;    /* the currently running thread          */
intern pth_pqueue_t pth_NQ;         /* queue of new threads                  */
intern pth_pqueue_t pth_RQ;         /* queue of threads ready to run         */
intern pth_pqueue_t pth_WQ;         /* queue of threads waiting for an event */
intern pth_pqueue_t pth_DQ;         /* queue of terminated threads           */

/*
 * Has the scheduler's signal been raised?  Set to TRUE by
 * pth_sched_eventmanager_sighandler() and reset to FALSE by
 * pth_sched_eventmanager() when it hands the signal to a waiting thread.
 *
 * NOTE(review): this flag is written from a signal handler and should
 * therefore be `volatile sig_atomic_t' as well.  It has external linkage,
 * so any `extern' declaration outside this file would have to change in
 * lockstep -- confirm before changing the type.
 */
int          pth_sigraised;

/*
 * Signal pipe variables -- avoid nasty race to block!
 *
 * pth_sigpipe[0] is watched by select() in the event manager and
 * pth_sigpipe[1] is written by the signal handler, but only while
 * pth_sigfdwrite is TRUE.  pth_sigfdwrite is shared with the asynchronous
 * signal handler, hence volatile sig_atomic_t.
 */
static int pth_sigpipe[2];
static volatile sig_atomic_t pth_sigfdwrite;

/* signal number for which pth_scheduler() installs the handler below */
#define RT_SIG_NUM 50

intern void pth_sched_eventmanager_sighandler(int sig, 
                                              siginfo_t *info, 
                                              void *c);

/* initialize the scheduler ingredients */
intern void pth_scheduler_init(void)
{
    pth_sigfdwrite = FALSE;
    /* create the internal signal pipe */
    if (pipe(pth_sigpipe) == -1) {
        perror("pipe");
        abort();
    }
    pth_fdmode(pth_sigpipe[0], PTH_FDMODE_NONBLOCK);
    pth_fdmode(pth_sigpipe[1], PTH_FDMODE_NONBLOCK);

    pth_sigraised = FALSE;

    /* initialize the essential threads */
    pth_sched   = NULL;
    pth_current = NULL;

    /* initalize the thread queues */
    pth_pqueue_init(&pth_NQ);
    pth_pqueue_init(&pth_RQ);
    pth_pqueue_init(&pth_WQ);
    pth_pqueue_init(&pth_DQ);
}

/* drop all threads (except for the currently active one) */
intern void pth_scheduler_drop(void)
{
    pth_t t;

    /* clear the new queue */
    while ((t = pth_pqueue_delmax(&pth_NQ)) != NULL);
        pth_tcb_free(t);
    pth_pqueue_init(&pth_NQ);

    /* clear the ready queue */
    while ((t = pth_pqueue_delmax(&pth_RQ)) != NULL);
        pth_tcb_free(t);
    pth_pqueue_init(&pth_RQ);

    /* clear the waiting queue */
    while ((t = pth_pqueue_delmax(&pth_WQ)) != NULL);
        pth_tcb_free(t);
    pth_pqueue_init(&pth_WQ);

    /* clear the dead queue */
    while ((t = pth_pqueue_delmax(&pth_DQ)) != NULL);
        pth_tcb_free(t);
    pth_pqueue_init(&pth_DQ);
    return;
}

/*
 * Tear the scheduler down again: release all queued threads via
 * pth_scheduler_drop() and close both ends of the internal signal pipe.
 */
intern void pth_scheduler_kill(void)
{
    int end;

    /* get rid of every thread that is still queued */
    pth_scheduler_drop();

    /* close the read end first, then the write end, of the signal pipe */
    for (end = 0; end < 2; end++)
        close(pth_sigpipe[end]);
}


/*
 * The heart of this library: the thread scheduler.
 *
 * Runs as the body of the scheduler thread (pth_sched).  After marking
 * itself as PTH_STATE_SCHEDULER and installing
 * pth_sched_eventmanager_sighandler() for RT_SIG_NUM, it loops forever:
 *
 *   1. move all threads from the new queue to the ready queue,
 *   2. take the top thread off the ready queue and switch into it,
 *   3. when control comes back, file that thread according to its state
 *      (dead, waiting, or still ready),
 *   4. let pth_sched_eventmanager() wake up waiting threads.
 *
 * No per-thread or scheduler run-time accounting is done here; only the
 * `snapshot' time handed to the event manager is maintained.
 *
 * dummy is unused.  The function does not return during normal operation;
 * the process exits with status 1 if the signal handler cannot be set.
 */
intern void *pth_scheduler(void *dummy)
{
    pth_time_t snapshot;
    pth_t t;

    (void)dummy;

    /* mark this thread as the special scheduler thread */
    pth_sched->state = PTH_STATE_SCHEDULER;

    /* initialize the snapshot time for bootstrapping the loop */
    pth_time_set(&snapshot, PTH_TIME_NOW);

    if ( set_rt_sighandler(RT_SIG_NUM, pth_sched_eventmanager_sighandler) < 0 )
    {
        /* diagnostics belong on stderr, like the perror() in the init code */
        fprintf(stderr, "Couldn't set rt signal handler\n");
        exit(1);
    }

    for ( ; ; ) 
    {
        /*
         * Move threads from new queue to ready queue and give
         * them maximum priority so they start immediately
         */
        while ( (t = pth_pqueue_delmax(&pth_NQ)) != NULL ) 
        {
            t->state = PTH_STATE_READY;
            pth_pqueue_insert(&pth_RQ, pth_pqueue_favorite_prio(&pth_RQ), t);
        }

        /* Find next thread in ready queue; nothing runnable means wait */
        if ( (pth_current = pth_pqueue_delmax(&pth_RQ)) == NULL ) goto wait;

        /* ** ENTERING THREAD ** - by switching the machine context */
        pth_mctx_switch(&pth_sched->mctx, &pth_current->mctx);

        /* back in the scheduler: refresh the reference time */
        pth_time_set(&snapshot, PTH_TIME_NOW);

        if ( pth_current->state == PTH_STATE_DEAD ) 
        {
            /*
             * A dead thread nobody can join is released right away;
             * a joinable one is parked in the dead queue instead.
             */
            if ( !pth_current->joinable )
            {
                pth_tcb_free(pth_current);
            }
            else
            {
                pth_pqueue_insert(&pth_DQ, PTH_PRIO_STD, pth_current);
            }
            pth_current = NULL;
        } 
        else if ( pth_current->state == PTH_STATE_WAITING ) 
        {
            /* the thread blocked on an event: park it in the waiting queue */
            pth_pqueue_insert(&pth_WQ, pth_current->prio, pth_current);
            pth_current = NULL;
        }

        /*
         * migrate old threads in ready queue into higher
         * priorities to avoid starvation and insert last running
         * thread back into this queue, too.
         */
        pth_pqueue_increase(&pth_RQ);
        if ( pth_current != NULL )
        {
            pth_pqueue_insert(&pth_RQ, pth_current->prio, pth_current);
        }

    wait:
        /*
         * Manage the events in the waiting queue, i.e. decide whether their
         * events occurred and move them to the ready queue. But wait only when
         * we've already no new or ready threads.
         */
        pth_sched_eventmanager(&snapshot, 
                               pth_pqueue_elements(&pth_RQ) != 0 ||
                               pth_pqueue_elements(&pth_NQ) != 0);
    }

    /* not reached; present only because the function returns a value */
    return NULL;
}

/*
 * Look whether some events already occurred and move
 * corresponding threads from waiting queue back to ready queue.
 *
 * The work is done in three steps:
 *   1. walk all event rings of the waiting threads; events that can be
 *      decided immediately are marked as occurred, file descriptors (and
 *      the internal signal pipe) are collected into fd sets, and the
 *      earliest pending timer is remembered;
 *   2. call select() on the collected fd sets (unless there is nothing to
 *      select on and we must not block anyway);
 *   3. walk the waiting queue again, mark fd/select/signal events according
 *      to the select() result, and move every thread with an occurred event
 *      or a pending cancel request to the ready queue.
 *
 * now     reference time; timer events earlier than it have elapsed and
 *         it is subtracted from the next timer to get the select() timeout
 * dopoll  non-zero: do not block, select() gets a zero timeout;
 *         zero: block until an fd is ready, the next timer elapses, or
 *         (without any timer) indefinitely
 */
intern void pth_sched_eventmanager(pth_time_t *now, int dopoll)
{
    pth_t nexttimer_thread = NULL;
    pth_event_t nexttimer_ev = NULL;
    pth_time_t nexttimer_value;
    pth_event_t evh, ev;
    pth_t t, tlast;
    int any_occurred;
    fd_set rfds, wfds, efds;
    struct timeval delay, *pdelay;
    int fdmax = -1, rc, n;
    int sigpipe_armed = FALSE; /* was the signal pipe put into rfds? */
    char minibuf[128]; /* used to drain the signal pipe after unblocking */
    pid_t pid;

    FD_ZERO(&rfds);
    FD_ZERO(&wfds);
    FD_ZERO(&efds);

    /* step 1: for all threads in the waiting queue... */
    for ( t  = pth_pqueue_head(&pth_WQ); 
          t != NULL; 
          t  = pth_pqueue_walk(&pth_WQ, t) ) 
    {
        /* a cancelled thread has to be woken up, so we must not block */
        if ( t->cancelreq == TRUE ) dopoll = TRUE;

        if ( t->events == NULL ) continue;
        ev = evh = t->events;

        /* ...walk its (circular) event ring once */
        do {
            if ( !ev->ev_occurred ) 
            {
                /* Filedescriptor I/O */
                if ( ev->ev_type == PTH_EVENT_FD ) 
                {
                    /* filedescriptors are checked later all at once.
                       Here we only assemble them in the fd sets */
                    if ( ev->ev_goal & PTH_UNTIL_FD_READABLE )
                        FD_SET(ev->ev_args.FD.fd, &rfds);
                    if ( ev->ev_goal & PTH_UNTIL_FD_WRITEABLE )
                        FD_SET(ev->ev_args.FD.fd, &wfds);
                    if ( ev->ev_goal & PTH_UNTIL_FD_EXCEPTION )
                        FD_SET(ev->ev_args.FD.fd, &efds);
                    if ( fdmax < ev->ev_args.FD.fd ) fdmax = ev->ev_args.FD.fd;
                }
                /* Filedescriptor Set Select I/O */
                else if ( ev->ev_type == PTH_EVENT_SELECT ) 
                {
                    /* filedescriptors are checked later all at once.
                       Here we only merge the fd sets. */
                    pth_util_fds_merge(ev->ev_args.SELECT.nfd, 
                                       ev->ev_args.SELECT.rfds, &rfds,
                                       ev->ev_args.SELECT.wfds, &wfds,
                                       ev->ev_args.SELECT.efds, &efds);
                    if ( fdmax < ev->ev_args.SELECT.nfd-1 )
                        fdmax = ev->ev_args.SELECT.nfd-1;
                }
                /* Signal Set */
                else if ( ev->ev_type == PTH_EVENT_SIGS ) 
                {
                    if ( pth_sigraised == TRUE ) 
                    {
                        /* a signal is already pending: consume it */
                        pth_sigraised = FALSE;
                        ev->ev_occurred = TRUE;
                    }
                    else
                    {
                        /* from now on the handler wakes select() via the pipe */
                        pth_sigfdwrite = TRUE;

                        /*
                         * The handler may have run between the test above
                         * and arming the pipe.  Re-read the flag through a
                         * volatile lvalue so the compiler cannot fold this
                         * check into the earlier one.
                         */
                        if ( *(volatile int *)&pth_sigraised == TRUE )
                        {
                            pth_sigfdwrite  = FALSE;
                            pth_sigraised = FALSE;
                            ev->ev_occurred = TRUE;
                        }

                        FD_SET(pth_sigpipe[0], &rfds);
                        sigpipe_armed = TRUE;
                        if ( fdmax < pth_sigpipe[0]) fdmax = pth_sigpipe[0];
                    }
                }
                /* Timer */
                else if ( ev->ev_type == PTH_EVENT_TIME ) 
                {
                    if ( pth_time_cmp(&(ev->ev_args.TIME.tv), now) < 0 )
                    {
                        ev->ev_occurred = TRUE;
                    } 
                    else if ( nexttimer_thread == NULL ||
                              pth_time_cmp(&(ev->ev_args.TIME.tv), 
                                           &nexttimer_value) < 0 ) 
                    {
                        /* remember the earliest timer that is still pending */
                        nexttimer_thread = t;
                        nexttimer_ev = ev;
                        pth_time_set(&nexttimer_value, &(ev->ev_args.TIME.tv));
                    }
                }
                /* Mutex Release */
                else if ( ev->ev_type == PTH_EVENT_MUTEX &&
                          !(ev->ev_args.MUTEX.mutex->mx_state & 
                            PTH_MUTEX_LOCKED) )
                {
                    ev->ev_occurred = TRUE; 
                }
                /* Condition Variable Signal */
                else if ( ev->ev_type == PTH_EVENT_COND &&
                          ev->ev_args.COND.cond->cn_state & PTH_COND_SIGNALED ) 
                {
                    if ( ev->ev_args.COND.cond->cn_state & PTH_COND_BROADCAST )
                    {
                        /* broadcast: every waiter gets the event */
                        ev->ev_occurred = TRUE;
                    } 
                    else if ( !(ev->ev_args.COND.cond->cn_state & 
                                PTH_COND_HANDLED) ) 
                    {
                        /* plain signal: only the first waiter gets it */
                        ev->ev_args.COND.cond->cn_state |= PTH_COND_HANDLED;
                        ev->ev_occurred = TRUE; 
                    }
                }
                /* Thread Termination */
                else if ( ev->ev_type == PTH_EVENT_TID && 
                          ((ev->ev_args.TID.tid == NULL && 
                            pth_pqueue_elements(&pth_DQ) > 0) ||
                           (ev->ev_args.TID.tid != NULL && 
                            ev->ev_args.TID.tid->state == ev->ev_goal)) )
                {
                    ev->ev_occurred = TRUE; 
                }
                /* Process Termination */
                else if ( ev->ev_type == PTH_EVENT_PID ) 
                {
                    /* non-blocking waitpid(), retried while interrupted */
                    while ( (pid = pth_sc(waitpid)(
                        ev->ev_args.PID.pid, 
                        ev->ev_args.PID.status, 
                        ev->ev_args.PID.flags|WNOHANG)) < 0 && errno == EINTR);
                    if ( pid > 0 ) ev->ev_occurred = TRUE; 
                }
                /* Custom Event Function */
                else if ( ev->ev_type == PTH_EVENT_FUNC &&
                          ev->ev_args.FUNC.func(ev->ev_args.FUNC.func_arg) )
                {
                    ev->ev_occurred = TRUE; 
                }
            }
            /* something is already deliverable, so do not block below */
            if ( ev->ev_occurred ) dopoll = TRUE;
        } while ((ev = ev->ev_next) != evh);
    }

    /* step 2: choose the select() timeout */
    if ( dopoll ) 
    {
        /* a zero timeout makes select() return immediately */
        pth_time_set(&delay, PTH_TIME_ZERO);
        pdelay = &delay;
    }
    else if ( nexttimer_thread ) 
    {
        /* sleep at most until the earliest pending timer */
        pth_time_set(&delay, &nexttimer_value);
        pth_time_sub(&delay, now);
        pdelay = &delay;
    }
    else 
    {
        /* nothing to wait for but I/O: block without timeout */
        pdelay = NULL;
    }

    /*
     * Skip select() only when we must not block and there are no
     * filedescriptors to look at.
     * NOTE(review): if select() fails with an error other than EINTR the
     * fd sets are evaluated as-is in step 3 -- confirm this is intended.
     */
    rc = -1;
    if ( !dopoll || fdmax != -1 )
    {
        while ((rc = pth_sc(select)(fdmax+1, &rfds, &wfds, &efds, pdelay)) < 0 
               && errno == EINTR) ; 
    }

    /*
     * Stop the handler from writing into the pipe and, if the pipe was
     * handed to select() in this pass, drain it.  The drain does not depend
     * on pth_sigraised: the flag may already have been consumed by another
     * event above while the wake-up byte is still in the pipe, and a byte
     * left behind would make every later select() return at once.  The
     * flag, not the pipe content, carries the "signal raised" information.
     */
    pth_sigfdwrite = FALSE;
    if ( sigpipe_armed )
    {
        while (pth_sc(read)(pth_sigpipe[0], minibuf, sizeof(minibuf)) == 
               sizeof(minibuf)) ;
    }

    /* when the timer elapsed then handle it */
    if ( !dopoll && rc == 0 && nexttimer_thread != NULL ) 
    {
        nexttimer_ev->ev_occurred = TRUE;
    }

    /* step 3: for all threads in the waiting queue... */
    t = pth_pqueue_head(&pth_WQ); 
    while (t != NULL) {
        /* do the late handling of the fd I/O and signal
           events in the waiting event ring */
        any_occurred = FALSE;
        if ( t->events != NULL ) 
        {
            ev = evh = t->events;
            do 
            {
                if ( !ev->ev_occurred ) 
                {
                    /* Filedescriptor I/O */
                    if ( ev->ev_type == PTH_EVENT_FD &&
                         ((ev->ev_goal & PTH_UNTIL_FD_READABLE &&
                           FD_ISSET(ev->ev_args.FD.fd, &rfds)) ||
                          (ev->ev_goal & PTH_UNTIL_FD_WRITEABLE &&
                           FD_ISSET(ev->ev_args.FD.fd, &wfds)) ||
                          (ev->ev_goal & PTH_UNTIL_FD_EXCEPTION &&
                           FD_ISSET(ev->ev_args.FD.fd, &efds))) ) 
                    {
                        ev->ev_occurred = TRUE;
                    }
                    /* Filedescriptor Set I/O */
                    else if ( ev->ev_type == PTH_EVENT_SELECT &&
                              pth_util_fds_test(
                                  ev->ev_args.SELECT.nfd, 
                                  ev->ev_args.SELECT.rfds, &rfds,
                                  ev->ev_args.SELECT.wfds, &wfds,
                                  ev->ev_args.SELECT.efds, &efds) ) 
                    {
                        n = pth_util_fds_select(ev->ev_args.SELECT.nfd, 
                                                ev->ev_args.SELECT.rfds, &rfds,
                                                ev->ev_args.SELECT.wfds, &wfds,
                                                ev->ev_args.SELECT.efds, &efds);
                        /* report the result to the waiter if it asked for it */
                        if (ev->ev_args.SELECT.n != NULL)
                            *(ev->ev_args.SELECT.n) = n;
                        ev->ev_occurred = TRUE;
                    }
                    /* Signal Set: a signal arrived while we were in select() */
                    else if ( ev->ev_type == PTH_EVENT_SIGS && pth_sigraised ) 
                    {
                        pth_sigraised = FALSE;
                        ev->ev_occurred = TRUE;
                    }
                }
                else if ( ev->ev_type == PTH_EVENT_COND && 
                          ev->ev_args.COND.cond->cn_state & PTH_COND_SIGNALED )
                {
                    /* the condition signal was delivered: reset the variable */
                    ev->ev_args.COND.cond->cn_state &= 
                       ~(PTH_COND_SIGNALED|PTH_COND_BROADCAST|PTH_COND_HANDLED);
                }

                if ( ev->ev_occurred ) any_occurred = TRUE;
            } while ((ev = ev->ev_next) != evh);
        }

        /* walk to next thread in waiting queue before tlast may be unlinked */
        tlast = t;
        t = pth_pqueue_walk(&pth_WQ, t);
        
        /* 
         * Move the thread just examined to the ready queue when any of its
         * events occurred or it has a cancel request.  It is inserted with
         * a slightly increased queue priority to give it a better chance
         * of being scheduled immediately; otherwise the last running
         * thread might get the CPU again right away, which is usually not
         * what we want, because pth_yield() is often used to give others
         * a chance.
         */
        if ( tlast->cancelreq || any_occurred ) 
        {
            pth_pqueue_delete(&pth_WQ, tlast);
            tlast->state = PTH_STATE_READY;
            pth_pqueue_insert(&pth_RQ, tlast->prio+1, tlast);
        }
    }
}

/* default hook: accepts the value and does nothing with it */
void dummy_fn(unsigned int x)
{
    (void)x;
}

/*
 * Hook called by pth_sched_eventmanager_sighandler() with the integer value
 * carried by the signal; points at the no-op dummy_fn until replaced.
 */
void (*pth_async_sighandler)(unsigned int) = dummy_fn;

/*
 * Signal handler installed by the scheduler.
 *
 * Passes the integer value carried by the signal to the
 * pth_async_sighandler hook, records that a signal was raised and, while
 * the event manager has armed the signal pipe (pth_sigfdwrite), writes one
 * byte into it so that a select() watching the pipe's read end returns.
 *
 * sig and c are unused; info supplies si_value.sival_int.
 * errno is saved and restored so that code interrupted by the signal
 * never sees an errno value changed by the hook or by write().
 */
intern void pth_sched_eventmanager_sighandler(int sig, 
                                              siginfo_t *info, 
                                              void *c)
{
    int saved_errno = errno;

    (void)sig;
    (void)c;

    (*pth_async_sighandler)(info->si_value.sival_int);

    pth_sigraised = TRUE;
    if ( pth_sigfdwrite ) 
    {
        /* the byte's value is irrelevant, but it must not be indeterminate */
        char wakeup = 0;

        /* a failed write is ignored on purpose: this is only a wake-up */
        (void)pth_sc(write)(pth_sigpipe[1], &wakeup, 1);
    }

    errno = saved_errno;
}

