/* ########################################################################## */
/* (C) UPMC, 2010-2011                                                        */
/*     Authors:                                                               */
/*       Jean-Pierre Lozi <jean-pierre.lozi@lip6.fr>                          */
/*       Gaël Thomas <gael.thomas@lip6.fr>                                    */
/*       Florian David <florian.david@lip6.fr>                                */
/*       Julia Lawall <julia.lawall@lip6.fr>                                  */
/*       Gilles Muller <gilles.muller@lip6.fr>                                */
/* -------------------------------------------------------------------------- */
/* ########################################################################## */

/*
 * =============================================================================
 * Code based on Tudor David's libslock library.
 * =============================================================================
 */

#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>
#include <assert.h>

#include "liblock.h"
#include "liblock-fatal.h"

#include "extra-libslock-utils.h"
#include "extra-libslock-platform-defs.h"
#include "extra-libslock-atomic-ops.h"


/*
 * Value written into the `cur`/`nxt` fields of a per-socket lock by the
 * initialisation and lock/unlock code in this file, and compared against in
 * the lock path. 2147483647 is the largest value an int32_t can hold.
 */
//#define NB_TICKETS_LOCAL 1024
#define NB_TICKETS_LOCAL 2147483647
/*
 * Back-off parameters used by htlock_wait_ticket(). This file defines them
 * only when AMD48B or NIAGARA2 is set.
 * NOTE(review): there is no fallback branch here -- confirm that another
 * header supplies these macros on every other supported platform.
 */
#ifdef AMD48B
#define TICKET_BASE_WAIT 512
#define TICKET_MAX_WAIT  4095
#define TICKET_WAIT_NEXT 64
#elif defined(NIAGARA2)
#define TICKET_BASE_WAIT 16
#define TICKET_MAX_WAIT  63
#define TICKET_WAIT_NEXT 4
#endif


/*
 * Private per-lock state of the extra_hticket implementation.
 *
 *   lock_id    - index of this lock's entry in the htls[] array.
 *   posix_lock - mutex handed to pthread_cond_wait()/pthread_cond_timedwait()
 *                by cond_timedwait().
 *   __pad      - padding sized by pad_to_cache_line() from the combined size
 *                of the two fields above.
 */
struct liblock_impl {
    unsigned int                       lock_id;
    pthread_mutex_t                    posix_lock;
    char __pad[pad_to_cache_line(sizeof(unsigned int) +
                                 sizeof(pthread_mutex_t))];
};

/*
 * Global (top-level) ticket pair of a hierarchical lock.
 *
 *   nxt - ticket dispenser; read and advanced with FAI_U32 in the lock path.
 *   cur - ticket currently allowed to proceed; incremented in the unlock path.
 *
 * The padding is CACHE_LINE_SIZE minus the 8 bytes taken by the two fields.
 */
typedef struct htlock_global
{
    volatile uint32_t                  nxt;
    volatile uint32_t                  cur;
    uint8_t padding[CACHE_LINE_SIZE - 8];
} htlock_global_t;

/*
 * Per-socket ticket pair of a hierarchical lock.
 *
 * Same field layout as htlock_global_t, but the counters are signed: the lock
 * path tests the value drawn from `nxt` against negative numbers, and both
 * fields are set to NB_TICKETS_LOCAL at various points.
 *
 * The padding is CACHE_LINE_SIZE minus the 8 bytes taken by the two fields.
 */
typedef struct htlock_local
{
    volatile int32_t                   nxt;
    volatile int32_t                   cur;
    uint8_t padding[CACHE_LINE_SIZE - 8];
} htlock_local_t;

/*
 * One hierarchical ticket lock: a pointer to its global ticket pair plus one
 * pointer per socket to the local ticket pair. The `local` array is indexed
 * with htlock_node_mine by the lock and unlock functions.
 */
typedef struct ALIGNED(CACHE_LINE_SIZE) htlock
{
    htlock_global_t                   *global;
    htlock_local_t                    *local[NUMBER_OF_SOCKETS];
} htlock_t;

/* Table of all hierarchical locks; allocated in do_liblock_init_library()
 * and indexed by liblock_impl.lock_id. */
static htlock_t* htls;
/* Per-thread socket index and core id; both are assigned in
 * do_liblock_on_thread_start(). */
static __thread uint32_t htlock_node_mine, htlock_id_mine;

/* Next free lock_id; advanced atomically in do_liblock_init_lock(). */
static volatile unsigned int cur_lock_number = 0;

/*
 * Busy-waits for roughly `cycles` cycles.
 *
 * Requests below 256 are served by executing PAUSE cycles/6 times. Larger
 * requests poll getticks() until it reaches the start value plus `cycles`
 * minus 130.
 */
static inline void wait_cycles(uint64_t cycles)
{
    if (cycles >= 256)
    {
        /* Long wait: spin on the tick counter until the deadline passes. */
        ticks first_reading = getticks();
        ticks deadline = first_reading + cycles - 130;

        while (getticks() < deadline)
        {
            /* nothing to do but poll again */
        }
        return;
    }

    /* Short wait: a fixed number of PAUSE instructions. */
    {
        uint64_t remaining = cycles / 6;

        while (remaining != 0)
        {
            remaining--;
            PAUSE;
        }
    }
}

/*
 * Returns the absolute difference |a - b| of two unsigned 32-bit values,
 * always subtracting the smaller operand from the larger one.
 */
static inline uint32_t sub_abs(const uint32_t a, const uint32_t b)
{
    return (a > b) ? (a - b) : (b - a);
}

/*
 * Spins until lock->cur equals `ticket`.
 *
 * On every failed check the distance between lock->cur and `ticket` is
 * computed with sub_abs(). A distance greater than one leads to a nop_rep()
 * delay scaled by that distance; otherwise a short delay is used.
 *
 * With OPTERON_OPTIMIZE the scale factor starts at TICKET_BASE_WAIT, grows by
 * TICKET_BASE_WAIT after each delay (masked with TICKET_MAX_WAIT) and is reset
 * whenever the observed distance changes.
 *
 * lock   - ticket pair to watch.
 * ticket - value of lock->cur that ends the wait.
 */
static inline void htlock_wait_ticket(htlock_local_t* lock,
                                      const uint32_t ticket)
{
#if defined(OPTERON_OPTIMIZE)
    uint32_t wait = TICKET_BASE_WAIT;
    uint32_t distance_prev = 1;

    while (1)
    {
        PREFETCHW(lock);
        int32_t lock_cur = lock->cur;
        if (lock_cur == ticket)
        {
            break;
        }
        /* lock->cur is volatile and is read a second time here, so the
         * distance may be based on a newer value than lock_cur. */
        uint32_t distance = sub_abs(lock->cur, ticket);
        if (distance > 1)
        {
            /* The distance changed since the last delay: restart the
             * back-off from its base value. */
            if (distance != distance_prev)
            {
                distance_prev = distance;
                wait = TICKET_BASE_WAIT;
            }

            nop_rep(distance * wait);
            wait = (wait + TICKET_BASE_WAIT) & TICKET_MAX_WAIT;
        }
        else
        {
            nop_rep(TICKET_WAIT_NEXT);
        }
    }
#else
    while (lock->cur != ticket)
    {
        uint32_t distance = sub_abs(lock->cur, ticket);
        if (distance > 1)
        {
            nop_rep(distance * TICKET_BASE_WAIT);
        }
        else
        {
            PAUSE;
        }
    }
#endif  /* OPTERON_OPTIMIZE */
}

/*
 * Spins until lock->cur equals `ticket`.
 *
 * While the two differ, the distance between them decides the delay: more
 * than one apart waits for distance * 256 cycles via wait_cycles(), otherwise
 * a single PAUSE is issued before re-checking.
 *
 * lock   - ticket pair to watch (callers in this file pass the global pair
 *          cast to htlock_local_t*).
 * ticket - value of lock->cur that ends the wait.
 */
static inline void htlock_wait_global(htlock_local_t* lock,
                                      const uint32_t ticket)
{
    for (;;)
    {
        uint32_t gap;

        if (lock->cur == ticket)
        {
            return;
        }

        gap = sub_abs(lock->cur, ticket);
        if (gap <= 1)
        {
            /* We are next (or the value just changed): re-check quickly. */
            PAUSE;
            continue;
        }

        wait_cycles(gap * 256);
    }
}

/*
 * Acquires the hierarchical ticket lock identified by impl->lock_id.
 *
 * The caller first draws a value from the `nxt` counter of the local lock of
 * its own socket (htlock_node_mine):
 *   - value >= 0 : wait on the local lock until local `cur` equals the value;
 *   - value == -1: wait until local `cur` reads NB_TICKETS_LOCAL, set local
 *                  `nxt` to NB_TICKETS_LOCAL, then take a ticket from the
 *                  global lock and wait for it;
 *   - value < -1 : back off for a time proportional to -value, then draw
 *                  again.
 *
 * NOTE(review): DAF_U32 and FAI_U32 are defined outside this file; presumably
 * an atomic decrement-and-fetch and an atomic fetch-and-increment -- confirm.
 */
static void lock_extra_hticket(struct liblock_impl *impl)
{
    htlock_local_t* localp = htls[impl->lock_id].local[htlock_node_mine];
    int32_t local_ticket;

again_local:
    local_ticket = DAF_U32((uint32_t *)&localp->nxt);
    if (local_ticket < -1)
    {
        PAUSE;
        wait_cycles(-local_ticket * 120);
        PAUSE;
        /* Retrying goes through DAF_U32 again. */
        goto again_local;
    }

    if (local_ticket >= 0)
    {
        htlock_wait_ticket((htlock_local_t*) localp, local_ticket);
    }
    else
    {
        /* local_ticket is exactly -1 here. */
        do
        {
#if defined(OPTERON_OPTIMIZE)
            PREFETCHW(localp);
#endif
        } while (localp->cur != NB_TICKETS_LOCAL);
        localp->nxt = NB_TICKETS_LOCAL;

        htlock_global_t* globalp = htls[impl->lock_id].global;
        uint32_t global_ticket = FAI_U32(&globalp->nxt);

        /* The global pair is passed through a cast to htlock_local_t*; both
         * structs start with the 32-bit fields nxt and cur, in that order. */
        htlock_wait_global((htlock_local_t*) globalp, global_ticket);
    }
}

/*
 * Releases the hierarchical ticket lock identified by impl->lock_id.
 *
 * Reads the local `cur` of the caller's socket, then runs CAS_U32 on the
 * local `nxt` with expected value `cur` and new value 0. If `cur` was 0 or
 * equals the value CAS_U32 returned, local `cur` is set to NB_TICKETS_LOCAL
 * and the global `cur` is incremented; otherwise local `cur` is set to its
 * previous value minus one.
 *
 * NOTE(review): CAS_U32 is defined outside this file; presumably it returns
 * the value found in memory before the swap -- confirm.
 */
static void unlock_extra_hticket(struct liblock_impl *impl)
{
    htlock_local_t* localp = htls[impl->lock_id].local[htlock_node_mine];
#if defined(OPTERON_OPTIMIZE)
    PREFETCHW(localp);
#endif
    int32_t local_cur = localp->cur;
    int32_t local_nxt = CAS_U32((volatile uint32_t *)&localp->nxt,
                                (uint32_t)local_cur, (uint32_t)0);
    if (local_cur == 0 || local_cur == local_nxt)
    {
#if defined(OPTERON_OPTIMIZE)
        PREFETCHW(htls[impl->lock_id].global);
        PREFETCHW(localp);
#endif
        localp->cur = NB_TICKETS_LOCAL;
        /* Plain (non-atomic) increment of the volatile global counter. */
        htls[impl->lock_id].global->cur++;
    }
    else
    {
#if defined(OPTERON_OPTIMIZE)
        PREFETCHW(localp);
#endif
        localp->cur = local_cur - 1;
    }
}

/*
 * Creates the private state of a new extra_hticket lock.
 *
 * Allocates the liblock_impl, initialises its POSIX mutex with default
 * attributes, reserves the next lock_id and (re)builds the htls[] entry for
 * that id: one global ticket pair plus one local ticket pair per socket.
 * The global pair starts at cur = nxt = 0; each local pair starts at
 * cur = NB_TICKETS_LOCAL, nxt = 0.
 *
 * lock, core, attr - not used by this implementation.
 *
 * Returns the new liblock_impl. The process is aborted if a ticket pair
 * cannot be allocated.
 */
static struct liblock_impl *do_liblock_init_lock(extra_hticket)
                               (liblock_lock_t *lock,
                                struct hw_thread *core,
                                pthread_mutexattr_t *attr)
{
    struct liblock_impl *impl =
        liblock_allocate(sizeof(struct liblock_impl));

    pthread_mutex_init(&impl->posix_lock, 0);
    impl->lock_id = __sync_fetch_and_add(&cur_lock_number, 1);

    /* NOTE(review): lock_id is not range-checked against the number of
     * entries allocated for htls -- confirm callers cannot exceed it. */
    htlock_t* htl = &htls[impl->lock_id];

    htl->global = memalign(CACHE_LINE_SIZE, sizeof(htlock_global_t));
    /* Check the pointer that was just allocated (not the table slot address,
     * which can never be NULL). */
    if (htl->global == NULL)
    {
        fprintf(stderr,"Error @ memalign : create htlock\n");
        abort();
    }

    uint32_t s;
    for (s = 0; s < NUMBER_OF_SOCKETS; s++)
    {
#if defined(PLATFORM_NUMA)
        numa_set_preferred(s);
        htl->local[s] = (htlock_local_t*) numa_alloc_onnode(sizeof(htlock_local_t), s);
#else
        htl->local[s] = (htlock_local_t*) malloc(sizeof(htlock_local_t));
#endif
        /* Validate each per-socket allocation before the first write. */
        if (htl->local[s] == NULL)
        {
            fprintf(stderr,"Error @ alloc : create htlock local\n");
            abort();
        }
        htl->local[s]->cur = NB_TICKETS_LOCAL;
        htl->local[s]->nxt = 0;
    }

#if defined(PLATFORM_NUMA)
    numa_set_preferred(htlock_node_mine);
#endif

    htl->global->cur = 0;
    htl->global->nxt = 0;

    MEM_BARRIER;

    return impl;
}

/*
 * Destroys the private state of an extra_hticket lock.
 *
 * The POSIX mutex initialised in do_liblock_init_lock() is destroyed before
 * the liblock_impl is freed. A lock without private state is a no-op.
 *
 * Returns 0 on success, or the error code of pthread_mutex_destroy(), in
 * which case the state is left untouched.
 *
 * NOTE(review): the ticket pairs referenced from htls[lock_id] are not
 * released here -- confirm whether that is intended.
 */
static int do_liblock_destroy_lock(extra_hticket)(liblock_lock_t *lock)
{
    struct liblock_impl *impl = lock->impl;
    int rc;

    if (impl == NULL)
    {
        return 0;
    }

    rc = pthread_mutex_destroy(&impl->posix_lock);
    if (rc != 0)
    {
        return rc;
    }

    free(impl);

    return 0;
}

/*
 * Runs `pending(val)` while holding the lock and returns its result.
 *
 * lock    - lock whose private state is used to acquire and release.
 * pending - function to execute inside the critical section.
 * val     - argument forwarded to `pending`.
 */
static void* do_liblock_execute_operation(extra_hticket)(liblock_lock_t *lock,
                                                    void* (*pending)(void*),
                                                    void *val)
{
    struct liblock_impl *const state = lock->impl;
    void *outcome;

    lock_extra_hticket(state);
    outcome = pending(val);     /* critical section */
    unlock_extra_hticket(state);

    return outcome;
}

/*
 * Initialises the hierarchical lock `htl` using pre-allocated local pairs.
 *
 * A fresh global ticket pair is allocated and zeroed. For every socket s the
 * local pair is taken from locals[s][offset] and set to
 * cur = NB_TICKETS_LOCAL, nxt = 0.
 *
 * htl    - lock to initialise.
 * locals - one array of local ticket pairs per socket.
 * offset - index of this lock's element in each of those arrays.
 *
 * Returns `htl`. The process is aborted if the global pair cannot be
 * allocated.
 */
static htlock_t *create_htlock_no_alloc(htlock_t* htl,
                                        htlock_local_t* locals[NUMBER_OF_SOCKETS],
                                        size_t offset)
{
    htl->global = memalign(CACHE_LINE_SIZE, sizeof(htlock_global_t));
    /* Check the allocation result itself, not the lock pointer. */
    if (htl->global == NULL)
    {
        fprintf(stderr,"Error @ memalign : create htlock\n");
        abort();
    }

    htl->global->cur = 0;
    htl->global->nxt = 0;

    uint32_t s;
    for (s = 0; s < NUMBER_OF_SOCKETS; s++)
    {
        htl->local[s] = locals[s] + offset;
        htl->local[s]->cur = NB_TICKETS_LOCAL;
        htl->local[s]->nxt = 0;
    }

    MEM_BARRIER;
    return htl;
}

/*
 * Library start-up hook: builds the table of hierarchical locks.
 *
 * Allocates htls[] with MAX_LOCKS entries, allocates one array of local
 * ticket pairs per socket (at least 64 elements each) and initialises every
 * table entry with create_htlock_no_alloc(). The process is aborted if an
 * allocation fails.
 */
static void do_liblock_init_library(extra_hticket)()
{
    htls = memalign(CACHE_LINE_SIZE, MAX_LOCKS * sizeof(htlock_t));
    if (htls == NULL)
    {
        fprintf(stderr, "Error @ memalign : init_htlocks\n");
        abort();
    }

    size_t alloc_locks = (MAX_LOCKS < 64) ? 64 : MAX_LOCKS;

    htlock_local_t* locals[NUMBER_OF_SOCKETS];
    uint32_t n;
    for (n = 0; n < NUMBER_OF_SOCKETS; n++)
    {
#if defined(PLATFORM_NUMA)
        numa_set_preferred(n);
#endif
        locals[n] = (htlock_local_t*) calloc(alloc_locks, sizeof(htlock_local_t));
        /* Validate the allocation before writing through it. */
        if (locals[n] == NULL)
        {
            fprintf(stderr, "Error @ calloc : init_htlocks\n");
            abort();
        }
        /* Write to the first word of the array; the value is overwritten when
         * entry 0 is initialised below. Presumably this touches the memory
         * while node n is the preferred one -- TODO confirm. */
        *((volatile int*) locals[n]) = 33;
    }

    /* NOTE(review): the loop above is guarded by PLATFORM_NUMA but this
     * restore is guarded by AMD48B -- confirm the two are meant to differ. */
#if defined(AMD48B)
    numa_set_preferred(htlock_node_mine);
#endif

    uint32_t i;
    for (i = 0; i < MAX_LOCKS; i++)
    {
        create_htlock_no_alloc(htls + i, locals, i);
    }

    MEM_BARRIER;
}

/*
 * Library shutdown hook. The body is empty: nothing allocated by
 * do_liblock_init_library() is released here.
 */
static void do_liblock_kill_library(extra_hticket)()
{}

/*
 * Atomically switches liblock_start_server_threads_by_hand from 1 to 0 and
 * then invokes `callback` when one is supplied. If the flag did not hold 1,
 * fatal() is called first.
 *
 * callback - optional function to run; may be null.
 */
static void do_liblock_run(extra_hticket)(void (*callback)())
{
    const int flag_was_set =
        (__sync_val_compare_and_swap(&liblock_start_server_threads_by_hand,
                                     1, 0) == 1);

    if (!flag_was_set)
        fatal("servers are not managed by hand");

    if (callback != 0)
    {
        callback();
    }
}

/*
 * Initialises the POSIX condition variable embedded in `cond`, using
 * cond->attr when cond->has_attr is set and default attributes otherwise.
 *
 * Returns the result of pthread_cond_init().
 */
static int do_liblock_cond_init(extra_hticket)(liblock_cond_t* cond)
{
    if (!cond->has_attr)
    {
        return pthread_cond_init(&cond->impl.posix_cond, 0);
    }

    return pthread_cond_init(&cond->impl.posix_cond, &cond->attr);
}

/*
 * Waits on `cond` with the ticket lock released.
 *
 * Sequence: take the lock's POSIX mutex, release the ticket lock, wait on the
 * condition variable (with a deadline when `ts` is non-null), drop the POSIX
 * mutex, then re-acquire the ticket lock.
 *
 * cond - condition variable to wait on.
 * lock - lock held by the caller on entry and again on return.
 * ts   - absolute timeout, or null for an untimed wait.
 *
 * Returns the result of the pthread wait call.
 */
static int cond_timedwait(liblock_cond_t* cond,
                          liblock_lock_t* lock,
                          const struct timespec* ts)
{
    struct liblock_impl *const state = lock->impl;
    pthread_mutex_t *const mtx = &state->posix_lock;
    int status;

    pthread_mutex_lock(mtx);
    unlock_extra_hticket(state);

    status = (ts != 0)
        ? pthread_cond_timedwait(&cond->impl.posix_cond, mtx, ts)
        : pthread_cond_wait(&cond->impl.posix_cond, mtx);

    pthread_mutex_unlock(mtx);
    lock_extra_hticket(state);

    return status;
}

/*
 * Timed wait on `cond`; forwards all three arguments to cond_timedwait()
 * and returns its result.
 */
static int do_liblock_cond_timedwait(extra_hticket)(liblock_cond_t* cond,
                                          liblock_lock_t* lock,
                                          const struct timespec* ts)
{
    const int status = cond_timedwait(cond, lock, ts);

    return status;
}

/*
 * Untimed wait on `cond`; calls cond_timedwait() with a null timeout and
 * returns its result.
 */
static int do_liblock_cond_wait(extra_hticket)(liblock_cond_t* cond,
                                     liblock_lock_t* lock)
{
    const int status = cond_timedwait(cond, lock, 0);

    return status;
}

/*
 * Signals the POSIX condition variable embedded in `cond` and returns the
 * result of pthread_cond_signal().
 */
static int do_liblock_cond_signal(extra_hticket)(liblock_cond_t* cond)
{
    const int status = pthread_cond_signal(&cond->impl.posix_cond);

    return status;
}

/*
 * Broadcasts on the POSIX condition variable embedded in `cond` and returns
 * the result of pthread_cond_broadcast().
 */
static int do_liblock_cond_broadcast(extra_hticket)(liblock_cond_t* cond)
{
    const int status = pthread_cond_broadcast(&cond->impl.posix_cond);

    return status;
}

/*
 * Destroys the POSIX condition variable embedded in `cond` and returns the
 * result of pthread_cond_destroy().
 */
static int do_liblock_cond_destroy(extra_hticket)(liblock_cond_t* cond)
{
    const int status = pthread_cond_destroy(&cond->impl.posix_cond);

    return status;
}

/*
 * Per-thread start hook: records the thread's id (desc->id) in
 * htlock_id_mine and the cluster reported by get_cluster() for that id in
 * htlock_node_mine, then issues a memory barrier.
 */
static void do_liblock_on_thread_start(extra_hticket)(struct thread_descriptor* desc)
{
    const int core_id = desc->id;

    htlock_id_mine = core_id;
    htlock_node_mine = get_cluster(core_id);

    MEM_BARRIER;
}

/* Per-thread exit hook. The body is empty; `desc` is not used. */
static void do_liblock_on_thread_exit(extra_hticket)(struct thread_descriptor* desc)
{}

/*
 * Releases `lock` from inside a critical section by calling
 * unlock_extra_hticket() on its private state.
 */
static void do_liblock_unlock_in_cs(extra_hticket)(liblock_lock_t* lock)
{
    struct liblock_impl *const state = lock->impl;

    unlock_extra_hticket(state);
}

/*
 * Re-acquires `lock` from inside a critical section by calling
 * lock_extra_hticket() on its private state.
 */
static void do_liblock_relock_in_cs(extra_hticket)(liblock_lock_t* lock)
{
    struct liblock_impl *const state = lock->impl;

    lock_extra_hticket(state);
}

/* Server declaration hook. The body is empty; `core` is not used. */
static void do_liblock_declare_server(extra_hticket)(struct hw_thread* core)
{}

/* Cleanup hook. The body is empty. */
static void do_liblock_cleanup(extra_hticket)(void)
{}

/*
 * Expands the liblock_declare macro for the extra_hticket implementation.
 * NOTE(review): presumably this registers the do_liblock_*(extra_hticket)
 * functions defined above with the liblock core -- confirm in liblock.h.
 */
liblock_declare(extra_hticket);

