/* ########################################################################## */
/* (C) UPMC, 2010-2011                                                        */
/*     Authors:                                                               */
/*       Jean-Pierre Lozi <jean-pierre.lozi@lip6.fr>                          */
/*       Gaël Thomas <gael.thomas@lip6.fr>                                    */
/*       Florian David <florian.david@lip6.fr>                                */
/*       Julia Lawall <julia.lawall@lip6.fr>                                  */
/*       Gilles Muller <gilles.muller@lip6.fr>                                */
/* -------------------------------------------------------------------------- */
/* ########################################################################## */

/*
 * =============================================================================
 * Code based on Tudor David's libslock library.
 * =============================================================================
 */

#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>

#include "liblock.h"
#include "liblock-fatal.h"

#include "extra-libslock-utils.h"
#include "extra-libslock-platform-defs.h"
#include "extra-libslock-atomic-ops.h"

/*
 * Marker written to init_done[socket] once that socket's local queue has been
 * allocated; threads starting on the same socket spin until they read it.
 */
#define INIT_VAL 123


/*
 * State bytes of one queue node. In qnode they share storage with the 32-bit
 * `data` word, so all three can be read or replaced with one word access.
 */
typedef struct node_fields {
    /* Set to 1 when a node is created or recycled; cleared by its owner in
     * unlock_extra_hclh(). Waiters spin on their predecessor's copy. */
    volatile uint8_t successor_must_wait;
    /* Set to 1 by lock_extra_hclh() on the local tail it moved into the
     * shared queue. */
    volatile uint8_t tail_when_spliced;
    /* Socket index of the owning thread (phys_core / CORES_PER_SOCKET). */
    volatile uint8_t cluster_id;
} node_fields;

/*
 * A queue node. `data` and `fields` alias the same bytes: `fields` gives
 * per-flag access, `data` lets the whole state be snapshotted or swapped with
 * CAS_U32 in one operation. When ADD_PADDING is defined the union is widened
 * to CACHE_LINE_SIZE bytes.
 */
typedef struct qnode {
    union {
        volatile uint32_t data;
        node_fields fields;
#ifdef ADD_PADDING
        volatile uint8_t padding[CACHE_LINE_SIZE];
#endif
    };
} qnode;

/* Pointer to a (volatile) queue node. */
typedef volatile qnode *qnode_ptr;
/* Tail pointer of a per-socket queue. */
typedef qnode_ptr local_queue;
/* Tail pointer of the queue shared by all sockets. */
typedef qnode_ptr global_queue;

/*
 * Per-lock private data handed back to liblock by the init_lock hook.
 */
struct liblock_impl {
    /* Index of this lock in global_params[] and local_params[]. */
    unsigned int                       lock_id;
    /* Mutex used only by cond_timedwait() to pair with the POSIX condition
     * variable; the critical section itself is protected by the HCLH queues. */
    pthread_mutex_t                    posix_lock;
    /* Padding sized by pad_to_cache_line() from the two members above
     * (presumably rounds the struct up to a cache line; see its definition). */
    char __pad[pad_to_cache_line(sizeof(unsigned int) +
                                 sizeof(pthread_mutex_t))];
};

typedef struct hclh_global_params {
    global_queue                      *shared_queue;
    local_queue                      **local_queues;
    volatile uint32_t                 *init_done;
    volatile uint8_t padding[CACHE_LINE_SIZE-20];
} hclh_global_params;

/*
 * Per-thread state of one lock; each thread owns an array of MAX_LOCKS of
 * these, indexed by liblock_impl.lock_id.
 */
typedef struct hclh_local_params {
    /* Node this thread enqueues on its next acquisition. */
    qnode                             *my_qnode;
    /* Predecessor recorded when the lock was acquired; it becomes my_qnode
     * after unlock_extra_hclh(). */
    qnode                             *my_pred;
    /* Local queue of this thread's socket. */
    local_queue                       *my_queue;
} hclh_local_params;

/* Shared per-lock state, allocated once by the init_library hook. */
static  hclh_global_params *global_params;
/* This thread's per-lock state, allocated by the on_thread_start hook. */
static __thread hclh_local_params *local_params;
/* This thread's socket index (phys_core / CORES_PER_SOCKET). */
static __thread uint32_t hclh_node_mine;

/* Next lock_id to hand out; only ever incremented (see the init_lock hook). */
static volatile unsigned int cur_lock_number = 0;


/*
 * Spin on predecessor node `q` until its state decides this thread's role.
 *
 * Returns 1 when `q` belongs to `my_cluster`, has not been spliced, and no
 * longer asks its successor to wait (the caller then owns the lock).
 * Returns 0 when `q` belongs to another cluster or is marked
 * tail_when_spliced (the caller must go through the shared queue).
 */
static uint16_t wait_for_grant_or_cluster_master(volatile qnode *q,
                                                 uint8_t my_cluster)
{
    qnode snap;

    for (;;)
    {
        /* A single 32-bit load captures all three flags together. */
        snap.data = q->data;

        if (snap.fields.cluster_id != my_cluster)
            return 0;
        if (snap.fields.tail_when_spliced == 1)
            return 0;
        if (snap.fields.tail_when_spliced == 0 &&
                snap.fields.successor_must_wait == 0)
            return 1;

        PAUSE;
    }
}

/*
 * Acquire the HCLH lock described by `impl` for the calling thread.
 *
 * Step 1: append this thread's node to its socket's local queue.
 * Step 2: if there was a predecessor, spin on it; when the predecessor is in
 *         the same cluster and releases, the lock is owned and we return.
 * Step 3: otherwise (no predecessor, or the predecessor was spliced or is
 *         from another cluster) move the current local tail into the shared
 *         queue, mark it as spliced, and wait for the shared-queue
 *         predecessor to release.
 *
 * On return local_params[lock_id].my_pred holds the predecessor node, which
 * unlock_extra_hclh() later recycles.
 * CAS_PTR is used as returning the value found at the target before the swap.
 */
static void lock_extra_hclh(struct liblock_impl *impl)
{
    local_queue *lq = local_params[impl->lock_id].my_queue;
    global_queue *gq = global_params[impl->lock_id].shared_queue;
    qnode *my_qnode = local_params[impl->lock_id].my_qnode;

    volatile qnode *my_pred;

    /* Swap my_qnode in as the local tail; retry until the tail we read is
     * still the tail at the time of the CAS. */
    do
    {
#if defined(OPTERON_OPTIMIZE)
        PREFETCHW(lq);
#endif  /* OPTERON_OPTIMIZE */
        my_pred = *lq;
#ifdef __sparc__
    } while (CAS_PTR((void *)lq, (void *)my_pred, (void *)my_qnode) != my_pred);
#else
    } while (CAS_PTR(lq, my_pred, my_qnode) != my_pred);
#endif

    if (my_pred != NULL)
    {
        /* Non-zero means the local predecessor handed the lock to us. */
        uint16_t i_own_lock =
            wait_for_grant_or_cluster_master(my_pred,
                                             my_qnode->fields.cluster_id);
        if (i_own_lock)
        {
//printf("I OWN LOCK! %d %d\n", /*sched_getcpu()*/getcpuid(), my_qnode->fields.cluster_id);
            local_params[impl->lock_id].my_pred = (qnode *)my_pred;
            return;
        }
    }
    PAUSE;  PAUSE;

    volatile qnode *local_tail;

    /* Install the current local tail as the new shared-queue tail; my_pred
     * becomes the previous shared tail. */
    do
    {
#if defined(OPTERON_OPTIMIZE)
        PREFETCHW(gq);
        PREFETCHW(lq);
#endif  /* OPTERON_OPTIMIZE */
        my_pred = *gq;
        local_tail = *lq;
        PAUSE;
#ifdef __sparc__
    } while(CAS_PTR((volatile void *)gq, (void *)my_pred, (void *)local_tail) != my_pred);
#else
    } while(CAS_PTR(gq, my_pred, local_tail) != my_pred);
#endif

    /* Tell whoever spins on the spliced tail that it must use the shared
     * queue (wait_for_grant_or_cluster_master() returns 0 on this flag). */
    local_tail->fields.tail_when_spliced = 1;
#if defined(OPTERON_OPTIMIZE)
    PREFETCHW(my_pred);
#endif  /* OPTERON_OPTIMIZE */

    /* Wait for the shared-queue predecessor to release the lock. */
    while (my_pred->fields.successor_must_wait) {
        PAUSE;
#if defined(OPTERON_OPTIMIZE)
        pause_rep(23);
        PREFETCHW(my_pred);
#endif  /* OPTERON_OPTIMIZE */
    }

    local_params[impl->lock_id].my_pred = (qnode *)my_pred;
    return;
}

/*
 * Release the HCLH lock described by `impl`.
 *
 * Clears successor_must_wait on the node this thread enqueued, then recycles
 * the predecessor node recorded by lock_extra_hclh(): its state word is reset
 * to (this thread's cluster, successor_must_wait = 1, tail_when_spliced = 0)
 * and it becomes this thread's node for the next acquisition.
 */
static void unlock_extra_hclh(struct liblock_impl *impl)
{
    qnode *my_qnode = local_params[impl->lock_id].my_qnode;
    qnode *my_pred = local_params[impl->lock_id].my_pred;

    /* Release: the successor spinning on our node may now proceed. */
    my_qnode->fields.successor_must_wait = 0;
    qnode* pr = my_pred;
    /* Build the fresh state word locally so it can be installed in one CAS. */
    qnode new_node;
    new_node.data = 0;
    new_node.fields.cluster_id = hclh_node_mine;
    new_node.fields.successor_must_wait = 1;
    new_node.fields.tail_when_spliced = 0;

#if defined(OPTERON_OPTIMIZE)
    PREFETCHW(pr);
#endif  /* OPTERON_OPTIMIZE */
    uint32_t old_data = pr->data;

    /* Retry until the whole word is replaced from the value we last read. */
    while (CAS_U32(&pr->data,old_data,new_node.data) != old_data)
    {
        old_data = pr->data;
        PAUSE;
#if defined(OPTERON_OPTIMIZE)
        PREFETCHW(pr);
#endif  /* OPTERON_OPTIMIZE */
    }

    /* The recycled predecessor is the node we enqueue next time. */
    my_qnode = pr;
    local_params[impl->lock_id].my_qnode = my_qnode;
}

/*
 * Create the private data of a new extra_hclh lock.
 *
 * Assigns the next free lock_id and initialises the POSIX mutex used by the
 * condition-variable helpers. `lock`, `core` and `attr` are not used by this
 * implementation (the mutex is created with default attributes).
 *
 * Returns the newly allocated liblock_impl. Calls fatal() when more than
 * MAX_LOCKS locks have been created, because lock_id indexes the MAX_LOCKS
 * sized global_params[] and local_params[] arrays, or when the mutex cannot
 * be initialised. Ids are never recycled by the destroy hook.
 */
static struct liblock_impl *do_liblock_init_lock(extra_hclh)
                               (liblock_lock_t *lock,
                                struct hw_thread *core,
                                pthread_mutexattr_t *attr)
{
    struct liblock_impl *impl =
        liblock_allocate(sizeof(struct liblock_impl));

    impl->lock_id = __sync_fetch_and_add(&cur_lock_number, 1);
    /* An id past the end would index outside the per-lock tables. */
    if (impl->lock_id >= MAX_LOCKS)
        fatal("too many extra_hclh locks (MAX_LOCKS exceeded)");

    if (pthread_mutex_init(&impl->posix_lock, NULL) != 0)
        fatal("pthread_mutex_init failed");

    return impl;
}

/*
 * Destroy the private data of an extra_hclh lock.
 *
 * Releases the POSIX mutex created by the init_lock hook before freeing the
 * structure that contains it. A NULL lock->impl is tolerated.
 * Always returns 0.
 */
static int do_liblock_destroy_lock(extra_hclh)(liblock_lock_t *lock)
{
    struct liblock_impl *impl = lock->impl;

    if (impl != NULL)
    {
        /* The mutex lives inside impl, so it must be destroyed first. */
        (void)pthread_mutex_destroy(&impl->posix_lock);
        free(impl);
    }
    return 0;
}

/*
 * Run `pending(val)` as a critical section of `lock`: take the HCLH lock,
 * call the function, release the lock, and hand back the function's result.
 */
static void* do_liblock_execute_operation(extra_hclh)(liblock_lock_t *lock,
                                                    void* (*pending)(void*),
                                                    void *val)
{
    struct liblock_impl *const hclh = lock->impl;

    lock_extra_hclh(hclh);
    void *const result = pending(val);
    unlock_extra_hclh(hclh);

    return result;
}

/*
 * One-time library setup: allocate the shared state of all MAX_LOCKS locks.
 *
 * For every lock this allocates the per-socket local_queues and init_done
 * arrays and the shared queue, whose initial tail is a dummy node carrying a
 * cluster id (NUMBER_OF_SOCKETS + 1) that no thread's socket index is
 * expected to use. Does nothing if global_params is already set.
 */
static void do_liblock_init_library(extra_hclh)()
{
    uint32_t i;
    uint32_t s;

    if (global_params) return;

    global_params =
        (hclh_global_params *)liblock_allocate(MAX_LOCKS *
                                               sizeof(hclh_global_params));

    for (i = 0 ; i < MAX_LOCKS ; i++)
    {
        global_params[i].local_queues =
            (local_queue **)liblock_allocate(NUMBER_OF_SOCKETS *
                                             sizeof(local_queue*));
        global_params[i].init_done =
            (uint32_t *)liblock_allocate(NUMBER_OF_SOCKETS * sizeof(uint32_t));

        /*
         * Threads poll init_done[socket] for INIT_VAL in the on_thread_start
         * hook, so the flags must start from a known value; nothing visible
         * here guarantees that liblock_allocate() returns zeroed memory.
         */
        for (s = 0; s < NUMBER_OF_SOCKETS; s++)
        {
            global_params[i].local_queues[s] = NULL;
            global_params[i].init_done[s] = 0;
        }

        global_params[i].shared_queue =
            (global_queue *)liblock_allocate(sizeof(global_queue));
        qnode * a_node = (qnode *) liblock_allocate(sizeof(qnode));
        a_node->data = 0;
        a_node->fields.cluster_id = NUMBER_OF_SOCKETS + 1;
        *(global_params[i].shared_queue) = a_node;
    }

    MEM_BARRIER;
}

/*
 * Library teardown: release the arrays allocated by the init_library hook.
 *
 * Safe to call when the library was never initialised. global_params is
 * reset to NULL afterwards so a later init_library call allocates fresh
 * state instead of returning early on freed memory.
 *
 * NOTE(review): the queue nodes and the per-socket queue cells allocated in
 * on_thread_start are not freed here; nodes are recycled between threads, so
 * their ownership at teardown is not evident from this file.
 */
static void do_liblock_kill_library(extra_hclh)()
{
    uint32_t i;

    if (global_params == NULL)
        return;

    for (i = 0; i < MAX_LOCKS; i++)
    {
        free(global_params[i].shared_queue);
        free(global_params[i].local_queues);
        /* Cast drops the volatile qualifier, which free() does not accept. */
        free((void *)global_params[i].init_done);
    }

    free(global_params);
    global_params = NULL;
}

/*
 * Start hook: atomically flips liblock_start_server_threads_by_hand from 1 to
 * 0, reporting a fatal error if it was not 1, then invokes `callback` when
 * one is supplied.
 */
static void do_liblock_run(extra_hclh)(void (*callback)())
{
    const int was_by_hand =
        (__sync_val_compare_and_swap(&liblock_start_server_threads_by_hand,
                                     1, 0) == 1);

    if (!was_by_hand)
        fatal("servers are not managed by hand");

    if (callback != NULL)
        callback();
}

/*
 * Initialise the POSIX condition variable inside `cond`, using cond->attr
 * when cond->has_attr is set and default attributes otherwise.
 * Returns the pthread_cond_init() result.
 */
static int do_liblock_cond_init(extra_hclh)(liblock_cond_t* cond)
{
    if (cond->has_attr)
        return pthread_cond_init(&cond->impl.posix_cond, &cond->attr);

    return pthread_cond_init(&cond->impl.posix_cond, NULL);
}

/*
 * Wait on `cond` while temporarily giving up the HCLH lock of `lock`.
 *
 * The per-lock POSIX mutex is taken before the HCLH lock is released and is
 * the mutex handed to the pthread wait call. With a non-NULL `ts` the wait is
 * bounded by that absolute timeout; otherwise it is unbounded. The HCLH lock
 * is re-acquired before returning.
 *
 * Returns the result of the pthread wait call.
 *
 * NOTE(review): the signal/broadcast hooks in this file do not take
 * posix_lock, so a wake-up issued between the HCLH release and the wait
 * call is not excluded by that mutex.
 */
static int cond_timedwait(liblock_cond_t* cond,
                          liblock_lock_t* lock,
                          const struct timespec* ts)
{
    struct liblock_impl *owner = lock->impl;
    int rc;

    pthread_mutex_lock(&owner->posix_lock);
    unlock_extra_hclh(owner);

    rc = ts
        ? pthread_cond_timedwait(&cond->impl.posix_cond,
                                 &owner->posix_lock, ts)
        : pthread_cond_wait(&cond->impl.posix_cond, &owner->posix_lock);

    pthread_mutex_unlock(&owner->posix_lock);
    lock_extra_hclh(owner);

    return rc;
}

/* Timed wait hook: forwards to cond_timedwait() with the caller's timeout. */
static int do_liblock_cond_timedwait(extra_hclh)(liblock_cond_t* cond,
                                          liblock_lock_t* lock,
                                          const struct timespec* ts)
{
    const int status = cond_timedwait(cond, lock, ts);

    return status;
}

/* Untimed wait hook: forwards to cond_timedwait() with no timeout. */
static int do_liblock_cond_wait(extra_hclh)(liblock_cond_t* cond,
                                     liblock_lock_t* lock)
{
    const struct timespec *no_timeout = NULL;

    return cond_timedwait(cond, lock, no_timeout);
}

/* Signal hook: calls pthread_cond_signal() on cond's POSIX condition
 * variable and returns its result. */
static int do_liblock_cond_signal(extra_hclh)(liblock_cond_t* cond)
{
    const int status = pthread_cond_signal(&cond->impl.posix_cond);

    return status;
}

/* Broadcast hook: calls pthread_cond_broadcast() on cond's POSIX condition
 * variable and returns its result. */
static int do_liblock_cond_broadcast(extra_hclh)(liblock_cond_t* cond)
{
    const int status = pthread_cond_broadcast(&cond->impl.posix_cond);

    return status;
}

/* Destroy hook: calls pthread_cond_destroy() on cond's POSIX condition
 * variable and returns its result. */
static int do_liblock_cond_destroy(extra_hclh)(liblock_cond_t* cond)
{
    const int status = pthread_cond_destroy(&cond->impl.posix_cond);

    return status;
}

static void do_liblock_on_thread_start(extra_hclh)(struct thread_descriptor* desc)
{
    local_params =
        (hclh_local_params *)liblock_allocate(MAX_LOCKS *
                                              sizeof(hclh_local_params));
    uint32_t i;
    int phys_core = desc->id;

    hclh_node_mine = phys_core / CORES_PER_SOCKET;

    for (i = 0; i < MAX_LOCKS; i++)
    {
        local_params[i].my_qnode = (qnode *)liblock_allocate(sizeof(qnode));
        local_params[i].my_qnode->data = 0;
        local_params[i].my_qnode->fields.cluster_id =
            phys_core / CORES_PER_SOCKET;
        local_params[i].my_qnode->fields.successor_must_wait = 1;
        local_params[i].my_pred = NULL;

        if (phys_core % CORES_PER_SOCKET==0)
        {
            global_params[i].local_queues[phys_core / CORES_PER_SOCKET] =
                (local_queue *)liblock_allocate(sizeof(local_queue));
            *(global_params[i].local_queues[phys_core / CORES_PER_SOCKET]) = NULL;
            global_params[i].init_done[phys_core / CORES_PER_SOCKET] = INIT_VAL;
        }
        while (global_params[i].init_done[phys_core / CORES_PER_SOCKET] != INIT_VAL) {}
        local_params[i].my_queue =
            global_params[i].local_queues[phys_core / CORES_PER_SOCKET];
    }

    MEM_BARRIER;
}

/*
 * Per-thread teardown hook. Intentionally does nothing: the per-thread state
 * allocated in the on_thread_start hook is left allocated.
 */
static void do_liblock_on_thread_exit(extra_hclh)(struct thread_descriptor* desc)
{
    // Local data is deliberately not freed, to avoid potential overhead in
    // benchmarks. The disabled cleanup is kept below for reference.
/*
    uint32_t i;

    for (i = 0; i < MAX_LOCKS; i++) {
        free(local_params[i].my_qnode);
    }
    free(local_params);
*/
}

/* Release the HCLH lock of `lock` from inside a critical section. */
static void do_liblock_unlock_in_cs(extra_hclh)(liblock_lock_t* lock)
{
    struct liblock_impl *hclh = lock->impl;

    unlock_extra_hclh(hclh);
}

/* Re-acquire the HCLH lock of `lock` from inside a critical section. */
static void do_liblock_relock_in_cs(extra_hclh)(liblock_lock_t* lock)
{
    struct liblock_impl *hclh = lock->impl;

    lock_extra_hclh(hclh);
}

/* Server-declaration hook: intentionally empty, `core` is ignored. */
static void do_liblock_declare_server(extra_hclh)(struct hw_thread* core)
{}

/* Cleanup hook: intentionally empty for this implementation. */
static void do_liblock_cleanup(extra_hclh)(void)
{}

/* Declares the "extra_hclh" lock type to liblock (presumably binding the
 * do_liblock_*(extra_hclh) hooks defined above; confirm in liblock.h). */
liblock_declare(extra_hclh);

