#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <float.h>
#include <alloca.h>
#include <math.h>
#include <time.h>
#include <assert.h>

/*
 * Use the real OpenMP runtime header when the compiler defines _OPENMP.
 * Otherwise replace the two runtime queries with constants describing a
 * single thread (one thread in total, current thread id 0), so the rest of
 * the file builds without an OpenMP runtime.
 */
#ifdef _OPENMP
# include <omp.h>
#else
# define omp_get_max_threads()  1
# define omp_get_thread_num()   0
#endif

/*
 * Larger of x and y. This is a function-like macro: the argument that wins
 * the comparison is evaluated twice, so do not pass expressions with side
 * effects.
 */
#define max(x,y)        (((x) > (y)) ? (x) : (y))

/*
 * Compile-time tunables. Each one is only defined here if it has not
 * already been defined (e.g. on the compiler command line).
 */

/* main() keeps increasing the task-queuing depth while the number of leaf
 * tasks per thread is below this value */
#ifndef LEAF_TASKS_PER_THREAD
# define LEAF_TASKS_PER_THREAD 4
#endif

/* order of the square matrix A whose permanent is computed */
#ifndef ARRAY_SIZE
# define ARRAY_SIZE 10
#endif

/* number of times main() repeats the computation and its check */
#ifndef REPETITIONS
# define REPETITIONS 1
#endif

/* The ARRAY_SIZE x ARRAY_SIZE input matrix: filled in by main() and read by
 * permanent() and permanent_taskq(). */
double A[ARRAY_SIZE][ARRAY_SIZE];

/*
 * Returns n! as a double, built up by multiplying 2, 3, ..., n in turn.
 * Any n below 2 (including negative values) yields 1.0.
 */
double factorial( int n ) {
    double product = 1.0;
    int k = 2;

    while (k <= n) {
        product *= (double) k;
        k++;
    }
    return product;
}

/* Forward declaration: permanent() and permanent_taskq() call each other,
 * and permanent_taskq() is defined first. */
void permanent(int n, int x[], int d, int qd, double *perm);

/*
 * Task-queue variant of one expansion level of permanent().
 *
 * Expands the current n x n sub-problem along column n-1: for every
 * remaining row x[xc] it builds the row list with that row removed, asks
 * permanent() for the permanent of the resulting (n-1) x (n-1) sub-problem
 * and accumulates A[ x[xc] ][ n-1 ] times that value.
 *
 *   n    - order of the current sub-problem (number of entries in x)
 *   x    - row indices of A that are still available
 *   d    - current depth; main() starts the recursion at 1
 *   qd   - depth limit, passed through unchanged to permanent()
 *   perm - accumulator; the sum computed here is added to *perm
 *
 * NOTE(review): "#pragma omp taskq" is not a standard OpenMP directive; it
 * looks like the Intel workqueuing extension -- confirm the intended
 * compiler and how it treats these pragmas.
 */
void permanent_taskq(int n, int x[], int d, int qd, double *perm) {
    int i;
    double p = 0.0;

    /* Room for n separate row lists of n-1 entries each, so that every
     * task writes to its own slice, new_x[xc*(n-1) ...]. alloca() memory
     * belongs to this function's frame and disappears when it returns. */
    int *new_x = (int *) alloca (n * (n-1) * sizeof(int));
    #pragma omp taskq firstprivate(new_x)
    {
        int xc;
        for (xc=0; xc<n; xc++) {

            /* One task per row x[xc] of column n-1.
             * NOTE(review): the task body reads xc and updates p without
             * any visible capture clause or synchronization -- confirm
             * that the taskq data-sharing rules make this safe. */
            #pragma omp task private(i)
            {
                double q = 0.0;

                /* this task's slice of new_x becomes x with element xc skipped */
                for (i=0; i<xc; i++)   new_x[xc*(n-1) + i] = x[i  ];
                for (i=xc; i<n-1; i++) new_x[xc*(n-1) + i] = x[i+1];
                /* q starts at 0.0 and receives the permanent of the sub-problem */
                permanent(n-1, &new_x[xc*(n-1)], d+1, qd, &q);
                p += A[ x[xc] ][ n-1 ] * q;
            }
        }
    } /* per the original author: the tasks must be waited for here because
         they still use new_x */

    /* At d == 1 the caller is main()'s parallel region and perm points at
     * its shared result, so the update is guarded by a named critical
     * section. At deeper levels perm points at the q of the task that made
     * the call, so no guard is used. */
    if ( d == 1 ) {
        #pragma omp critical(perm_calc)
        *perm += p;
    } else {
        *perm += p;
    }

}

/*
 * Recursively computes the permanent of the sub-matrix of A made of the
 * rows listed in x[0..n-1] and the columns 0..n-1, expanding along
 * column n-1.
 *
 *   n    - order of the sub-matrix (number of entries in x)
 *   x    - row indices of A taking part in this sub-problem
 *   d    - current recursion depth
 *   qd   - while d <= qd the expansion is delegated to permanent_taskq()
 *   perm - result: overwritten when n == 1, otherwise the computed value is
 *          ADDED to *perm, so callers pass in a zero-initialized variable
 */
void permanent(int n, int x[], int d, int qd, double *perm) {
    int *minor_rows;
    double sum;
    int skip;

    /* base case: a 1x1 matrix is its own permanent */
    if (n == 1) {
        *perm = A[x[0]][0];
        return;
    }

    /* shallow levels are handled by the task-queue variant */
    if (d <= qd) {
        permanent_taskq(n, x, d, qd, perm);
        return;
    }

    /* serial expansion: one (n-1)-entry row list per skipped row */
    minor_rows = (int *) alloca (n * (n-1) * sizeof(int));
    sum = 0.0;

    for (skip = 0; skip < n; skip++) {
        int *sub = &minor_rows[skip*(n-1)];
        double minor = 0.0;
        int k;

        /* sub is x with element `skip` left out */
        for (k = 0; k < n-1; k++) {
            sub[k] = (k < skip) ? x[k] : x[k+1];
        }

        permanent(n-1, sub, d+1, qd, &minor);
        sum += A[ x[skip] ][ n-1 ] * minor;
    }

    *perm += sum;
}

/*
 * Test driver.
 *
 * Fills A with 1.0 everywhere except a zero diagonal, computes its
 * permanent REPETITIONS times inside an OpenMP parallel region and checks
 * that ARRAY_SIZE! / permanent approximates e.
 *
 * The permanent of this matrix is the number of derangements D(n) of n
 * items, and n!/D(n) = 1/S_n with S_n = sum_{k=0..n} (-1)^k / k!.  Since
 * |S_n - 1/e| < 1/(n+1)! and S_n >= 1/3 for n >= 2, the approximation
 * error satisfies |n!/D(n) - e| < 3e/(n+1)!.  ARRAY_SIZE must be at least 2
 * (for 1 the permanent is 0 and the quotient is not finite).
 *
 * Returns 0 on success; a failed check aborts through assert().
 */
int main(void) {
    int i, j, qdepth, numthreads, rep;
    double per, leaftasks, tolerance;
    double calc_nat_e = 0.0;            /* reported as 0 if REPETITIONS < 1 */
    const double actual_e = exp(1.0);
    int x[ARRAY_SIZE];

    leaftasks = 1.0;
    numthreads = omp_get_max_threads();

    /* Pick the queuing depth: each extra level multiplies the number of
     * leaf tasks by the number of rows left (ARRAY_SIZE, ARRAY_SIZE-1, ...).
     * Stop once there are at least LEAF_TASKS_PER_THREAD leaf tasks per
     * thread or the matrix is exhausted. */
    for (i = ARRAY_SIZE, qdepth = 0;
         i > 0 && leaftasks / ((double) numthreads) < LEAF_TASKS_PER_THREAD;
         i--, qdepth++) {
        leaftasks *= (double) i;
    }

    printf("qdepth = %d, leaf tasks = %.0f, threads = %d, LTPT = %.2f\n",
           qdepth, leaftasks, numthreads, leaftasks / ((double)numthreads) );

    /* the top-level problem uses every row of A, in order */
    for (i = 0; i < ARRAY_SIZE; i++) {
        x[i] = i;
    }

    /* set up matrix with 1's everywhere but the diagonal */
    for (i = 0; i < ARRAY_SIZE; i++) {
        for (j = 0; j < ARRAY_SIZE; j++) {
            A[i][j] = (i == j) ? 0.0 : 1.0;
        }
    }

    /* Truncation bound derived above plus a little slack for rounding in
     * the division and in exp().  The previous bound of 0.1^(ARRAY_SIZE-3)
     * is smaller than the true error (about 1.7e-7 for ARRAY_SIZE 10); it
     * only ever passed because the integer abs() truncated the difference
     * to 0. */
    tolerance = 3.0 * actual_e / factorial(ARRAY_SIZE + 1) + 8.0 * DBL_EPSILON;

    for (rep = 1; rep <= REPETITIONS; rep++) {

        printf("\nRepetition: %d\n", rep);

        per = 0.0;
        #pragma omp parallel default(none) shared(x, qdepth, per)
        {
            permanent(ARRAY_SIZE, x, 1, qdepth, &per);
        }
        calc_nat_e = factorial(ARRAY_SIZE) / per;

        /* make sure result is close enough to actual value; fabs() keeps
         * the comparison in floating point */
        assert(fabs(calc_nat_e - actual_e) <= tolerance);
    }

    printf("approximation for natural log base: %20.16e\n", calc_nat_e);
    printf("actual value for natural log base:  %20.16e\n", actual_e);

    printf("Run successful!\n");
    return 0;
}
