/*
 * Copyright (c) 2015
 *	The President and Fellows of Harvard College.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bigfork - concurrent VM test that behaves somewhat better than
 * parallelvm.
 *
 * This test is a mixture of forktest and parallelvm: it does nested
 * forks like forktest, and aimless matrix operations like parallelvm;
 * the goal is to serve as a performance benchmark more than as a
 * stress test (though it can be that too) and in particular to
 * exhibit less timing variance than parallelvm does. The variance is
 * still fairly high, but the variance of parallelvm is horrific.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <err.h>

/*
 * Number of fork stages in dotest(). Every process forks once per
 * stage, so (when no fork fails) the number of processes doubles at
 * each stage.
 */
#define BRANCHES 6

/*
 * 6 branches gives 64 procs at the final stage, and we want this to
 * use about 4M. So each proc's memory load should be about 1/16 of a
 * megabyte, or 64K. That is 16384 ints (at 4 bytes each), or four
 * 64x64 matrices.
 */
#define DIM 64

/*
 * The four DIM x DIM working matrices, stored row-major (element
 * (i, j) lives at index i * DIM + j). m1 is filled by init() and is
 * replaced with the result of every grind(); m2, m3 and m4 hold the
 * intermediate values of grind().
 */
static int m1[DIM * DIM], m2[DIM * DIM], m3[DIM * DIM], m4[DIM * DIM];

/*
 * Expected value of trace() after each stage, indexed by stage number;
 * dotest() compares against these.
 */
static const int right[BRANCHES] = {
    536763422, 478946723, 375722852, 369910585, 328220902, 62977821,
};

/*
 * Failure count for this process: bumped locally by dotest() and
 * dowait(), and increased by the exit status of each child that is
 * waited for. A child passes its own count up via exit().
 */
static unsigned failures;

/*
 * Set up the starting matrix.
 *
 * The PRNG is seeded with a fixed constant so every run begins with
 * the same m1. Each entry of m1 is random() % 11 - 5; entries are
 * filled in row-major order. m2, m3 and m4 are not touched here.
 */
static void init(void) {
  unsigned idx;

  srandom(73771);

  /* One pass over the flat array visits elements in row-major order. */
  for (idx = 0; idx < DIM * DIM; idx++) {
    m1[idx] = random() % 11 - 5;
  }
}

/*
 * Elementwise matrix sum: x = a + b.
 *
 * x, a and b each point to DIM * DIM ints in row-major order.
 *
 * The addition is carried out in unsigned arithmetic and converted
 * back to int. The operands come out of mul(), whose results can be
 * anywhere in the range of int, so a plain signed addition could
 * overflow, which is undefined behavior. Unsigned arithmetic wraps
 * modulo 2^N by definition; the conversion back to int is
 * implementation-defined rather than undefined, and yields the usual
 * two's-complement wraparound on mainstream compilers.
 */
static void add(int *x, const int *a, const int *b) {
  unsigned i, j;

  for (i = 0; i < DIM; i++) {
    for (j = 0; j < DIM; j++) {
      x[i * DIM + j] =
          (int)((unsigned)a[i * DIM + j] + (unsigned)b[i * DIM + j]);
    }
  }
}

/*
 * Matrix product: x = a * b.
 *
 * x, a and b each point to DIM * DIM ints in row-major order.
 *
 * The multiply-accumulate is carried out in unsigned arithmetic and
 * converted back to int. The products overflow int once the matrix
 * entries have grown, and signed overflow is undefined behavior.
 * Unsigned arithmetic wraps modulo 2^N by definition; the conversion
 * back to int is implementation-defined rather than undefined, and
 * yields the usual two's-complement wraparound on mainstream
 * compilers.
 *
 * The running sum is deliberately kept in x[] (one store per inner
 * iteration) rather than in a local, so the memory traffic of the
 * loop stays what it has always been.
 */
static void mul(int *x, const int *a, const int *b) {
  unsigned i, j, k;

  for (i = 0; i < DIM; i++) {
    for (j = 0; j < DIM; j++) {
      x[i * DIM + j] = 0;
      for (k = 0; k < DIM; k++) {
        x[i * DIM + j] =
            (int)((unsigned)x[i * DIM + j] +
                  (unsigned)a[i * DIM + k] * (unsigned)b[k * DIM + j]);
      }
    }
  }
}

/*
 * Elementwise division by a scalar: x = a / b.
 *
 * x and a each point to DIM * DIM ints. The division is ordinary C
 * integer division. b must not be zero.
 */
static void scale(int *x, const int *a, int b) {
  unsigned idx;

  /* Rows are contiguous, so one flat pass covers the whole matrix. */
  for (idx = 0; idx < DIM * DIM; idx++) {
    x[idx] = a[idx] / b;
  }
}

/*
 * Do one round of matrix busywork on the global matrices.
 *
 * m1 is both the input and the output: its new value depends only on
 * its old value, so successive calls form a deterministic sequence.
 * m2, m3 and m4 are overwritten with intermediate results.
 */
static void grind(void) {
  /*
   * compute: m2 = m1*m1, m3 = m2+m1, m4 = m3*m3, m1 = m4 / 2
   *
   * Each step consumes the result of the one before it, so the order
   * of the calls matters.
   */
  mul(m2, m1, m1);
  add(m3, m2, m1);
  mul(m4, m3, m3);
  scale(m1, m4, 2);
}

/*
 * Checksum of the current state: the trace of m1 (the sum of its
 * diagonal entries), reduced into the range 0 .. 0x1fffffff.
 *
 * The diagonal is summed in unsigned arithmetic because the entries
 * can be large enough for a signed sum to overflow, which is
 * undefined behavior; the unsigned sum wraps modulo 2^N instead.
 *
 * The reduction is a mask of the low 29 bits. Because 0x20000000
 * divides 2^N, this gives the same value as taking the wrapped sum as
 * a signed int, adding 0x20000000 until it is non-negative, and then
 * taking it modulo 0x20000000.
 */
static int trace(void) {
  unsigned i;
  unsigned sum = 0;

  for (i = 0; i < DIM; i++) {
    sum += (unsigned)m1[i * DIM + i];
  }
  return (int)(sum & 0x1fffffffU);
}

/*
 * fork() wrapper that reports failure but does not abort.
 *
 * Returns whatever fork() returned: 0 in the child, the child's pid
 * in the parent, or a negative value on failure (after printing a
 * warning). Callers are expected to cope with the failure value.
 */
static pid_t dofork(void) {
  pid_t child = fork();

  if (child >= 0) {
    return child;
  }

  warn("fork");
  return child;
}

/*
 * Settle one slot of the pid table built by dotest().
 *
 * pid is the value dofork() returned for that slot:
 *
 *   -1   The fork failed. Count one failure; nothing to wait for.
 *
 *    0   This process is the child created by that fork. It is done,
 *        so it exits, passing its failure count to its parent as the
 *        exit status. Does not return.
 *
 *   >0   A child of this process. Wait for it and add its reported
 *        failure count to ours.
 *
 * A child that cannot be waited for, or that was killed by a signal,
 * never got to report its own count. Each such case is counted as one
 * failure here (in addition to the warning), so that the final
 * summary cannot claim success after a child was lost.
 */
static void dowait(pid_t pid) {
  int status;

  if (pid == -1) {
    failures++;
    return;
  }
  if (pid == 0) {
    exit(failures);
  }

  if (waitpid(pid, &status, 0) < 0) {
    warn("waitpid(%d)", pid);
    failures++;
  } else if (WIFSIGNALED(status)) {
    warnx("pid %d: signal %d", pid, WTERMSIG(status));
    failures++;
  } else if (WEXITSTATUS(status) > 0) {
    failures += WEXITSTATUS(status);
  }
}

/*
 * Run the test: BRANCHES stages of fork-then-compute.
 *
 * At each stage the process forks, and then parent and child alike
 * run one grind() and compare trace() against the expected value for
 * that stage. Since every process starts from a copy of the same m1
 * and does the same arithmetic, they should all get the same answer.
 *
 * "me" identifies the process in the output: bit i is set if this
 * process, or the ancestor it was copied from, was the child of the
 * stage-i fork.
 *
 * Afterwards the pid slots are settled in reverse order with
 * dowait(). A forked child exits inside dowait() when it reaches the
 * slot holding 0, which is the highest-numbered slot it did not fill
 * in itself; only the original process has no such slot, so only it
 * gets as far as printing the summary.
 */
static void dotest(void) {
  unsigned i, me;
  pid_t pids[BRANCHES];
  int t;
  char msg[128];

  me = 0;
  for (i = 0; i < BRANCHES; i++) {
    pids[i] = dofork();
    if (pids[i] == 0) {
      me += 1U << i;
    }
    grind();
    t = trace();
    if (t == right[i]) {
      snprintf(msg, sizeof(msg), "Stage %u #%u done: %d\n", i, me, t);
    } else {
      snprintf(msg, sizeof(msg), "Stage %u #%u FAILED: got %d, expected %d\n",
               i, me, t, right[i]);
      failures++;
    }
    /*
     * Each message is formatted into a local buffer and sent with a
     * single write() rather than printf(); presumably so that lines
     * from concurrent processes are less likely to interleave and so
     * no buffered stdio output is carried across a fork. The result
     * of write() is deliberately ignored.
     */
    (void)write(STDOUT_FILENO, msg, strlen(msg));
  }

  /* Most recently forked child first. */
  for (i = BRANCHES; i-- > 0;) {
    dowait(pids[i]);
  }
  if (failures > 0) {
    printf("%u failures.\n", failures);
  } else {
    printf("Done.\n");
  }
}

/*
 * Entry point: build the starting matrix, then run the fork test.
 *
 * Takes no arguments. The process that returns from dotest() always
 * exits with status 0; failures are reported on standard output by
 * dotest() rather than through this exit status.
 */
int main(void) {
  /* Must come first: dotest() starts computing from m1 immediately. */
  init();
  dotest();
  return 0;
}