#include "temu-c/Memory/Memory.h"
#include "temu-c/Models/Device.h"
#include "temu-c/Models/Reset.h"
#include "temu-c/Support/Attributes.h"
#include "temu-c/Support/Bitmanip.h"
#include "temu-c/Support/CommandLine.h"
#include "temu-c/Support/Logging.h"
#include "temu-c/Support/Objsys.h"
#include "temu-c/Support/VTables.h"

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>

// Big endian RAM implementation.
//
// This shows how a custom big-endian RAM can be implemented
// (which is the RAM that can be accessed by big endian processors in TEMU).
// 
// For Little Endian Processors, another RAM type taking LE into account
// is needed.
// 
// It does not show how to save the data. Typically, one would do this
// with a custom serialization function in the object interface
// that writes the RAM content to disk.
//
// The serialization function would then write the filename to the
// JSON snapshot using the TEMU API.
//
// TEMU assumes that all transactions are aligned on their natural sizes;
// allocating the buffers with page alignment keeps such transactions
// aligned in host memory as well.
// This assumption is a strict requirement when running TEMU in
// multi-threaded mode.
//
// This model implements both the legacy MemoryInterface,
// and large transaction support.

namespace {

// Instance state of the big-endian RAM model.
struct Ram {
  temu_Object Super; // TEMU object header.
                     // NOTE(review): presumably has to stay the first member
                     // so the object system can treat a Ram* as a
                     // temu_Object* - confirm against the TEMU object docs.

  uint64_t Size; //!< Size of RAM in bytes.
  uint8_t *Data; // Data points to the buffer that backs the RAM.
                 // The buffer is obtained from mmap() and must be page
                 // aligned so that the host processor can emit atomic
                 // reads / writes without locking the bus.
                 // The TEMU CPU models rely on this for correct behavior.
                 // E.g. x86 guarantees correctness of reads and writes
                 // for transactions that fit in a page, or in a cache line.
                 // Other transactions will first of all fail to show the
                 // atomicity behavior expected by e.g. the SPARC targets,
                 // and will also run slowly
                 // (e.g. some x86 implementations would suffer hundreds
                 // of cycles in penalty for a transaction
                 // that is split over two pages).
};

// Different memcpy operations.
// 
// TEMU assumes that RAM (and ROM) is laid out in host endian words.
// 
// In a sim model, we typically want these words in host endianness as well.
// However, there are obviously exceptions.
//
// We have these operations:
//
//    Copy from buffers in simulation endianess (i.e. normal host endianess)
//         to buffers in internal emulator endianess (host endian words)
//
//    Copy from buffers in emulator endianess (i.e. host endian words)
//         to buffers in simulator endianess (i.e. normal host endianess)
//
// Because of this assumption, we need special memcopy functions to support
// handling of different base unit sizes.
// 
// These memcpy functions can copy arrays of uint8_t, uint16_t, uint32_t and
// uint64_t in simulation endianess, to the assumed RAM endianess.
//
// Operations on byte and half word arrays rely on xoring the source
// or destination element indices with 3 or 1 respectively (xoring a
// half word index with 1 is the same as xoring its byte offset with 2).
//
// Words will be treated as a normal memcpy.
// 
// In addition, if a double word is copied, we need to swap the
// two 32 bit sub words around, but not their individual endianness.
//
// Note that this copying assumes that data buffers are properly aligned.
// I.e. an array of 4 byte words must be aligned on 4 byte boundaries.
//
// The Rd functions swap on the source buffer (note that the
// functions take buffers and indices), while the Wr functions swap
// on the destination buffer. In practice this means that the Rd
// functions are used to read from emulated memory that is stored in
// host-endian-ordered words, while the Wr functions are used to write
// to emulated memory.

// Copies `len` bytes out of emulated memory.
//
// Byte `si + n` of the logical (big-endian) byte stream is fetched from
// `src` with its index xored by 3, i.e. from the mirrored position inside
// its 32-bit word, and stored linearly at `dest[di + n]`.
void
memcpy8Rd(uint8_t *dest, const uint8_t *src, uint64_t di, uint64_t si,
          const uint64_t len)
{
  uint8_t *out = dest + di;
  for (uint64_t n = 0; n != len; ++n) {
    out[n] = src[(si + n) ^ 3];
  }
}

// Copies `len` bytes into emulated memory.
//
// Bytes are taken linearly from `src[si + n]` and stored in `dest` at
// index `(di + n) ^ 3`, i.e. at the mirrored position inside the
// destination 32-bit word.
void
memcpy8Wr(uint8_t *dest, const uint8_t *src, uint64_t di, uint64_t si,
          const uint64_t len)
{
  const uint8_t *in = src + si;
  for (uint64_t n = 0; n != len; ++n) {
    dest[(di + n) ^ 3] = in[n];
  }
}

// Copies `len` half words out of emulated memory.
//
// All indices are in half-word units. The source index is xored with 1 so
// the two half words of every 32-bit word are read in swapped order; the
// destination is filled linearly from `dest[di]`.
void
memcpy16Rd(uint16_t *dest, const uint16_t *src, uint64_t di, uint64_t si,
           const uint64_t len)
{
  uint16_t *out = dest + di;
  for (uint64_t n = 0; n != len; ++n) {
    out[n] = src[(si + n) ^ 1];
  }
}

// Copies `len` half words into emulated memory.
//
// All indices are in half-word units. Half words are taken linearly from
// `src[si]` and stored at destination index `(di + n) ^ 1`, which swaps the
// two half words inside every 32-bit destination word.
void
memcpy16Wr(uint16_t *dest, const uint16_t *src, uint64_t di, uint64_t si,
           const uint64_t len)
{
  const uint16_t *in = src + si;
  for (uint64_t n = 0; n != len; ++n) {
    dest[(di + n) ^ 1] = in[n];
  }
}

// Copies `len` 32-bit words out of emulated memory.
//
// Words are already kept in host order, so this is a plain memcpy of
// `len` words from `src[si]` to `dest[di]` (indices in word units).
void
memcpy32Rd(uint32_t *dest, const uint32_t *src, uint64_t di, uint64_t si,
           const uint64_t len)
{
  const uint64_t byteCount = len * sizeof(uint32_t);
  memcpy(dest + di, src + si, byteCount);
}

// Copies `len` 32-bit words into emulated memory.
//
// Words are kept in host order in emulated memory, so no reordering is
// needed: this is a plain memcpy of `len` words from `src[si]` to
// `dest[di]` (indices in word units).
void
memcpy32Wr(uint32_t *dest, const uint32_t *src, uint64_t di, uint64_t si,
           const uint64_t len)
{
  const uint64_t byteCount = len * sizeof(uint32_t);
  memcpy(dest + di, src + si, byteCount);
}

// Copies `len` 64-bit double words out of emulated memory.
//
// Every double word read from `src[si + n]` is passed through
// temu_swapBigHost64Word() before it is stored at `dest[di + n]`
// (indices in double-word units).
void
memcpy64Rd(uint64_t *dest, const uint64_t *src, uint64_t di, uint64_t si,
           const uint64_t len)
{
  uint64_t *out = dest + di;
  const uint64_t *in = src + si;
  for (uint64_t n = 0; n != len; ++n) {
    out[n] = temu_swapBigHost64Word(in[n]);
  }
}

// Copies `len` 64-bit double words into emulated memory.
//
// Every double word taken from `src[si + n]` is passed through
// temu_swapBigHost64Word() before it is stored at `dest[di + n]`
// (indices in double-word units).
void
memcpy64Wr(uint64_t *dest, const uint64_t *src, uint64_t di, uint64_t si,
           const uint64_t len)
{
  uint64_t *out = dest + di;
  const uint64_t *in = src + si;
  for (uint64_t n = 0; n != len; ++n) {
    out[n] = temu_swapBigHost64Word(in[n]);
  }
}

// Size-dispatching read copy (emulated memory -> simulator buffer).
//
// `di`, `si` and `len` are given in bytes; `sz` is the log2 of the unit
// size (0 = bytes, 1 = half words, 2 = words, 3 = double words). The byte
// quantities are converted to unit counts (rounding down) before the
// matching memcpyNNRd helper is invoked. Any other `sz` trips the assert.
void
memcpySzRd(void *dest, const void *src, uint64_t di, uint64_t si, uint64_t len,
           int sz)
{
  if (sz == 0) {
    memcpy8Rd(static_cast<uint8_t *>(dest), static_cast<const uint8_t *>(src),
              di, si, len);
    return;
  }
  if (sz == 1) {
    memcpy16Rd(static_cast<uint16_t *>(dest),
               static_cast<const uint16_t *>(src), di >> 1, si >> 1, len >> 1);
    return;
  }
  if (sz == 2) {
    memcpy32Rd(static_cast<uint32_t *>(dest),
               static_cast<const uint32_t *>(src), di >> 2, si >> 2, len >> 2);
    return;
  }
  if (sz == 3) {
    memcpy64Rd(static_cast<uint64_t *>(dest),
               static_cast<const uint64_t *>(src), di >> 3, si >> 3, len >> 3);
    return;
  }
  assert(0 && "invalid memcopy word-size");
}

// Size-dispatching write copy (simulator buffer -> emulated memory).
//
// `di`, `si` and `len` are given in bytes; `sz` is the log2 of the unit
// size (0 = bytes, 1 = half words, 2 = words, 3 = double words). The byte
// quantities are converted to unit counts (rounding down) before the
// matching memcpyNNWr helper is invoked. Any other `sz` trips the assert.
void
memcpySzWr(void *dest, const void *src, uint64_t di, uint64_t si, uint64_t len,
           int sz)
{
  if (sz == 0) {
    memcpy8Wr(static_cast<uint8_t *>(dest), static_cast<const uint8_t *>(src),
              di, si, len);
    return;
  }
  if (sz == 1) {
    memcpy16Wr(static_cast<uint16_t *>(dest),
               static_cast<const uint16_t *>(src), di >> 1, si >> 1, len >> 1);
    return;
  }
  if (sz == 2) {
    memcpy32Wr(static_cast<uint32_t *>(dest),
               static_cast<const uint32_t *>(src), di >> 2, si >> 2, len >> 2);
    return;
  }
  if (sz == 3) {
    memcpy64Wr(static_cast<uint64_t *>(dest),
               static_cast<const uint64_t *>(src), di >> 3, si >> 3, len >> 3);
    return;
  }
  assert(0 && "invalid memcopy word-size");
}

// When writing to the size property, we will unmap the old data and map
// in new memory.
// That means that all the RAM content will be erased.
static void
writeSizeProp(void *obj, temu_Propval pv, int idx TEMU_UNUSED)
{
  auto ram = reinterpret_cast<Ram *>(obj);
  if (ram->Size) {
    munmap(ram->Data, ram->Size);
  }
  ram->Size = temu_propValueU64(pv);
  ram->Data =
      (uint8_t *)mmap(nullptr, temu_propValueU64(pv), PROT_READ | PROT_WRITE,
                      MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
}

void *
createRam(const char *name TEMU_UNUSED, int argc, const temu_CreateArg *argv)
{
  auto ram = new Ram;
  memset(ram, 0, sizeof(Ram));

  for (int i = 0; i < argc; ++i) {
    if (!strcmp(argv[i].Key, "size")) {
      if (temu_isNumber(argv[i].Val)) {
        writeSizeProp(ram, temu_makePropU64(temu_asUnsigned(argv[i].Val)), 0);
      } else {
        temu_logError(nullptr, "creating RAM with bad size arg");
      }
    }
  }

  return ram;
}

void
disposeRam(void *obj)
{
  auto ram = reinterpret_cast<Ram *>(obj);
  if (ram->Data) {
    munmap(ram->Data, ram->Size);
  }

  delete ram;
}

// Instruction fetch handler.
//
// Only small transactions are supported: mt->Size is the log2 of the access
// size (0..3). On success mt->Value receives the data and mt->Page the host
// address of the 4 KiB page containing the offset. Accesses that do not fit
// inside the RAM yield Value = 0 and Page = nullptr.
//
// Bytes and half words are located by xoring the offset with 3 and 2
// respectively, because the RAM keeps its 32-bit words in host order.
static void
fetchFromRam(void *obj, temu_MemTransaction *mt)
{
  auto ram = reinterpret_cast<Ram *>(obj);
  const uint64_t offset = mt->Offset;

  // `>=`: an offset equal to the size is already one past the end.
  // The second test keeps the whole access (1 << Size bytes) in range.
  const uint64_t bytes = static_cast<uint64_t>(1) << (mt->Size & 3);
  if (offset >= ram->Size || bytes > ram->Size - offset) {
    mt->Value = 0;
    mt->Page = nullptr;
    return;
  }
  assert((mt->Size >> 2) == 0);
  switch (mt->Size) {
  case 0:
    mt->Value = ram->Data[offset ^ 3];
    break;
  case 1:
    mt->Value = reinterpret_cast<uint16_t *>(ram->Data)[(offset ^ 2) >> 1];
    break;
  case 2:
    mt->Value = reinterpret_cast<uint32_t *>(ram->Data)[offset >> 2];
    break;
  case 3:
    mt->Value = reinterpret_cast<uint64_t *>(ram->Data)[offset >> 3];
    break;
  }

  // Use a 64-bit mask: the literal 0xfffff000 is a 32-bit value and would
  // also clear bits 32..63 of the offset.
  mt->Page = &ram->Data[offset & ~static_cast<uint64_t>(0xfff)];
}

// Read handler for small and large transactions.
//
// mt->Size encodes the log2 of the unit size in its two low bits and, for
// large transactions, the number of units in the remaining bits. For small
// transactions (unit count 0) the data is returned in mt->Value; for large
// ones mt->Value is the address of the caller's buffer, which is filled
// with `units` elements.
//
// On success mt->Page is set to the host address of the 4 KiB page that
// contains the offset. If the access does not fit completely inside the
// RAM, mt->Value is set to 0 and mt->Page to nullptr and nothing is copied.
static void
readFromRam(void *obj, temu_MemTransaction *mt)
{
  auto ram = reinterpret_cast<Ram *>(obj);
  const uint64_t offset = mt->Offset;
  const uint64_t units = mt->Size >> 2;
  const unsigned unitLog2 = mt->Size & 3;
  const uint64_t count = units ? units : 1;

  // Reject accesses starting at or beyond the end (`>=`, not `>`) and
  // accesses whose tail would run past the end. The comparison is done in
  // units so that `count << unitLog2` cannot overflow.
  if (offset >= ram->Size || count > ((ram->Size - offset) >> unitLog2)) {
    mt->Value = 0;
    mt->Page = nullptr;
    return;
  }

  if (units) {
    // The memory space automatically swaps small transactions
    // with differing endianness from the model.
    // However for large transactions, we need to swap the buffers manually.
    const bool swapData = (mt->Flags & TEMU_MT_LITTLE_ENDIAN) != 0;
    switch (unitLog2) {
    case 0: {
      auto *out = reinterpret_cast<uint8_t *>(mt->Value);
      for (uint64_t i = 0; i < units; ++i) {
        out[i] = ram->Data[(offset + i) ^ 3];
      }
      break;
    }
    case 1: {
      auto *out = reinterpret_cast<uint16_t *>(mt->Value);
      const auto *mem = reinterpret_cast<const uint16_t *>(ram->Data);
      for (uint64_t i = 0; i < units; ++i) {
        uint16_t data = mem[((offset + (i * 2)) ^ 2) >> 1];
        out[i] = swapData ? temu_swap16(data) : data;
      }
      break;
    }
    case 2: {
      auto *out = reinterpret_cast<uint32_t *>(mt->Value);
      const auto *mem = reinterpret_cast<const uint32_t *>(ram->Data);
      for (uint64_t i = 0; i < units; ++i) {
        uint32_t data = mem[(offset + (i * 4)) >> 2];
        out[i] = swapData ? temu_swap32(data) : data;
      }
      break;
    }
    case 3: {
      auto *out = reinterpret_cast<uint64_t *>(mt->Value);
      const auto *mem = reinterpret_cast<const uint64_t *>(ram->Data);
      for (uint64_t i = 0; i < units; ++i) {
        uint64_t data = mem[(offset + (i * 8)) >> 3];
        data = swapData ? temu_swap64(data) : data;
        // Double words additionally get their two 32-bit halves exchanged.
        out[i] = temu_swap64Word(data);
      }
      break;
    }
    }
  } else {
    switch (unitLog2) {
    case 0:
      mt->Value = ram->Data[offset ^ 3];
      break;
    case 1:
      mt->Value = reinterpret_cast<uint16_t *>(ram->Data)[(offset ^ 2) >> 1];
      break;
    case 2:
      mt->Value = reinterpret_cast<uint32_t *>(ram->Data)[offset >> 2];
      break;
    case 3:
      mt->Value = reinterpret_cast<uint64_t *>(ram->Data)[offset >> 3];
      break;
    }
  }

  // Use a 64-bit mask: the literal 0xfffff000 is a 32-bit value and would
  // also clear bits 32..63 of the offset.
  mt->Page = &ram->Data[offset & ~static_cast<uint64_t>(0xfff)];
}

// Write handler for small and large transactions.
//
// mt->Size encodes the log2 of the unit size in its two low bits and, for
// large transactions, the number of units in the remaining bits. For small
// transactions (unit count 0) the data is taken from mt->Value; for large
// ones mt->Value is the address of the caller's buffer holding `units`
// elements.
//
// On success mt->Page is set to the host address of the 4 KiB page that
// contains the offset. If the access does not fit completely inside the
// RAM, nothing is written and mt->Page is set to nullptr.
static void
writeToRam(void *obj, temu_MemTransaction *mt)
{
  auto ram = reinterpret_cast<Ram *>(obj);
  const uint64_t offset = mt->Offset;
  const uint64_t units = mt->Size >> 2;
  const unsigned unitLog2 = mt->Size & 3;
  const uint64_t count = units ? units : 1;

  // Reject accesses starting at or beyond the end (`>=`, not `>`) and
  // accesses whose tail would run past the end. The comparison is done in
  // units so that `count << unitLog2` cannot overflow.
  if (offset >= ram->Size || count > ((ram->Size - offset) >> unitLog2)) {
    mt->Page = nullptr;
    return;
  }

  if (units) {
    // Only runtime API and device models can issue large transactions.
    //
    // The memory space automatically swaps small transactions
    // with differing endianness from the model.
    // However for large transactions, we need to swap the buffers manually.
    const bool swapData = (mt->Flags & TEMU_MT_LITTLE_ENDIAN) != 0;
    switch (unitLog2) {
    case 0: {
      const auto *in = reinterpret_cast<const uint8_t *>(mt->Value);
      for (uint64_t i = 0; i < units; ++i) {
        ram->Data[(offset + i) ^ 3] = in[i];
      }
      break;
    }
    case 1: {
      const auto *in = reinterpret_cast<const uint16_t *>(mt->Value);
      auto *mem = reinterpret_cast<uint16_t *>(ram->Data);
      for (uint64_t i = 0; i < units; ++i) {
        const uint16_t data = in[i];
        mem[((offset + (i * 2)) ^ 2) >> 1] =
            swapData ? temu_swap16(data) : data;
      }
      break;
    }
    case 2: {
      const auto *in = reinterpret_cast<const uint32_t *>(mt->Value);
      auto *mem = reinterpret_cast<uint32_t *>(ram->Data);
      for (uint64_t i = 0; i < units; ++i) {
        const uint32_t data = in[i];
        mem[(offset + (i * 4)) >> 2] = swapData ? temu_swap32(data) : data;
      }
      break;
    }
    case 3: {
      // NOTE(review): the large-transaction read path in this file passes
      // 64-bit units through temu_swap64Word(), this write path does not,
      // so a large 64-bit write followed by a large 64-bit read does not
      // round-trip. Left unchanged here; confirm the intended layout.
      const auto *in = reinterpret_cast<const uint64_t *>(mt->Value);
      auto *mem = reinterpret_cast<uint64_t *>(ram->Data);
      for (uint64_t i = 0; i < units; ++i) {
        const uint64_t data = in[i];
        mem[(offset + (i * 8)) >> 3] = swapData ? temu_swap64(data) : data;
      }
      break;
    }
    }
  } else {
    // Processors only issue small transactions,
    // so we end up here in these cases.
    switch (unitLog2) {
    case 0:
      ram->Data[offset ^ 3] = static_cast<uint8_t>(mt->Value);
      break;
    case 1:
      reinterpret_cast<uint16_t *>(ram->Data)[(offset ^ 2) >> 1] =
          static_cast<uint16_t>(mt->Value);
      break;
    case 2:
      reinterpret_cast<uint32_t *>(ram->Data)[offset >> 2] =
          static_cast<uint32_t>(mt->Value);
      break;
    case 3:
      reinterpret_cast<uint64_t *>(ram->Data)[offset >> 3] = mt->Value;
      break;
    }
  }

  // Ensure TEMU processors can cache the page pointer.
  // Use a 64-bit mask: the literal 0xfffff000 is a 32-bit value and would
  // also clear bits 32..63 of the offset.
  mt->Page = &ram->Data[offset & ~static_cast<uint64_t>(0xfff)];
}
// Capability query of the memory access interface.
//
// Every instance reports the same immutable descriptor, so `Obj` is not
// consulted: all E/R/W/F access capability bits, large transaction
// support enabled, memory kind RAM, big-endian layout.
const temu_MemAccessCapabilities *
getRamCapabilities(void *Obj)
{
  static const temu_MemAccessCapabilities RamCaps = {
      TEMU_MEM_ACCESS_CAP_F_ALL | TEMU_MEM_ACCESS_CAP_W_ALL |
          TEMU_MEM_ACCESS_CAP_R_ALL | TEMU_MEM_ACCESS_CAP_E_ALL,
      1, // Large transaction support
      teMK_RAM,
      teME_BigEndian,
  };
  return &RamCaps;
}

// Probe handler: reports the host address of the 4 KiB page that contains
// mt->Offset without transferring any data.
//
// Offsets outside the RAM yield Page = nullptr, so no page pointer beyond
// the buffer is ever handed out.
void
probeRam(void *obj, temu_MemTransaction *mt)
{
  auto ram = reinterpret_cast<Ram *>(obj);
  const uint64_t offset = mt->Offset;

  if (offset >= ram->Size) {
    mt->Page = nullptr;
    return;
  }

  // Use a 64-bit mask: the literal 0xfffff000 is a 32-bit value and would
  // also clear bits 32..63 of the offset.
  mt->Page = &(ram->Data[offset & ~static_cast<uint64_t>(0xfff)]);
}

// Memory access interface table of the RAM model.
// The initializer is positional: the fetch, read and write handlers come
// first, the next two slots are left as nullptr (not provided by this
// model), followed by the capability query and the probe handler.
temu_MemAccessIface MemAccessIface = {
    fetchFromRam, readFromRam,        writeToRam, nullptr,
    nullptr,      getRamCapabilities, probeRam};

// Legacy MemoryIface read: copies `len` bytes starting at RAM offset `addr`
// into `dest`.
//
// `swap` is forwarded to memcpySzRd() as the log2 of the unit size
// (0 = bytes ... 3 = double words) and selects how the data is reordered.
// Returns `len` on success, or -1 (after logging an error) if the range
// [addr, addr + len) does not lie inside the RAM.
int
readBytesFromRam(void *obj, void *dest, uint64_t addr, uint32_t len, int swap)
{
  auto ram = reinterpret_cast<Ram *>(obj);

  // Written without `addr + len` so that a huge addr cannot wrap around
  // and slip past the check.
  if (len > ram->Size || addr > ram->Size - len) {
    temu_logError(obj, "readBytes: out of bounds");
    return -1;
  }

  memcpySzRd(dest, ram->Data, 0, addr, len, swap);

  return static_cast<int>(len);
}

// Legacy MemoryIface write: copies `len` bytes from `src` into the RAM,
// starting at offset `addr`.
//
// `swap` is forwarded to memcpySzWr() as the log2 of the unit size
// (0 = bytes ... 3 = double words) and selects how the data is reordered.
// Returns `len` on success, or -1 (after logging an error) if the range
// [addr, addr + len) does not lie inside the RAM.
int
writeBytesToRam(void *obj, uint64_t addr, uint32_t len, const void *src,
                int swap)
{
  auto ram = reinterpret_cast<Ram *>(obj);

  // Written without `addr + len` so that a huge addr cannot wrap around
  // and slip past the check.
  if (len > ram->Size || addr > ram->Size - len) {
    temu_logError(obj, "writeBytes: out of bounds");
    return -1;
  }
  memcpySzWr(ram->Data, src, addr, 0, len, swap);

  return static_cast<int>(len);
}

// Legacy memory interface table: block read followed by block write
// (positional initializer).
temu_MemoryIface MemoryIface = {
    readBytesFromRam,
    writeBytesToRam,
};

// VTable bundling the two memory interfaces of this model; it is attached
// to the class with temu_setVTable() in the plugin init function.
temu_MemVTable MemVTable = {
    &MemAccessIface,
    &MemoryIface,
};

void
reset(void *obj, int warm)
{
  auto ram = reinterpret_cast<Ram *>(obj);
  if (warm == 0) {
    if (ram->Size) {
      if (ram->Data) {
        munmap(ram->Data, ram->Size);
      }
      ram->Data = (uint8_t *)mmap(nullptr, ram->Size, PROT_READ | PROT_WRITE,
                                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
    }
  }
}

// Device and reset interface tables. Both route to reset(); the second
// slot of the device interface is left unset (nullptr).
temu_DeviceIface DevIface = {reset, nullptr};
temu_ResetIface ResetIface = {reset};

} // namespace

// Plugin entry point: registers the "BigEndianRam" class together with its
// interfaces, its "size" property and the "size" parameter of its
// new-command.
TEMU_PLUGIN_INIT
{
  // Register the class with its constructor / destructor pair.
  temu_Class *c = temu_registerClass("BigEndianRam", createRam, disposeRam);

  // Expose the interface tables defined in this file.
  temu_addInterface(c, "MemAccessIface", TEMU_MEM_ACCESS_IFACE_TYPE,
                    &MemAccessIface);
  temu_addInterface(c, "MemoryIface", TEMU_MEMORY_IFACE_TYPE, &MemoryIface);
  temu_addInterface(c, "DeviceIface", TEMU_DEVICE_IFACE_TYPE, &DevIface);
  temu_addInterface(c, "ResetIface", TEMU_RESET_IFACE_TYPE, &ResetIface);

  // Writes to "size" go through writeSizeProp, which reallocates the
  // buffer (and thereby erases the RAM content).
  temu_addProperty(c, "size", offsetof(Ram, Size), teTY_U64, 1, writeSizeProp);

  temu_qualifyAsMemory(c);
  temu_setVTable(c, &MemVTable);

  // Attach a size parameter to the new-command.
  auto newCmd = temu_classGetCmd(c->Super.Class, "new");
  // NOTE(review): this check disappears when NDEBUG is defined, in which
  // case a null newCmd would be passed on to temu_classCmdAddParam().
  assert(newCmd != nullptr);
  temu_classCmdAddParam(newCmd, "size", teCOK_Int, 0, "Size of RAM in bytes");
}
