//===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This implements the interface for the profileCollectorService.
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
#include "sanitizer_common/sanitizer_common.h"
#include "xray_allocator.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <memory>
#include <pthread.h>
#include <type_traits>
#include <utility>

namespace __xray {
namespace profileCollectorService {

namespace {

SpinMutex GlobalMutex;
struct ThreadTrie {
  tid_t TId;
  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
};

struct ProfileBuffer {
  void *Data;
  size_t Size;
};

// Current version of the profile format.
constexpr u64 XRayProfilingVersion = 0x20180424;

// Identifier for XRay profiling files: the ASCII bytes of 'xrayprof' spelled
// out as a 64-bit hexadecimal constant.
constexpr u64 XRayMagicBytes = 0x7872617970726f66;

struct XRayProfilingFileHeader {
  const u64 MagicBytes = XRayMagicBytes;
  const u64 Version = XRayProfilingVersion;
  u64 Timestamp = 0; // System time in nanoseconds.
  u64 PID = 0;       // Process ID.
};
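
// On disk, a profile is a single XRayProfilingFileHeader followed by a
// sequence of per-thread blocks, each of which starts with a BlockHeader.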

struct BlockHeader {
  u32 BlockSize; // Size of the block in bytes, including this header.
  u32 BlockNum;  // Zero-based index of the block within the profile.
  u64 ThreadId;  // Id of the thread this block's data was collected from.
};

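// Everything we need to keep for a single thread's profile data: the queue
// its buffers came from, the buffers themselves, the allocators carved out of
// those buffers, and the call trie built on top of those allocators.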
struct ThreadData {
  BufferQueue *BQ;
  FunctionCallTrie::Allocators::Buffers Buffers;
  FunctionCallTrie::Allocators Allocators;
  FunctionCallTrie FCT;
  tid_t TId;
};

using ThreadDataArray = Array<ThreadData>;
using ThreadDataAllocator = ThreadDataArray::AllocatorType;

// We use a separate buffer queue as the backing store for the allocator used
// by the ThreadData array. This lets us take ownership of the buffers,
// allocators, and tries associated with a thread by moving the data into the
// array, instead of attempting to copy the data to a separately backed set of
// tries.
static typename std::aligned_storage<
    sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
static BufferQueue *BQ = nullptr;
static BufferQueue::Buffer Buffer;
static typename std::aligned_storage<sizeof(ThreadDataAllocator),
                                     alignof(ThreadDataAllocator)>::type
    ThreadDataAllocatorStorage;
static typename std::aligned_storage<sizeof(ThreadDataArray),
                                     alignof(ThreadDataArray)>::type
    ThreadDataArrayStorage;

static ThreadDataAllocator *TDAllocator = nullptr;
static ThreadDataArray *TDArray = nullptr;

using ProfileBufferArray = Array<ProfileBuffer>;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;

// These need to be global aligned storage to avoid dynamic initialization. We
// need these to be aligned to allow us to placement new objects into the
// storage, and have pointers to those objects be appropriately aligned.
static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
    ProfileBuffersStorage;
static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
    ProfileBufferArrayAllocatorStorage;

static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;

// Use a global flag to determine whether the collector implementation has been
// initialized.
static atomic_uint8_t CollectorInitialized{0};

} // namespace

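// Accepts ownership of a thread's FunctionCallTrie along with the allocators
// and buffer queue buffers backing it, stashing them in the global TDArray
// for later serialization. If the collector is not yet initialized, or if
// appending to the array fails, the objects are destroyed and their buffers
// released back to the queue instead.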
void post(BufferQueue *Q, FunctionCallTrie &&T,
          FunctionCallTrie::Allocators &&A,
          FunctionCallTrie::Allocators::Buffers &&B,
          tid_t TId) XRAY_NEVER_INSTRUMENT {
  DCHECK_NE(Q, nullptr);

  // Bail out early if the collector has not been initialized.
  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
    T.~FunctionCallTrie();
    A.~Allocators();
    Q->releaseBuffer(B.NodeBuffer);
    Q->releaseBuffer(B.RootsBuffer);
    Q->releaseBuffer(B.ShadowStackBuffer);
    Q->releaseBuffer(B.NodeIdPairBuffer);
    B.~Buffers();
    return;
  }

  {
    SpinMutexLock Lock(&GlobalMutex);
    DCHECK_NE(TDAllocator, nullptr);
    DCHECK_NE(TDArray, nullptr);

    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
                               TId) == nullptr) {
      // If we fail to add the data to the array, we should destroy the
      // objects handed to us.
      T.~FunctionCallTrie();
      A.~Allocators();
      Q->releaseBuffer(B.NodeBuffer);
      Q->releaseBuffer(B.RootsBuffer);
      Q->releaseBuffer(B.ShadowStackBuffer);
      Q->releaseBuffer(B.NodeIdPairBuffer);
      B.~Buffers();
    }
  }
}

// A PathArray holds the function ids that make up a stack trace. In this
// context a path is almost always represented from the leaf function in a
// call stack up to a root of the call trie; e.g. for a call stack
// main -> f -> g, the path is stored as [g's id, f's id, main's id].
using PathArray = Array<int32_t>;

struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // The Path in this record is the list of function ids from the leaf to the
  // root of the function call stack, as represented in a FunctionCallTrie.
  PathArray Path;
  const FunctionCallTrie::Node *Node;
};

namespace {

using ProfileRecordArray = Array<ProfileRecord>;

// Perform a depth-first traversal from each root of the FunctionCallTrie to
// generate the path(s) and the data associated with each path.
static void
populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
  using StackArray = Array<const FunctionCallTrie::Node *>;
  using StackAllocator = typename StackArray::AllocatorType;
  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
  StackArray DFSStack(StackAlloc);
  for (const auto *R : Trie.getRoots()) {
    DFSStack.Append(R);
    while (!DFSStack.empty()) {
      auto *Node = DFSStack.back();
      DFSStack.trim(1);
      if (Node == nullptr)
        continue;
      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
      if (Record == nullptr)
        return;

      // Traverse the Node's parents and as we're doing so, get the FIds in
      // the order they appear.
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path.Append(N->FId);
      DCHECK(!Record->Path.empty());

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
    }
  }
}

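// Writes a single block into Buffer->Data: the BlockHeader, then for each
// record its path (function ids from leaf to root), a zero sentinel
// terminating the path, the call count, and the cumulative local time.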
static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords)
    XRAY_NEVER_INSTRUMENT {
  auto NextPtr = static_cast<uint8_t *>(
                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // The record's path (the list of function ids) comes first:
    for (const auto FId : Record.Path)
      NextPtr =
          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);

    // Add the sentinel here.
    constexpr int32_t SentinelFId = 0;
    NextPtr = static_cast<uint8_t *>(
                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
              sizeof(SentinelFId);

    // Add the node data here.
    NextPtr =
        static_cast<uint8_t *>(internal_memcpy(
            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
        sizeof(Record.Node->CallCount);
    NextPtr = static_cast<uint8_t *>(
                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                  sizeof(Record.Node->CumulativeLocalTime))) +
              sizeof(Record.Node->CumulativeLocalTime);
  }

  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
}

} // namespace

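// Rebuilds the global ProfileBuffers from the per-thread tries collected so
// far: any previously serialized buffers are deallocated first, then each
// thread's trie is flattened into one contiguous block prefixed with a
// BlockHeader.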
void serialize() XRAY_NEVER_INSTRUMENT {
  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
    return;

  SpinMutexLock Lock(&GlobalMutex);

  // Clear out the global ProfileBuffers array if it's not empty.
  for (auto &B : *ProfileBuffers)
    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
  ProfileBuffers->trim(ProfileBuffers->size());

  DCHECK_NE(TDArray, nullptr);
  if (TDArray->empty())
    return;

  // Then repopulate the global ProfileBuffers.
  u32 I = 0;
  auto MaxSize = profilingFlags()->global_allocator_max;
  auto ProfileArena = allocateBuffer(MaxSize);
  if (ProfileArena == nullptr)
    return;

  auto ProfileArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });

  auto PathArena = allocateBuffer(MaxSize);
  if (PathArena == nullptr)
    return;

  auto PathArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });

  for (const auto &ThreadTrie : *TDArray) {
    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
    ProfileRecordAllocator PRAlloc(ProfileArena,
                                   profilingFlags()->global_allocator_max);
    ProfileRecord::PathAllocator PathAlloc(
        PathArena, profilingFlags()->global_allocator_max);
    ProfileRecordArray ProfileRecords(PRAlloc);

    // Skip threads that have not recorded any function calls.
    if (ThreadTrie.FCT.getRoots().empty())
      continue;

    // First, we want to compute the amount of space we're going to need. We
    // use a local allocator and an __xray::Array<...> to store the
    // intermediary data, then compute the size as we go along. Then we
    // allocate the contiguous space to contain the thread buffer data.
    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
    DCHECK(!ThreadTrie.FCT.getRoots().empty());
    DCHECK(!ProfileRecords.empty());

    // Go through each record to compute the sizes.
    //
    // header size = block size (4 bytes)
    //   + block number (4 bytes)
    //   + thread id (8 bytes)
    // record size = path ids (4 bytes * number of ids)
    //   + sentinel (4 bytes)
    //   + call count (8 bytes)
    //   + local time (8 bytes)
    u32 CumulativeSizes = 0;
    for (const auto &Record : ProfileRecords)
      CumulativeSizes += 20 + (4 * Record.Path.size());

    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
    auto B = ProfileBuffers->Append({});
    if (B == nullptr)
      break;
    B->Size = sizeof(Header) + CumulativeSizes;
    B->Data = allocateBuffer(B->Size);
    DCHECK_NE(B->Data, nullptr);
    serializeRecords(B, Header, ProfileRecords);
  }
}

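// Tears down all collector state and re-initializes the backing stores,
// placement-new'ing the arrays and allocators back into their static storage.
// The collector is only marked initialized again once every step succeeds, so
// a failure partway through leaves the collector disabled.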
void reset() XRAY_NEVER_INSTRUMENT {
  atomic_store(&CollectorInitialized, 0, memory_order_release);
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers != nullptr) {
    // Clear out the profile buffers that have been serialized.
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
    ProfileBuffers->trim(ProfileBuffers->size());
    ProfileBuffers = nullptr;
  }

  if (TDArray != nullptr) {
    // Release the resources as required.
    for (auto &TD : *TDArray) {
      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
    }
    // We don't bother destroying the array here because we've already
    // potentially freed the backing store for the array. Instead, we reset the
    // pointer to nullptr and re-use the storage later (placement-new'ing into
    // the storage as-is).
    TDArray = nullptr;
  }

  if (TDAllocator != nullptr) {
    TDAllocator->~Allocator();
    TDAllocator = nullptr;
  }

  if (Buffer.Data != nullptr) {
    BQ->releaseBuffer(Buffer);
  }

  if (BQ == nullptr) {
    bool Success = false;
    new (&BufferQueueStorage)
        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
    if (!Success)
      return;
    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
  } else {
    BQ->finalize();

    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
        BufferQueue::ErrorCode::Ok)
      return;
  }

  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
    return;

  new (&ProfileBufferArrayAllocatorStorage)
      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
      &ProfileBufferArrayAllocatorStorage);

  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
  ProfileBuffers =
      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);

  new (&ThreadDataAllocatorStorage)
      ThreadDataAllocator(Buffer.Data, Buffer.Size);
  TDAllocator =
      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);

  atomic_store(&CollectorInitialized, 1, memory_order_release);
}

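// Implements the XRay buffer-iterator protocol over the serialized profile:
// passing a zeroed XRayBuffer yields the file header; passing the file header
// yields block 0; passing block N yields block N + 1, found via the BlockNum
// stored in each block's header; {nullptr, 0} signals the end.
//
// A consumer would drive this roughly as follows (a sketch only; 'consume' is
// a hypothetical callback, not part of this interface):
//
//   XRayBuffer B{nullptr, 0};
//   while ((B = profileCollectorService::nextBuffer(B)).Data != nullptr)
//     consume(B.Data, B.Size);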
XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
    return {nullptr, 0};

  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
      FileHeaderStorage;
  pthread_once(
      &Once, +[]() XRAY_NEVER_INSTRUMENT {
        new (&FileHeaderStorage) XRayProfilingFileHeader{};
      });

  if (UNLIKELY(B.Data == nullptr)) {
    // The first buffer should always contain the file header information.
    auto &FileHeader =
        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
    FileHeader.Timestamp = NanoTime();
    FileHeader.PID = internal_getpid();
    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
  }

  if (UNLIKELY(B.Data == &FileHeaderStorage))
    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};

  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
  auto NextBlock = Header.BlockNum + 1;
  if (NextBlock < ProfileBuffers->size())
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
}

} // namespace profileCollectorService
} // namespace __xray