//===--- PostingList.cpp - Symbol identifiers storage interface -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "PostingList.h" #include "Iterator.h" #include "Token.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Error.h" #include "llvm/Support/MathExtras.h" namespace clang { namespace clangd { namespace dex { namespace { /// Implements iterator of PostingList chunks. This requires iterating over two /// levels: the first level iterator iterates over the chunks and decompresses /// them on-the-fly when the contents of chunk are to be seen. class ChunkIterator : public Iterator { public: explicit ChunkIterator(const Token *Tok, llvm::ArrayRef<Chunk> Chunks) : Tok(Tok), Chunks(Chunks), CurrentChunk(Chunks.begin()) { if (!Chunks.empty()) { DecompressedChunk = CurrentChunk->decompress(); CurrentID = DecompressedChunk.begin(); } } bool reachedEnd() const override { return CurrentChunk == Chunks.end(); } /// Advances cursor to the next item. void advance() override { assert(!reachedEnd() && "Posting List iterator can't advance() at the end."); ++CurrentID; normalizeCursor(); } /// Applies binary search to advance cursor to the next item with DocID /// equal or higher than the given one. void advanceTo(DocID ID) override { assert(!reachedEnd() && "Posting List iterator can't advance() at the end."); if (ID <= peek()) return; advanceToChunk(ID); // Try to find ID within current chunk. CurrentID = std::partition_point(CurrentID, DecompressedChunk.end(), [&](const DocID D) { return D < ID; }); normalizeCursor(); } DocID peek() const override { assert(!reachedEnd() && "Posting List iterator can't peek() at the end."); return *CurrentID; } float consume() override { assert(!reachedEnd() && "Posting List iterator can't consume() at the end."); return 1; } size_t estimateSize() const override { return Chunks.size() * ApproxEntriesPerChunk; } private: llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { if (Tok != nullptr) return OS << *Tok; OS << '['; const char *Sep = ""; for (const Chunk &C : Chunks) for (const DocID Doc : C.decompress()) { OS << Sep << Doc; Sep = " "; } return OS << ']'; } /// If the cursor is at the end of a chunk, place it at the start of the next /// chunk. void normalizeCursor() { // Invariant is already established if examined chunk is not exhausted. if (CurrentID != std::end(DecompressedChunk)) return; // Advance to next chunk if current one is exhausted. ++CurrentChunk; if (CurrentChunk == Chunks.end()) // Reached the end of PostingList. return; DecompressedChunk = CurrentChunk->decompress(); CurrentID = DecompressedChunk.begin(); } /// Advances CurrentChunk to the chunk which might contain ID. void advanceToChunk(DocID ID) { if ((CurrentChunk != Chunks.end() - 1) && ((CurrentChunk + 1)->Head <= ID)) { CurrentChunk = std::partition_point(CurrentChunk + 1, Chunks.end(), [&](const Chunk &C) { return C.Head < ID; }); --CurrentChunk; DecompressedChunk = CurrentChunk->decompress(); CurrentID = DecompressedChunk.begin(); } } const Token *Tok; llvm::ArrayRef<Chunk> Chunks; /// Iterator over chunks. /// If CurrentChunk is valid, then DecompressedChunk is /// CurrentChunk->decompress() and CurrentID is a valid (non-end) iterator /// into it. decltype(Chunks)::const_iterator CurrentChunk; llvm::SmallVector<DocID, Chunk::PayloadSize + 1> DecompressedChunk; /// Iterator over DecompressedChunk. decltype(DecompressedChunk)::iterator CurrentID; static constexpr size_t ApproxEntriesPerChunk = 15; }; static constexpr size_t BitsPerEncodingByte = 7; /// Writes a variable length DocID into the buffer and updates the buffer size. /// If it doesn't fit, returns false and doesn't write to the buffer. bool encodeVByte(DocID Delta, llvm::MutableArrayRef<uint8_t> &Payload) { assert(Delta != 0 && "0 is not a valid PostingList delta."); // Calculate number of bytes Delta encoding would take by examining the // meaningful bits. unsigned Width = 1 + llvm::findLastSet(Delta) / BitsPerEncodingByte; if (Width > Payload.size()) return false; do { uint8_t Encoding = Delta & 0x7f; Delta >>= 7; Payload.front() = Delta ? Encoding | 0x80 : Encoding; Payload = Payload.drop_front(); } while (Delta != 0); return true; } /// Use Variable-length Byte (VByte) delta encoding to compress sorted list of /// DocIDs. The compression stores deltas (differences) between subsequent /// DocIDs and encodes these deltas utilizing the least possible number of /// bytes. /// /// Each encoding byte consists of two parts: the first bit (continuation bit) /// indicates whether this is the last byte (0 if this byte is the last) of /// current encoding and seven bytes a piece of DocID (payload). DocID contains /// 32 bits and therefore it takes up to 5 bytes to encode it (4 full 7-bit /// payloads and one 4-bit payload), but in practice it is expected that gaps /// (deltas) between subsequent DocIDs are not large enough to require 5 bytes. /// In very dense posting lists (with average gaps less than 128) this /// representation would be 4 times more efficient than raw DocID array. /// /// PostingList encoding example: /// /// DocIDs 42 47 7000 /// gaps 5 6958 /// Encoding (raw number) 00000101 10110110 00101110 std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) { assert(!Documents.empty() && "Can't encode empty sequence."); std::vector<Chunk> Result; Result.emplace_back(); DocID Last = Result.back().Head = Documents.front(); llvm::MutableArrayRef<uint8_t> RemainingPayload = Result.back().Payload; for (DocID Doc : Documents.drop_front()) { if (!encodeVByte(Doc - Last, RemainingPayload)) { // didn't fit, flush chunk Result.emplace_back(); Result.back().Head = Doc; RemainingPayload = Result.back().Payload; } Last = Doc; } return std::vector<Chunk>(Result); // no move, shrink-to-fit } /// Reads variable length DocID from the buffer and updates the buffer size. If /// the stream is terminated, return None. llvm::Optional<DocID> readVByte(llvm::ArrayRef<uint8_t> &Bytes) { if (Bytes.front() == 0 || Bytes.empty()) return None; DocID Result = 0; bool HasNextByte = true; for (size_t Length = 0; HasNextByte && !Bytes.empty(); ++Length) { assert(Length <= 5 && "Malformed VByte encoding sequence."); // Write meaningful bits to the correct place in the document decoding. Result |= (Bytes.front() & 0x7f) << (BitsPerEncodingByte * Length); if ((Bytes.front() & 0x80) == 0) HasNextByte = false; Bytes = Bytes.drop_front(); } return Result; } } // namespace llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Chunk::decompress() const { llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Result{Head}; llvm::ArrayRef<uint8_t> Bytes(Payload); DocID Delta; for (DocID Current = Head; !Bytes.empty(); Current += Delta) { auto MaybeDelta = readVByte(Bytes); if (!MaybeDelta) break; Delta = *MaybeDelta; Result.push_back(Current + Delta); } return llvm::SmallVector<DocID, Chunk::PayloadSize + 1>{Result}; } PostingList::PostingList(llvm::ArrayRef<DocID> Documents) : Chunks(encodeStream(Documents)) {} std::unique_ptr<Iterator> PostingList::iterator(const Token *Tok) const { return std::make_unique<ChunkIterator>(Tok, Chunks); } } // namespace dex } // namespace clangd } // namespace clang