From b79b6f9cf00e72b21f5629aa3c5d9c1786611ec9 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 15 Apr 2024 09:50:12 +0200 Subject: [PATCH] [BOLT] Use offset deduplication for cold fragments Apply deduplication for uniformity and BAT section size reduction. Changes BAT section size to: - large binary: 39541552 bytes (1.02x original), - medium binary: 3828996 bytes (0.64x), - small binary: 928 bytes (0.65x). Test Plan: Updated bolt-address-translation.test Reviewers: rafaelauler, dcci, ayermolo, JDevlieghere, maksfb Reviewed By: maksfb Pull Request: https://github.com/llvm/llvm-project/pull/87853 --- bolt/docs/BAT.md | 9 +- .../bolt/Profile/BoltAddressTranslation.h | 6 +- bolt/lib/Profile/BoltAddressTranslation.cpp | 86 ++++++++++--------- bolt/test/X86/bolt-address-translation.test | 2 +- 4 files changed, 53 insertions(+), 50 deletions(-) diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md index f23ef1abf8761..7ffb5d7c00816 100644 --- a/bolt/docs/BAT.md +++ b/bolt/docs/BAT.md @@ -81,9 +81,10 @@ Hot indices are delta encoded, implicitly starting at zero. | `FuncHash` | 8b | Function hash for input function | Hot | | `NumBlocks` | ULEB128 | Number of basic blocks in the original function | Hot | | `NumSecEntryPoints` | ULEB128 | Number of secondary entry points in the original function | Hot | +| `ColdInputSkew` | ULEB128 | Skew to apply to all input offsets | Cold | | `NumEntries` | ULEB128 | Number of address translation entries for a function | Both | -| `EqualElems` | ULEB128 | Number of equal offsets in the beginning of a function | Hot | -| `BranchEntries` | Bitmask, `alignTo(EqualElems, 8)` bits | If `EqualElems` is non-zero, bitmask denoting entries with `BRANCHENTRY` bit | Hot | +| `EqualElems` | ULEB128 | Number of equal offsets in the beginning of a function | Both | +| `BranchEntries` | Bitmask, `alignTo(EqualElems, 8)` bits | If `EqualElems` is non-zero, bitmask denoting entries with `BRANCHENTRY` bit | Both | Function header is followed by *Address Translation Table* with `NumEntries` total entries, and *Secondary Entry Points* table with `NumSecEntryPoints` @@ -99,8 +100,8 @@ entry is encoded. Input offsets implicitly start at zero. | `BBHash` | Optional, 8b | Basic block hash in input binary | BB | | `BBIdx` | Optional, Delta, ULEB128 | Basic block index in input binary | BB | -For hot fragments, the table omits the first `EqualElems` input offsets -where the input offset equals output offset. +The table omits the first `EqualElems` input offsets where the input offset +equals output offset. `BRANCHENTRY` bit denotes whether a given offset pair is a control flow source (branch or call instruction). If not set, it signifies a control flow target diff --git a/bolt/include/bolt/Profile/BoltAddressTranslation.h b/bolt/include/bolt/Profile/BoltAddressTranslation.h index eef05e8a0e681..68b993ee363cc 100644 --- a/bolt/include/bolt/Profile/BoltAddressTranslation.h +++ b/bolt/include/bolt/Profile/BoltAddressTranslation.h @@ -149,9 +149,9 @@ class BoltAddressTranslation { /// entries in function address translation map. APInt calculateBranchEntriesBitMask(MapTy &Map, size_t EqualElems); - /// Calculate the number of equal offsets (output = input) in the beginning - /// of the function. - size_t getNumEqualOffsets(const MapTy &Map) const; + /// Calculate the number of equal offsets (output = input - skew) in the + /// beginning of the function. + size_t getNumEqualOffsets(const MapTy &Map, uint32_t Skew) const; std::map Maps; diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp index 0141ce189acda..ac6e3d01f2718 100644 --- a/bolt/lib/Profile/BoltAddressTranslation.cpp +++ b/bolt/lib/Profile/BoltAddressTranslation.cpp @@ -153,12 +153,13 @@ APInt BoltAddressTranslation::calculateBranchEntriesBitMask(MapTy &Map, return BitMask; } -size_t BoltAddressTranslation::getNumEqualOffsets(const MapTy &Map) const { +size_t BoltAddressTranslation::getNumEqualOffsets(const MapTy &Map, + uint32_t Skew) const { size_t EqualOffsets = 0; for (const std::pair &KeyVal : Map) { const uint32_t OutputOffset = KeyVal.first; const uint32_t InputOffset = KeyVal.second >> 1; - if (OutputOffset == InputOffset) + if (OutputOffset == InputOffset - Skew) ++EqualOffsets; else break; @@ -196,12 +197,17 @@ void BoltAddressTranslation::writeMaps(std::map &Maps, SecondaryEntryPointsMap.count(Address) ? SecondaryEntryPointsMap[Address].size() : 0; + uint32_t Skew = 0; if (Cold) { auto HotEntryIt = Maps.find(ColdPartSource[Address]); assert(HotEntryIt != Maps.end()); size_t HotIndex = std::distance(Maps.begin(), HotEntryIt); encodeULEB128(HotIndex - PrevIndex, OS); PrevIndex = HotIndex; + // Skew of all input offsets for cold fragments is simply the first input + // offset. + Skew = Map.begin()->second >> 1; + encodeULEB128(Skew, OS); } else { // Function hash size_t BFHash = getBFHash(HotInputAddress); @@ -217,24 +223,21 @@ void BoltAddressTranslation::writeMaps(std::map &Maps, << '\n'); } encodeULEB128(NumEntries, OS); - // For hot fragments only: encode the number of equal offsets - // (output = input) in the beginning of the function. Only encode one offset - // in these cases. - const size_t EqualElems = Cold ? 0 : getNumEqualOffsets(Map); - if (!Cold) { - encodeULEB128(EqualElems, OS); - if (EqualElems) { - const size_t BranchEntriesBytes = alignTo(EqualElems, 8) / 8; - APInt BranchEntries = calculateBranchEntriesBitMask(Map, EqualElems); - OS.write(reinterpret_cast(BranchEntries.getRawData()), - BranchEntriesBytes); - LLVM_DEBUG({ - dbgs() << "BranchEntries: "; - SmallString<8> BitMaskStr; - BranchEntries.toString(BitMaskStr, 2, false); - dbgs() << BitMaskStr << '\n'; - }); - } + // Encode the number of equal offsets (output = input - skew) in the + // beginning of the function. Only encode one offset in these cases. + const size_t EqualElems = getNumEqualOffsets(Map, Skew); + encodeULEB128(EqualElems, OS); + if (EqualElems) { + const size_t BranchEntriesBytes = alignTo(EqualElems, 8) / 8; + APInt BranchEntries = calculateBranchEntriesBitMask(Map, EqualElems); + OS.write(reinterpret_cast(BranchEntries.getRawData()), + BranchEntriesBytes); + LLVM_DEBUG({ + dbgs() << "BranchEntries: "; + SmallString<8> BitMaskStr; + BranchEntries.toString(BitMaskStr, 2, false); + dbgs() << BitMaskStr << '\n'; + }); } const BBHashMapTy &BBHashMap = getBBHashMap(HotInputAddress); size_t Index = 0; @@ -315,10 +318,12 @@ void BoltAddressTranslation::parseMaps(std::vector &HotFuncs, uint64_t HotAddress = Cold ? 0 : Address; PrevAddress = Address; uint32_t SecondaryEntryPoints = 0; + uint64_t ColdInputSkew = 0; if (Cold) { HotIndex += DE.getULEB128(&Offset, &Err); HotAddress = HotFuncs[HotIndex]; ColdPartSource.emplace(Address, HotAddress); + ColdInputSkew = DE.getULEB128(&Offset, &Err); } else { HotFuncs.push_back(Address); // Function hash @@ -339,28 +344,25 @@ void BoltAddressTranslation::parseMaps(std::vector &HotFuncs, getULEB128Size(SecondaryEntryPoints))); } const uint32_t NumEntries = DE.getULEB128(&Offset, &Err); - // Equal offsets, hot fragments only. - size_t EqualElems = 0; + // Equal offsets. + const size_t EqualElems = DE.getULEB128(&Offset, &Err); APInt BEBitMask; - if (!Cold) { - EqualElems = DE.getULEB128(&Offset, &Err); - LLVM_DEBUG(dbgs() << formatv("Equal offsets: {0}, {1} bytes\n", - EqualElems, getULEB128Size(EqualElems))); - if (EqualElems) { - const size_t BranchEntriesBytes = alignTo(EqualElems, 8) / 8; - BEBitMask = APInt(alignTo(EqualElems, 8), 0); - LoadIntFromMemory( - BEBitMask, - reinterpret_cast( - DE.getBytes(&Offset, BranchEntriesBytes, &Err).data()), - BranchEntriesBytes); - LLVM_DEBUG({ - dbgs() << "BEBitMask: "; - SmallString<8> BitMaskStr; - BEBitMask.toString(BitMaskStr, 2, false); - dbgs() << BitMaskStr << ", " << BranchEntriesBytes << " bytes\n"; - }); - } + LLVM_DEBUG(dbgs() << formatv("Equal offsets: {0}, {1} bytes\n", EqualElems, + getULEB128Size(EqualElems))); + if (EqualElems) { + const size_t BranchEntriesBytes = alignTo(EqualElems, 8) / 8; + BEBitMask = APInt(alignTo(EqualElems, 8), 0); + LoadIntFromMemory( + BEBitMask, + reinterpret_cast( + DE.getBytes(&Offset, BranchEntriesBytes, &Err).data()), + BranchEntriesBytes); + LLVM_DEBUG({ + dbgs() << "BEBitMask: "; + SmallString<8> BitMaskStr; + BEBitMask.toString(BitMaskStr, 2, false); + dbgs() << BitMaskStr << ", " << BranchEntriesBytes << " bytes\n"; + }); } MapTy Map; @@ -375,7 +377,7 @@ void BoltAddressTranslation::parseMaps(std::vector &HotFuncs, PrevAddress = OutputAddress; int64_t InputDelta = 0; if (J < EqualElems) { - InputOffset = (OutputOffset << 1) | BEBitMask[J]; + InputOffset = (OutputOffset + ColdInputSkew << 1) | BEBitMask[J]; } else { InputDelta = DE.getSLEB128(&Offset, &Err); InputOffset += InputDelta; diff --git a/bolt/test/X86/bolt-address-translation.test b/bolt/test/X86/bolt-address-translation.test index 63234b4c1d218..e6b21c14077b4 100644 --- a/bolt/test/X86/bolt-address-translation.test +++ b/bolt/test/X86/bolt-address-translation.test @@ -37,7 +37,7 @@ # CHECK: BOLT: 3 out of 7 functions were overwritten. # CHECK: BOLT-INFO: Wrote 6 BAT maps # CHECK: BOLT-INFO: Wrote 3 function and 58 basic block hashes -# CHECK: BOLT-INFO: BAT section size (bytes): 924 +# CHECK: BOLT-INFO: BAT section size (bytes): 928 # # usqrt mappings (hot part). We match against any key (left side containing # the bolted binary offsets) because BOLT may change where it puts instructions