Skip to content

Commit

Permalink
[BOLT] Use offset deduplication for cold fragments
Browse files Browse the repository at this point in the history
Apply deduplication for uniformity and BAT section size reduction.

Changes BAT section size to:
- large binary: 39541552 bytes (1.02x original),
- medium binary: 3828996 bytes (0.64x),
- small binary: 928 bytes (0.65x).

Test Plan: Updated bolt-address-translation.test

Reviewers: rafaelauler, dcci, ayermolo, JDevlieghere, maksfb

Reviewed By: maksfb

Pull Request: llvm#87853
  • Loading branch information
aaupov authored Apr 15, 2024
1 parent 0794298 commit b79b6f9
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 50 deletions.
9 changes: 5 additions & 4 deletions bolt/docs/BAT.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,10 @@ Hot indices are delta encoded, implicitly starting at zero.
| `FuncHash` | 8b | Function hash for input function | Hot |
| `NumBlocks` | ULEB128 | Number of basic blocks in the original function | Hot |
| `NumSecEntryPoints` | ULEB128 | Number of secondary entry points in the original function | Hot |
| `ColdInputSkew` | ULEB128 | Skew to apply to all input offsets | Cold |
| `NumEntries` | ULEB128 | Number of address translation entries for a function | Both |
| `EqualElems` | ULEB128 | Number of equal offsets in the beginning of a function | Hot |
| `BranchEntries` | Bitmask, `alignTo(EqualElems, 8)` bits | If `EqualElems` is non-zero, bitmask denoting entries with `BRANCHENTRY` bit | Hot |
| `EqualElems` | ULEB128 | Number of equal offsets in the beginning of a function | Both |
| `BranchEntries` | Bitmask, `alignTo(EqualElems, 8)` bits | If `EqualElems` is non-zero, bitmask denoting entries with `BRANCHENTRY` bit | Both |

Function header is followed by *Address Translation Table* with `NumEntries`
total entries, and *Secondary Entry Points* table with `NumSecEntryPoints`
Expand All @@ -99,8 +100,8 @@ entry is encoded. Input offsets implicitly start at zero.
| `BBHash` | Optional, 8b | Basic block hash in input binary | BB |
| `BBIdx` | Optional, Delta, ULEB128 | Basic block index in input binary | BB |

For hot fragments, the table omits the first `EqualElems` input offsets
where the input offset equals output offset.
The table omits the first `EqualElems` input offsets where the input offset
equals output offset.

`BRANCHENTRY` bit denotes whether a given offset pair is a control flow source
(branch or call instruction). If not set, it signifies a control flow target
Expand Down
6 changes: 3 additions & 3 deletions bolt/include/bolt/Profile/BoltAddressTranslation.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,9 @@ class BoltAddressTranslation {
/// entries in function address translation map.
APInt calculateBranchEntriesBitMask(MapTy &Map, size_t EqualElems);

/// Calculate the number of equal offsets (output = input) in the beginning
/// of the function.
size_t getNumEqualOffsets(const MapTy &Map) const;
/// Calculate the number of equal offsets (output = input - skew) in the
/// beginning of the function.
size_t getNumEqualOffsets(const MapTy &Map, uint32_t Skew) const;

std::map<uint64_t, MapTy> Maps;

Expand Down
86 changes: 44 additions & 42 deletions bolt/lib/Profile/BoltAddressTranslation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,12 +153,13 @@ APInt BoltAddressTranslation::calculateBranchEntriesBitMask(MapTy &Map,
return BitMask;
}

size_t BoltAddressTranslation::getNumEqualOffsets(const MapTy &Map) const {
size_t BoltAddressTranslation::getNumEqualOffsets(const MapTy &Map,
uint32_t Skew) const {
size_t EqualOffsets = 0;
for (const std::pair<const uint32_t, uint32_t> &KeyVal : Map) {
const uint32_t OutputOffset = KeyVal.first;
const uint32_t InputOffset = KeyVal.second >> 1;
if (OutputOffset == InputOffset)
if (OutputOffset == InputOffset - Skew)
++EqualOffsets;
else
break;
Expand Down Expand Up @@ -196,12 +197,17 @@ void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
SecondaryEntryPointsMap.count(Address)
? SecondaryEntryPointsMap[Address].size()
: 0;
uint32_t Skew = 0;
if (Cold) {
auto HotEntryIt = Maps.find(ColdPartSource[Address]);
assert(HotEntryIt != Maps.end());
size_t HotIndex = std::distance(Maps.begin(), HotEntryIt);
encodeULEB128(HotIndex - PrevIndex, OS);
PrevIndex = HotIndex;
// Skew of all input offsets for cold fragments is simply the first input
// offset.
Skew = Map.begin()->second >> 1;
encodeULEB128(Skew, OS);
} else {
// Function hash
size_t BFHash = getBFHash(HotInputAddress);
Expand All @@ -217,24 +223,21 @@ void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
<< '\n');
}
encodeULEB128(NumEntries, OS);
// For hot fragments only: encode the number of equal offsets
// (output = input) in the beginning of the function. Only encode one offset
// in these cases.
const size_t EqualElems = Cold ? 0 : getNumEqualOffsets(Map);
if (!Cold) {
encodeULEB128(EqualElems, OS);
if (EqualElems) {
const size_t BranchEntriesBytes = alignTo(EqualElems, 8) / 8;
APInt BranchEntries = calculateBranchEntriesBitMask(Map, EqualElems);
OS.write(reinterpret_cast<const char *>(BranchEntries.getRawData()),
BranchEntriesBytes);
LLVM_DEBUG({
dbgs() << "BranchEntries: ";
SmallString<8> BitMaskStr;
BranchEntries.toString(BitMaskStr, 2, false);
dbgs() << BitMaskStr << '\n';
});
}
// Encode the number of equal offsets (output = input - skew) in the
// beginning of the function. Only encode one offset in these cases.
const size_t EqualElems = getNumEqualOffsets(Map, Skew);
encodeULEB128(EqualElems, OS);
if (EqualElems) {
const size_t BranchEntriesBytes = alignTo(EqualElems, 8) / 8;
APInt BranchEntries = calculateBranchEntriesBitMask(Map, EqualElems);
OS.write(reinterpret_cast<const char *>(BranchEntries.getRawData()),
BranchEntriesBytes);
LLVM_DEBUG({
dbgs() << "BranchEntries: ";
SmallString<8> BitMaskStr;
BranchEntries.toString(BitMaskStr, 2, false);
dbgs() << BitMaskStr << '\n';
});
}
const BBHashMapTy &BBHashMap = getBBHashMap(HotInputAddress);
size_t Index = 0;
Expand Down Expand Up @@ -315,10 +318,12 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
uint64_t HotAddress = Cold ? 0 : Address;
PrevAddress = Address;
uint32_t SecondaryEntryPoints = 0;
uint64_t ColdInputSkew = 0;
if (Cold) {
HotIndex += DE.getULEB128(&Offset, &Err);
HotAddress = HotFuncs[HotIndex];
ColdPartSource.emplace(Address, HotAddress);
ColdInputSkew = DE.getULEB128(&Offset, &Err);
} else {
HotFuncs.push_back(Address);
// Function hash
Expand All @@ -339,28 +344,25 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
getULEB128Size(SecondaryEntryPoints)));
}
const uint32_t NumEntries = DE.getULEB128(&Offset, &Err);
// Equal offsets, hot fragments only.
size_t EqualElems = 0;
// Equal offsets.
const size_t EqualElems = DE.getULEB128(&Offset, &Err);
APInt BEBitMask;
if (!Cold) {
EqualElems = DE.getULEB128(&Offset, &Err);
LLVM_DEBUG(dbgs() << formatv("Equal offsets: {0}, {1} bytes\n",
EqualElems, getULEB128Size(EqualElems)));
if (EqualElems) {
const size_t BranchEntriesBytes = alignTo(EqualElems, 8) / 8;
BEBitMask = APInt(alignTo(EqualElems, 8), 0);
LoadIntFromMemory(
BEBitMask,
reinterpret_cast<const uint8_t *>(
DE.getBytes(&Offset, BranchEntriesBytes, &Err).data()),
BranchEntriesBytes);
LLVM_DEBUG({
dbgs() << "BEBitMask: ";
SmallString<8> BitMaskStr;
BEBitMask.toString(BitMaskStr, 2, false);
dbgs() << BitMaskStr << ", " << BranchEntriesBytes << " bytes\n";
});
}
LLVM_DEBUG(dbgs() << formatv("Equal offsets: {0}, {1} bytes\n", EqualElems,
getULEB128Size(EqualElems)));
if (EqualElems) {
const size_t BranchEntriesBytes = alignTo(EqualElems, 8) / 8;
BEBitMask = APInt(alignTo(EqualElems, 8), 0);
LoadIntFromMemory(
BEBitMask,
reinterpret_cast<const uint8_t *>(
DE.getBytes(&Offset, BranchEntriesBytes, &Err).data()),
BranchEntriesBytes);
LLVM_DEBUG({
dbgs() << "BEBitMask: ";
SmallString<8> BitMaskStr;
BEBitMask.toString(BitMaskStr, 2, false);
dbgs() << BitMaskStr << ", " << BranchEntriesBytes << " bytes\n";
});
}
MapTy Map;

Expand All @@ -375,7 +377,7 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
PrevAddress = OutputAddress;
int64_t InputDelta = 0;
if (J < EqualElems) {
InputOffset = (OutputOffset << 1) | BEBitMask[J];
InputOffset = (OutputOffset + ColdInputSkew << 1) | BEBitMask[J];
} else {
InputDelta = DE.getSLEB128(&Offset, &Err);
InputOffset += InputDelta;
Expand Down
2 changes: 1 addition & 1 deletion bolt/test/X86/bolt-address-translation.test
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
# CHECK: BOLT: 3 out of 7 functions were overwritten.
# CHECK: BOLT-INFO: Wrote 6 BAT maps
# CHECK: BOLT-INFO: Wrote 3 function and 58 basic block hashes
# CHECK: BOLT-INFO: BAT section size (bytes): 924
# CHECK: BOLT-INFO: BAT section size (bytes): 928
#
# usqrt mappings (hot part). We match against any key (left side containing
# the bolted binary offsets) because BOLT may change where it puts instructions
Expand Down

0 comments on commit b79b6f9

Please sign in to comment.