1 //===-- X86Counter.cpp ------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "X86Counter.h"
10
11 // FIXME: Use appropriate wrappers for poll.h and mman.h
12 // to support Windows and remove this linux-only guard.
13 #ifdef __linux__
14 #include "llvm/Support/Endian.h"
15 #include "llvm/Support/Errc.h"
16
17 #ifdef HAVE_LIBPFM
18 #include "perfmon/perf_event.h"
19 #include "perfmon/pfmlib.h"
20 #include "perfmon/pfmlib_perf_event.h"
21 #endif // HAVE_LIBPFM
22
23 #include <atomic>
24 #include <chrono>
25 #include <cstddef>
26 #include <cstdint>
27 #include <limits>
28 #include <memory>
29 #include <vector>
30
31 #include <poll.h>
32 #include <sys/mman.h>
33 #include <unistd.h>
34
35 #if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES)
36 namespace llvm {
37 namespace exegesis {
38
// Number of entries in the LBR.
static constexpr int kLbrEntries = 16;
// Number of pages in the perf data buffer (the metadata page that precedes it
// in the mapping is not counted here).
static constexpr size_t kBufferPages = 8;
// Size of the perf data buffer in bytes. Not a constant expression because the
// page size is only known through getpagesize().
static const size_t kDataBufferSize =
    static_cast<size_t>(getpagesize()) * kBufferPages;
43
// Waits for the LBR perf event behind FileDescriptor to become readable.
// Gives up after ten seconds and hands back poll()'s return value unchanged.
static int pollLbrPerfEvent(const int FileDescriptor) {
  constexpr nfds_t NumDescriptors = 1;
  constexpr int TimeoutMs = 10000;

  // Zero-initialisation also clears revents before the call.
  struct pollfd Request = {};
  Request.fd = FileDescriptor;
  Request.events = POLLIN;
  return poll(&Request, NumDescriptors, TimeoutMs);
}
52
53 // Copies the data-buffer into Buf, given the pointer to MMapped.
copyDataBuffer(void * MMappedBuffer,char * Buf,uint64_t Tail,size_t DataSize)54 static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail,
55 size_t DataSize) {
56 // First page is reserved for perf_event_mmap_page. Data buffer starts on
57 // the next page.
58 char *Start = reinterpret_cast<char *>(MMappedBuffer) + getpagesize();
59 // The LBR buffer is a cyclic buffer, we copy data to another buffer.
60 uint64_t Offset = Tail % kDataBufferSize;
61 size_t CopySize = kDataBufferSize - Offset;
62 memcpy(Buf, Start + Offset, CopySize);
63 if (CopySize >= DataSize)
64 return;
65
66 memcpy(Buf + CopySize, Start, Offset);
67 return;
68 }
69
70 // Parses the given data-buffer for stats and fill the CycleArray.
71 // If data has been extracted successfully, also modifies the code to jump
72 // out the benchmark loop.
parseDataBuffer(const char * DataBuf,size_t DataSize,const void * From,const void * To,llvm::SmallVector<int64_t,4> * CycleArray)73 static llvm::Error parseDataBuffer(const char *DataBuf, size_t DataSize,
74 const void *From, const void *To,
75 llvm::SmallVector<int64_t, 4> *CycleArray) {
76 const char *DataPtr = DataBuf;
77 while (DataPtr < DataBuf + DataSize) {
78 struct perf_event_header Header;
79 memcpy(&Header, DataPtr, sizeof(struct perf_event_header));
80 if (Header.type != PERF_RECORD_SAMPLE) {
81 // Ignores non-sample records.
82 DataPtr += Header.size;
83 continue;
84 }
85 DataPtr += sizeof(Header);
86 uint64_t Count = llvm::support::endian::read64(DataPtr, support::native);
87 DataPtr += sizeof(Count);
88
89 struct perf_branch_entry Entry;
90 memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry));
91
92 // Read the perf_branch_entry array.
93 for (uint64_t i = 0; i < Count; ++i) {
94 const uint64_t BlockStart = From == nullptr
95 ? std::numeric_limits<uint64_t>::min()
96 : reinterpret_cast<uint64_t>(From);
97 const uint64_t BlockEnd = To == nullptr
98 ? std::numeric_limits<uint64_t>::max()
99 : reinterpret_cast<uint64_t>(To);
100
101 if (BlockStart <= Entry.from && BlockEnd >= Entry.to)
102 CycleArray->push_back(Entry.cycles);
103
104 if (i == Count - 1)
105 // We've reached the last entry.
106 return llvm::Error::success();
107
108 // Advance to next entry
109 DataPtr += sizeof(Entry);
110 memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry));
111 }
112 }
113 return llvm::make_error<llvm::StringError>("Unable to parse databuffer.",
114 llvm::errc::io_error);
115 }
116
X86LbrPerfEvent(unsigned SamplingPeriod)117 X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) {
118 assert(SamplingPeriod > 0 && "SamplingPeriod must be positive");
119 EventString = "BR_INST_RETIRED.NEAR_TAKEN";
120 Attr = new perf_event_attr();
121 Attr->size = sizeof(*Attr);
122 Attr->type = PERF_TYPE_RAW;
123 // FIXME This is SKL's encoding. Not sure if it'll change.
124 Attr->config = 0x20c4; // BR_INST_RETIRED.NEAR_TAKEN
125 Attr->sample_type = PERF_SAMPLE_BRANCH_STACK;
126 // Don't need to specify "USER" because we've already excluded HV and Kernel.
127 Attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY;
128 Attr->sample_period = SamplingPeriod;
129 Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH.
130 Attr->disabled = 1;
131 Attr->exclude_kernel = 1;
132 Attr->exclude_hv = 1;
133 Attr->read_format = PERF_FORMAT_GROUP;
134
135 FullQualifiedEventString = EventString;
136 }
137
X86LbrCounter(pfm::PerfEvent && NewEvent)138 X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&NewEvent)
139 : Counter(std::move(NewEvent)) {
140 // First page is reserved for perf_event_mmap_page. Data buffer starts on
141 // the next page, so we allocate one more page.
142 MMappedBuffer = mmap(nullptr, (kBufferPages + 1) * getpagesize(),
143 PROT_READ | PROT_WRITE, MAP_SHARED, FileDescriptor, 0);
144 if (MMappedBuffer == MAP_FAILED)
145 llvm::errs() << "Failed to mmap buffer.";
146 }
147
~X86LbrCounter()148 X86LbrCounter::~X86LbrCounter() { close(FileDescriptor); }
149
start()150 void X86LbrCounter::start() {
151 ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */);
152 }
153
checkLbrSupport()154 llvm::Error X86LbrCounter::checkLbrSupport() {
155 // Do a sample read and check if the results contain non-zero values.
156
157 X86LbrCounter counter(X86LbrPerfEvent(123));
158 counter.start();
159
160 // Prevent the compiler from unrolling the loop and get rid of all the
161 // branches. We need at least 16 iterations.
162 int Sum = 0;
163 int V = 1;
164
165 volatile int *P = &V;
166 auto TimeLimit =
167 std::chrono::high_resolution_clock::now() + std::chrono::microseconds(5);
168
169 for (int I = 0;
170 I < kLbrEntries || std::chrono::high_resolution_clock::now() < TimeLimit;
171 ++I) {
172 Sum += *P;
173 }
174
175 counter.stop();
176
177 auto ResultOrError = counter.doReadCounter(nullptr, nullptr);
178 if (ResultOrError)
179 if (!ResultOrError.get().empty())
180 // If there is at least one non-zero entry, then LBR is supported.
181 for (const int64_t &Value : ResultOrError.get())
182 if (Value != 0)
183 return Error::success();
184
185 return llvm::make_error<llvm::StringError>(
186 "LBR format with cycles is not suppported on the host.",
187 llvm::errc::not_supported);
188 }
189
190 llvm::Expected<llvm::SmallVector<int64_t, 4>>
readOrError(StringRef FunctionBytes) const191 X86LbrCounter::readOrError(StringRef FunctionBytes) const {
192 // Disable the event before reading
193 ioctl(FileDescriptor, PERF_EVENT_IOC_DISABLE, 0);
194
195 // Find the boundary of the function so that we could filter the LBRs
196 // to keep only the relevant records.
197 if (FunctionBytes.empty())
198 return llvm::make_error<llvm::StringError>("Empty function bytes",
199 llvm::errc::invalid_argument);
200 const void *From = reinterpret_cast<const void *>(FunctionBytes.data());
201 const void *To = reinterpret_cast<const void *>(FunctionBytes.data() +
202 FunctionBytes.size());
203 return doReadCounter(From, To);
204 }
205
206 llvm::Expected<llvm::SmallVector<int64_t, 4>>
doReadCounter(const void * From,const void * To) const207 X86LbrCounter::doReadCounter(const void *From, const void *To) const {
208 // The max number of time-outs/retries before we give up.
209 static constexpr int kMaxTimeouts = 160;
210
211 // Parses the LBR buffer and fills CycleArray with the sequence of cycle
212 // counts from the buffer.
213 llvm::SmallVector<int64_t, 4> CycleArray;
214 auto DataBuf = std::make_unique<char[]>(kDataBufferSize);
215 int NumTimeouts = 0;
216 int PollResult = 0;
217
218 while (PollResult <= 0) {
219 PollResult = pollLbrPerfEvent(FileDescriptor);
220 if (PollResult > 0)
221 break;
222 if (PollResult == -1)
223 return llvm::make_error<llvm::StringError>("Cannot poll LBR perf event.",
224 llvm::errc::io_error);
225 if (NumTimeouts++ >= kMaxTimeouts)
226 return llvm::make_error<llvm::StringError>(
227 "LBR polling still timed out after max number of attempts.",
228 llvm::errc::device_or_resource_busy);
229 }
230
231 struct perf_event_mmap_page Page;
232 memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page));
233
234 const uint64_t DataTail = Page.data_tail;
235 const uint64_t DataHead = Page.data_head;
236 // We're supposed to use a barrier after reading data_head.
237 std::atomic_thread_fence(std::memory_order_acq_rel);
238 const size_t DataSize = DataHead - DataTail;
239 if (DataSize > kDataBufferSize)
240 return llvm::make_error<llvm::StringError>(
241 "DataSize larger than buffer size.", llvm::errc::invalid_argument);
242
243 copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize);
244 llvm::Error error =
245 parseDataBuffer(DataBuf.get(), DataSize, From, To, &CycleArray);
246 if (!error)
247 return CycleArray;
248 return std::move(error);
249 }
250
251 } // namespace exegesis
252 } // namespace llvm
253
254 #endif // defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES)
255 #endif // __linux__
256