From 36fe44dfcccdac54021d48b9a0fa9d754769aab4 Mon Sep 17 00:00:00 2001 From: Gil Rapaport Date: Thu, 8 Nov 2018 09:01:19 +0000 Subject: [PATCH 0001/1581] [LSR] Combine unfolded offset into invariant register LSR reassociates constants as unfolded offsets when the constants fit as immediate add operands, which currently prevents such constants from being combined later with loop invariant registers. This patch modifies GenerateCombinations() to generate a second formula which includes the unfolded offset in the combined loop-invariant register. This commit fixes a bug in the original patch (committed at r345114, reverted at r345123). Differential Revision: https://reviews.llvm.org/D51861 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346390 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/LoopStrengthReduce.cpp | 54 ++++++++++--- .../AArch64/small-constant.ll | 75 +++++-------------- .../two-combinations-bug.ll | 55 ++++++++++++++ 3 files changed, 117 insertions(+), 67 deletions(-) create mode 100644 test/Transforms/LoopStrengthReduce/two-combinations-bug.ll diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 857b83da96d..e9c24d61394 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -3638,32 +3638,62 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base) { // This method is only interesting on a plurality of registers. - if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1) + if (Base.BaseRegs.size() + (Base.Scale == 1) + + (Base.UnfoldedOffset != 0) <= 1) return; // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before // processing the formula. 
Base.unscale(); - Formula F = Base; - F.BaseRegs.clear(); SmallVector Ops; + Formula NewBase = Base; + NewBase.BaseRegs.clear(); + Type *CombinedIntegerType = nullptr; for (const SCEV *BaseReg : Base.BaseRegs) { if (SE.properlyDominates(BaseReg, L->getHeader()) && - !SE.hasComputableLoopEvolution(BaseReg, L)) + !SE.hasComputableLoopEvolution(BaseReg, L)) { + if (!CombinedIntegerType) + CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType()); Ops.push_back(BaseReg); + } else - F.BaseRegs.push_back(BaseReg); + NewBase.BaseRegs.push_back(BaseReg); } - if (Ops.size() > 1) { - const SCEV *Sum = SE.getAddExpr(Ops); + + // If no register is relevant, we're done. + if (Ops.size() == 0) + return; + + // Utility function for generating the required variants of the combined + // registers. + auto GenerateFormula = [&](const SCEV *Sum) { + Formula F = NewBase; + // TODO: If Sum is zero, it probably means ScalarEvolution missed an // opportunity to fold something. For now, just ignore such cases // rather than proceed with zero in a register. - if (!Sum->isZero()) { - F.BaseRegs.push_back(Sum); - F.canonicalize(*L); - (void)InsertFormula(LU, LUIdx, F); - } + if (Sum->isZero()) + return; + + F.BaseRegs.push_back(Sum); + F.canonicalize(*L); + (void)InsertFormula(LU, LUIdx, F); + }; + + // If we collected at least two registers, generate a formula combining them. + if (Ops.size() > 1) { + SmallVector OpsCopy(Ops); // Don't let SE modify Ops. + GenerateFormula(SE.getAddExpr(OpsCopy)); + } + + // If we have an unfolded offset, generate a formula combining it with the + // registers collected. 
+ if (NewBase.UnfoldedOffset) { + assert(CombinedIntegerType && "Missing a type for the unfolded offset"); + Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset, + true)); + NewBase.UnfoldedOffset = 0; + GenerateFormula(SE.getAddExpr(Ops)); } } diff --git a/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll b/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll index 585759dd178..04ad762df99 100644 --- a/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll +++ b/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll @@ -2,45 +2,10 @@ ; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s -; LSR doesn't consider bumping a pointer by constants outside the loop when the -; constants fit as immediate add operands. The constants are re-associated as an -; unfolded offset rather than a register and are not combined later with -; loop-invariant registers. For large-enough constants LSR produces better -; solutions for these test cases, with test1 switching from: -; -; The chosen solution requires 2 instructions 2 regs, with addrec cost 1, plus 1 scale cost, plus 4 imm cost, plus 1 setup cost: -; LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64 -; -7 + reg({(7 + %start),+,1}<%for.body>) -; LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float* -; reg(%arr) + 4*reg({(7 + %start),+,1}<%for.body>) -; -; to: -; -; The chosen solution requires 1 instruction 2 regs, with addrec cost 1, plus 1 scale cost, plus 1 setup cost: -; LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64 -; reg({%start,+,1}<%for.body>) -; LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float* -; reg((88888 + %arr)) + 4*reg({%start,+,1}<%for.body>) -; -; and test2 switching from: -; -; The chosen solution requires 2 instructions 2 regs, with addrec cost 1, plus 1 base add, plus 1 scale cost: -; LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64 -; 
reg({%start,+,1}<%for.body>) -; LSR Use: Kind=Basic, Offsets={0}, widest fixup type: i64 -; reg({%start,+,1}<%for.body>) -; LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float* -; reg(%arr) + 4*reg({%start,+,1}<%for.body>) + imm(28) -; -; to: -; -; The chosen solution requires 1 instruction 2 regs, with addrec cost 1, plus 1 scale cost, plus 1 setup cost: -; LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64 -; reg({%start,+,1}<%for.body>) -; LSR Use: Kind=Basic, Offsets={0}, widest fixup type: i64 -; reg({%start,+,1}<%for.body>) -; LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float* -; reg((88888 + %arr)) + 4*reg({%start,+,1}<%for.body>) +; Test LSR for giving small constants, which get re-associated as unfolded +; offset, a chance to get combined with loop-invariant registers (same as +; large constants which do not fit as add immediate operands). LSR +; favors here to bump the base pointer outside the loop. 
; float test(float *arr, long long start, float threshold) { ; for (long long i = start; i != 0; ++i) { @@ -56,17 +21,16 @@ define float @test1(float* nocapture readonly %arr, i64 %start, float %threshold ; CHECK-NEXT: fmov s2, #-7.00000000 ; CHECK-NEXT: cbz x1, .LBB0_5 ; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: add x8, x1, #7 // =7 +; CHECK-NEXT: add x8, x0, #28 // =28 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr s1, [x0, x8, lsl #2] +; CHECK-NEXT: ldr s1, [x8, x1, lsl #2] ; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: b.gt .LBB0_6 ; CHECK-NEXT: // %bb.3: // %for.cond ; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: add x8, x8, #1 // =1 -; CHECK-NEXT: cmp x8, #7 // =7 -; CHECK-NEXT: b.ne .LBB0_2 +; CHECK-NEXT: add x1, x1, #1 // =1 +; CHECK-NEXT: cbnz x1, .LBB0_2 ; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret @@ -104,26 +68,27 @@ define float @test2(float* nocapture readonly %arr, i64 %start, float %threshold ; CHECK-LABEL: test2: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fmov s2, #-7.00000000 -; CHECK-NEXT: cbz x1, .LBB1_4 -; CHECK-NEXT: .LBB1_1: // %for.body +; CHECK-NEXT: cbz x1, .LBB1_5 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: add x8, x0, #28 // =28 +; CHECK-NEXT: .LBB1_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x8, x0, x1, lsl #2 -; CHECK-NEXT: ldr s1, [x8, #28] +; CHECK-NEXT: ldr s1, [x8, x1, lsl #2] ; CHECK-NEXT: scvtf s3, x1 ; CHECK-NEXT: fadd s3, s3, s0 ; CHECK-NEXT: fcmp s1, s3 -; CHECK-NEXT: b.gt .LBB1_5 -; CHECK-NEXT: // %bb.2: // %for.cond -; CHECK-NEXT: // in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: b.gt .LBB1_6 +; CHECK-NEXT: // %bb.3: // %for.cond +; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: add x1, x1, #1 // =1 -; CHECK-NEXT: cbnz x1, .LBB1_1 -; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: cbnz x1, .LBB1_2 +; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: mov v0.16b, 
v2.16b ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: .LBB1_5: ; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB1_5: // %cleanup4 +; CHECK-NEXT: .LBB1_6: // %cleanup4 ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret entry: diff --git a/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll b/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll new file mode 100644 index 00000000000..21917f5959c --- /dev/null +++ b/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll @@ -0,0 +1,55 @@ +; RUN: opt < %s -loop-reduce -S | FileCheck %s + +; This test is adapted from the n-body test of the LLVM test-suite: A bug in +; r345114 caused LSR to generate incorrect code. The test verifies that the +; induction variable generated for the inner loop depends on the induction +; variable of the outer loop. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.planet.0.3.6.11.12.15.16.17.24.25.26.33.44 = type { double, double, double, double, double, double, double } + +; Function Attrs: nounwind uwtable +define dso_local void @advance(i32 %nbodies, %struct.planet.0.3.6.11.12.15.16.17.24.25.26.33.44* nocapture %bodies) local_unnamed_addr #0 { +; CHECK-LABEL: @advance( +; CHECK: for.cond.loopexit: +; CHECK: [[LSR_IV_NEXT:%.*]] = add i64 [[LSR_IV:%.*]], -1 +; CHECK: br label %for.body +; CHECK: for.body: +; CHECK: [[LSR_IV]] = phi i64 [ [[LSR_IV_NEXT]] +; CHECK: br label %for.body3 +; CHECK: for.body3: +; CHECK: [[LSR_IV1:%.*]] = phi i64 [ [[LSR_IV_NEXT2:%.*]], %for.body3 ], [ [[LSR_IV]], %for.body ] +; CHECK: [[LSR_IV_NEXT2]] = add i64 [[LSR_IV1]], -1 +; CHECK: [[EXITCOND:%.*]] = icmp eq i64 [[LSR_IV_NEXT2]], 0 +; CHECK: br i1 [[EXITCOND]], label %for.cond.loopexit, label %for.body3 +; +entry: + %wide.trip.count = zext i32 %nbodies to i64 + br label %for.body + +for.cond.loopexit: ; preds = %for.body3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.body 
+ +for.body: ; preds = %for.cond.loopexit, %entry + %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.cond.loopexit ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.body + %indvars.iv98 = phi i64 [ %indvars.iv, %for.body ], [ %indvars.iv.next99, %for.body3 ] + %z9 = getelementptr inbounds %struct.planet.0.3.6.11.12.15.16.17.24.25.26.33.44, %struct.planet.0.3.6.11.12.15.16.17.24.25.26.33.44* %bodies, i64 %indvars.iv98, i32 2 + %tmp = load double, double* %z9, align 8, !tbaa !0 + %indvars.iv.next99 = add nuw nsw i64 %indvars.iv98, 1 + %exitcond = icmp eq i64 %indvars.iv.next99, %wide.trip.count + br i1 %exitcond, label %for.cond.loopexit, label %for.body3 +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = !{!1, !2, i64 16} +!1 = !{!"planet", !2, i64 0, !2, i64 8, !2, i64 16, !2, i64 24, !2, i64 32, !2, i64 40, !2, i64 48} +!2 = !{!"double", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} -- GitLab From 5f0b3c07f195f95de69cd15219e0062a9e1159a4 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Thu, 8 Nov 2018 10:17:52 +0000 Subject: [PATCH 0002/1581] [MSP430] Fix encodeInstruction() for big endian hosts Reviewers: asl Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D54251 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346391 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp 
b/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp index ba9f7d7a9a5..adf2384f6e9 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp @@ -91,12 +91,11 @@ void MSP430MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, Offset = 2; uint64_t BinaryOpCode = getBinaryCodeForInstr(MI, Fixups, STI); - const uint16_t *Words = reinterpret_cast(&BinaryOpCode); size_t WordCount = Size / 2; - for (size_t i = 0; i < WordCount; ++i) { - uint16_t Word = Words[i]; - support::endian::write(OS, Word, support::little); + while (WordCount--) { + support::endian::write(OS, (uint16_t)BinaryOpCode, support::little); + BinaryOpCode >>= 16; } } -- GitLab From d5324f4da3e2bcbcc158033d2774baf91332d087 Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Thu, 8 Nov 2018 11:45:14 +0000 Subject: [PATCH 0003/1581] [llvm-exegesis] Add a snippet generator to generate snippets to compute ROB sizes. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346394 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-exegesis/lib/Assembler.cpp | 14 +++- tools/llvm-exegesis/lib/Assembler.h | 1 + tools/llvm-exegesis/lib/BenchmarkCode.h | 4 ++ tools/llvm-exegesis/lib/BenchmarkResult.h | 2 +- tools/llvm-exegesis/lib/BenchmarkRunner.cpp | 2 +- tools/llvm-exegesis/lib/CMakeLists.txt | 1 + tools/llvm-exegesis/lib/CodeTemplate.h | 4 ++ tools/llvm-exegesis/lib/ROBSize.cpp | 69 ++++++++++++++++++++ tools/llvm-exegesis/lib/ROBSize.h | 36 ++++++++++ tools/llvm-exegesis/lib/RegisterValue.h | 6 ++ tools/llvm-exegesis/lib/SnippetGenerator.cpp | 8 ++- tools/llvm-exegesis/lib/SnippetGenerator.h | 1 + tools/llvm-exegesis/lib/Target.cpp | 54 +++++++++------ tools/llvm-exegesis/lib/Target.h | 13 +++- tools/llvm-exegesis/lib/X86/Target.cpp | 13 ++++ tools/llvm-exegesis/llvm-exegesis.cpp | 4 +- 16 files changed, 204 insertions(+), 28 deletions(-) create mode 100644 tools/llvm-exegesis/lib/ROBSize.cpp create mode 100644 
tools/llvm-exegesis/lib/ROBSize.h diff --git a/tools/llvm-exegesis/lib/Assembler.cpp b/tools/llvm-exegesis/lib/Assembler.cpp index 2e3712ce7dc..b0758d4f8e3 100644 --- a/tools/llvm-exegesis/lib/Assembler.cpp +++ b/tools/llvm-exegesis/lib/Assembler.cpp @@ -32,10 +32,19 @@ static constexpr const char FunctionID[] = "foo"; static std::vector generateSnippetSetupCode(const ExegesisTarget &ET, const llvm::MCSubtargetInfo *const MSI, + const unsigned ScratchReg, + llvm::ArrayRef ScratchRegisterCopies, llvm::ArrayRef RegisterInitialValues, bool &IsSnippetSetupComplete) { IsSnippetSetupComplete = true; std::vector Result; + // Copy registers. + for (const unsigned Reg : ScratchRegisterCopies) { + assert(ScratchReg > 0 && "scratch reg copies but no scratch reg"); + const auto CopyRegisterCode = ET.copyReg(*MSI, Reg, ScratchReg); + Result.insert(Result.end(), CopyRegisterCode.begin(), CopyRegisterCode.end()); + } + // Load values in registers. for (const RegisterValue &RV : RegisterInitialValues) { // Load a constant in the register. 
const auto SetRegisterCode = ET.setRegTo(*MSI, RV.Register, RV.Value); @@ -155,6 +164,7 @@ llvm::BitVector getFunctionReservedRegs(const llvm::TargetMachine &TM) { void assembleToStream(const ExegesisTarget &ET, std::unique_ptr TM, llvm::ArrayRef LiveIns, + llvm::ArrayRef ScratchRegisterCopies, llvm::ArrayRef RegisterInitialValues, llvm::ArrayRef Instructions, llvm::raw_pwrite_stream &AsmStream) { @@ -178,7 +188,7 @@ void assembleToStream(const ExegesisTarget &ET, bool IsSnippetSetupComplete; std::vector Code = - generateSnippetSetupCode(ET, TM->getMCSubtargetInfo(), + generateSnippetSetupCode(ET, TM->getMCSubtargetInfo(), ET.getScratchMemoryRegister(TM->getTargetTriple()), ScratchRegisterCopies, RegisterInitialValues, IsSnippetSetupComplete); Code.insert(Code.end(), Instructions.begin(), Instructions.end()); @@ -199,7 +209,7 @@ void assembleToStream(const ExegesisTarget &ET, llvm::MCContext &MCContext = MMI->getContext(); llvm::legacy::PassManager PM; - llvm::TargetLibraryInfoImpl TLII(llvm::Triple(Module->getTargetTriple())); + llvm::TargetLibraryInfoImpl TLII(Triple(Module->getTargetTriple())); PM.add(new llvm::TargetLibraryInfoWrapperPass(TLII)); llvm::TargetPassConfig *TPC = TM->createPassConfig(PM); diff --git a/tools/llvm-exegesis/lib/Assembler.h b/tools/llvm-exegesis/lib/Assembler.h index ee6bc86f378..2626fbbe9fb 100644 --- a/tools/llvm-exegesis/lib/Assembler.h +++ b/tools/llvm-exegesis/lib/Assembler.h @@ -48,6 +48,7 @@ llvm::BitVector getFunctionReservedRegs(const llvm::TargetMachine &TM); void assembleToStream(const ExegesisTarget &ET, std::unique_ptr TM, llvm::ArrayRef LiveIns, + llvm::ArrayRef ScratchRegisterCopies, llvm::ArrayRef RegisterInitialValues, llvm::ArrayRef Instructions, llvm::raw_pwrite_stream &AsmStream); diff --git a/tools/llvm-exegesis/lib/BenchmarkCode.h b/tools/llvm-exegesis/lib/BenchmarkCode.h index 38bea2519a6..dda1b29c126 100644 --- a/tools/llvm-exegesis/lib/BenchmarkCode.h +++ b/tools/llvm-exegesis/lib/BenchmarkCode.h @@ -27,6 
+27,10 @@ struct BenchmarkCode { // registers initial values. std::vector RegisterInitialValues; + // Before the code is executed some instructions are added to copy the + // scratch register into the specified registers. + std::vector ScratchRegisterCopies; + // We also need to provide the registers that are live on entry for the // assembler to generate proper prologue/epilogue. std::vector LiveIns; diff --git a/tools/llvm-exegesis/lib/BenchmarkResult.h b/tools/llvm-exegesis/lib/BenchmarkResult.h index 773a2e50abc..6df57d21316 100644 --- a/tools/llvm-exegesis/lib/BenchmarkResult.h +++ b/tools/llvm-exegesis/lib/BenchmarkResult.h @@ -58,7 +58,7 @@ struct BenchmarkMeasure { // The result of an instruction benchmark. struct InstructionBenchmark { InstructionBenchmarkKey Key; - enum ModeE { Unknown, Latency, Uops }; + enum ModeE { Unknown, Latency, Uops, ROBSize }; ModeE Mode; std::string CpuName; std::string LLVMTriple; diff --git a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp index 437503f8486..398489e53f8 100644 --- a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -168,7 +168,7 @@ BenchmarkRunner::writeObjectFile(const BenchmarkCode &BC, return std::move(E); llvm::raw_fd_ostream OFS(ResultFD, true /*ShouldClose*/); assembleToStream(State.getExegesisTarget(), State.createTargetMachine(), - BC.LiveIns, BC.RegisterInitialValues, Code, OFS); + BC.LiveIns, BC.ScratchRegisterCopies, BC.RegisterInitialValues, Code, OFS); return ResultPath.str(); } diff --git a/tools/llvm-exegesis/lib/CMakeLists.txt b/tools/llvm-exegesis/lib/CMakeLists.txt index 8fdf8b997e0..3c1cf0b8e56 100644 --- a/tools/llvm-exegesis/lib/CMakeLists.txt +++ b/tools/llvm-exegesis/lib/CMakeLists.txt @@ -23,6 +23,7 @@ add_library(LLVMExegesis LlvmState.cpp MCInstrDescView.cpp PerfHelper.cpp + ROBSize.cpp RegisterAliasing.cpp SnippetGenerator.cpp RegisterValue.cpp diff --git a/tools/llvm-exegesis/lib/CodeTemplate.h 
b/tools/llvm-exegesis/lib/CodeTemplate.h index 4c55487f3d1..2738da67cf3 100644 --- a/tools/llvm-exegesis/lib/CodeTemplate.h +++ b/tools/llvm-exegesis/lib/CodeTemplate.h @@ -17,6 +17,7 @@ #define LLVM_TOOLS_LLVM_EXEGESIS_CODETEMPLATE_H #include "MCInstrDescView.h" +#include "RegisterValue.h" #include "llvm/ADT/BitmaskEnum.h" namespace llvm { @@ -120,6 +121,9 @@ struct CodeTemplate { std::string Info; // The list of the instructions for this template. std::vector Instructions; + // The list of registers in which to copy the scratch register as a setup + // step. + std::vector ScratchRegisterCopies; // If the template uses the provided scratch memory, the register in which // the pointer to this memory is passed in to the function. unsigned ScratchSpacePointerInReg = 0; diff --git a/tools/llvm-exegesis/lib/ROBSize.cpp b/tools/llvm-exegesis/lib/ROBSize.cpp new file mode 100644 index 00000000000..65d81bd0b71 --- /dev/null +++ b/tools/llvm-exegesis/lib/ROBSize.cpp @@ -0,0 +1,69 @@ +//===-- ROBSize.cpp ---------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "ROBSize.h" + +#include "Assembler.h" +#include "BenchmarkRunner.h" +#include "MCInstrDescView.h" +#include "Target.h" + +namespace llvm { +namespace exegesis { + +ROBSizeSnippetGenerator::~ROBSizeSnippetGenerator() = default; + +llvm::Expected> +ROBSizeSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const { + CodeTemplate CT; + // const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr; + const auto &ET = State.getExegesisTarget(); + const auto &TM = State.getTargetMachine(); + + CT.ScratchSpacePointerInReg = + ET.getScratchMemoryRegister(TM.getTargetTriple()); + if (CT.ScratchSpacePointerInReg == 0) + return llvm::make_error( + "Infeasible : target does not support memory instructions"); + // ScratchSpaceAliasedRegs = + // &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits(); + + const unsigned ECX = 50u; // FIXME: pick any available register. + const unsigned EDX = 52u; // FIXME: pick any available register. 
+ CT.ScratchRegisterCopies.push_back(ECX); + CT.ScratchRegisterCopies.push_back(EDX); + + /* + const llvm::TargetInstrInfo *const TII = + State.getSubtargetInfo().getInstrInfo(); MCInst NopInst; + TII->getNoop(NopInst); + */ + Instruction ChaseRegInst(State.getInstrInfo(), State.getRATC(), ET.getChaseRegOpcode()); + //errs() << ChaseRegInst.Variables.size() << "\n"; + assert(ChaseRegInst.Variables.size() >= 2 && "'mov reg, [reg]'' should have at least two variables"); + InstructionTemplate IT1(ChaseRegInst); + IT1.getValueFor(ChaseRegInst.Variables[0]) = MCOperand::createReg(ECX); + ET.fillMemoryOperands(IT1, ECX, 0); + CT.Instructions.push_back(std::move(IT1)); + InstructionTemplate IT2(ChaseRegInst); + IT2.getValueFor(ChaseRegInst.Variables[0]) = MCOperand::createReg(EDX); + ET.fillMemoryOperands(IT2, EDX, 0); + CT.Instructions.push_back(std::move(IT2)); + + // const auto &ReservedRegisters = State.getRATC().reservedRegisters(); + // No tied variables, we pick random values for defs. + llvm::BitVector Defs(State.getRegInfo().getNumRegs()); + CT.Info = + "instruction has no tied variables picking Uses different from defs"; + // CT.Instructions.push_back(std::move(IT)); + return getSingleton(std::move(CT)); +} + +} // namespace exegesis +} // namespace llvm diff --git a/tools/llvm-exegesis/lib/ROBSize.h b/tools/llvm-exegesis/lib/ROBSize.h new file mode 100644 index 00000000000..e02d51b3570 --- /dev/null +++ b/tools/llvm-exegesis/lib/ROBSize.h @@ -0,0 +1,36 @@ +//===-- ROBSize.h -----------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// A SnippetGenerator implementation that generates snippets to measure ROB size. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_ROBSIZE_H +#define LLVM_TOOLS_LLVM_EXEGESIS_ROBSIZE_H + +#include "BenchmarkRunner.h" +#include "SnippetGenerator.h" + +namespace llvm { +namespace exegesis { + +class ROBSizeSnippetGenerator : public SnippetGenerator { +public: + ROBSizeSnippetGenerator(const LLVMState &State) : SnippetGenerator(State) {} + ~ROBSizeSnippetGenerator() override; + + llvm::Expected> + generateCodeTemplates(const Instruction &Instr) const override; +}; + +} // namespace exegesis +} // namespace llvm + +#endif // LLVM_TOOLS_LLVM_EXEGESIS_ROBSIZE_H diff --git a/tools/llvm-exegesis/lib/RegisterValue.h b/tools/llvm-exegesis/lib/RegisterValue.h index 51ea30ac8eb..689e354e241 100644 --- a/tools/llvm-exegesis/lib/RegisterValue.h +++ b/tools/llvm-exegesis/lib/RegisterValue.h @@ -14,6 +14,9 @@ /// //===----------------------------------------------------------------------===// +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_REGISTERVALUE_H +#define LLVM_TOOLS_LLVM_EXEGESIS_REGISTERVALUE_H + #include #include @@ -22,6 +25,7 @@ namespace exegesis { // A simple object storing the value for a particular register. 
struct RegisterValue { + static RegisterValue zero(unsigned Reg) { return {Reg, llvm::APInt()}; } unsigned Register; llvm::APInt Value; }; @@ -45,3 +49,5 @@ llvm::APInt bitcastFloatValue(const llvm::fltSemantics &FltSemantics, } // namespace exegesis } // namespace llvm + +#endif // LLVM_TOOLS_LLVM_EXEGESIS_REGISTERVALUE_H diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp index eb6a8577b57..b8c56265fd3 100644 --- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp +++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp @@ -56,8 +56,9 @@ SnippetGenerator::generateConfigurations(const Instruction &Instr) const { } if (CT.ScratchSpacePointerInReg) BC.LiveIns.push_back(CT.ScratchSpacePointerInReg); + BC.ScratchRegisterCopies = CT.ScratchRegisterCopies; BC.RegisterInitialValues = - computeRegisterInitialValues(CT.Instructions); + computeRegisterInitialValues(BC.ScratchRegisterCopies, CT.Instructions); Output.push_back(std::move(BC)); } } @@ -67,12 +68,15 @@ SnippetGenerator::generateConfigurations(const Instruction &Instr) const { } std::vector SnippetGenerator::computeRegisterInitialValues( + const std::vector &ScratchRegisterCopies, const std::vector &Instructions) const { // Collect all register uses and create an assignment for each of them. // Ignore memory operands which are handled separately. // Loop invariant: DefinedRegs[i] is true iif it has been set at least once // before the current instruction. 
llvm::BitVector DefinedRegs = State.getRATC().emptyRegisters(); + for (const auto& Reg : ScratchRegisterCopies) + DefinedRegs.set(Reg); std::vector RIV; for (const InstructionTemplate &IT : Instructions) { // Returns the register that this Operand sets or uses, or 0 if this is not @@ -91,7 +95,7 @@ std::vector SnippetGenerator::computeRegisterInitialValues( if (Op.isUse()) { const unsigned Reg = GetOpReg(Op); if (Reg > 0 && !DefinedRegs.test(Reg)) { - RIV.push_back(RegisterValue{Reg, llvm::APInt()}); + RIV.push_back(RegisterValue::zero(Reg)); DefinedRegs.set(Reg); } } diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.h b/tools/llvm-exegesis/lib/SnippetGenerator.h index 967b273182b..0141d5fd4f9 100644 --- a/tools/llvm-exegesis/lib/SnippetGenerator.h +++ b/tools/llvm-exegesis/lib/SnippetGenerator.h @@ -62,6 +62,7 @@ public: // Given a snippet, computes which registers the setup code needs to define. std::vector computeRegisterInitialValues( + const std::vector &ScratchRegisterCopies, const std::vector &Snippet) const; protected: diff --git a/tools/llvm-exegesis/lib/Target.cpp b/tools/llvm-exegesis/lib/Target.cpp index 06557770418..085518c9a67 100644 --- a/tools/llvm-exegesis/lib/Target.cpp +++ b/tools/llvm-exegesis/lib/Target.cpp @@ -9,6 +9,7 @@ #include "Target.h" #include "Latency.h" +#include "ROBSize.h" #include "Uops.h" namespace llvm { @@ -37,6 +38,31 @@ void ExegesisTarget::registerTarget(ExegesisTarget *Target) { FirstTarget = Target; } +std::unique_ptr +ExegesisTarget::createLatencySnippetGenerator(const LLVMState &State) const { + return llvm::make_unique(State); +} + +std::unique_ptr +ExegesisTarget::createUopsSnippetGenerator(const LLVMState &State) const { + return llvm::make_unique(State); +} + +std::unique_ptr +static createROBSizeSnippetGenerator(const LLVMState &State) { + return llvm::make_unique(State); +} + +std::unique_ptr +ExegesisTarget::createLatencyBenchmarkRunner(const LLVMState &State) const { + return llvm::make_unique(State); +} + 
+std::unique_ptr +ExegesisTarget::createUopsBenchmarkRunner(const LLVMState &State) const { + return llvm::make_unique(State); +} + std::unique_ptr ExegesisTarget::createSnippetGenerator(InstructionBenchmark::ModeE Mode, const LLVMState &State) const { @@ -47,6 +73,8 @@ ExegesisTarget::createSnippetGenerator(InstructionBenchmark::ModeE Mode, return createLatencySnippetGenerator(State); case InstructionBenchmark::Uops: return createUopsSnippetGenerator(State); + case InstructionBenchmark::ROBSize: + return createROBSizeSnippetGenerator(State); } return nullptr; } @@ -58,6 +86,7 @@ ExegesisTarget::createBenchmarkRunner(InstructionBenchmark::ModeE Mode, case InstructionBenchmark::Unknown: return nullptr; case InstructionBenchmark::Latency: + case InstructionBenchmark::ROBSize: return createLatencyBenchmarkRunner(State); case InstructionBenchmark::Uops: return createUopsBenchmarkRunner(State); @@ -65,26 +94,6 @@ ExegesisTarget::createBenchmarkRunner(InstructionBenchmark::ModeE Mode, return nullptr; } -std::unique_ptr -ExegesisTarget::createLatencySnippetGenerator(const LLVMState &State) const { - return llvm::make_unique(State); -} - -std::unique_ptr -ExegesisTarget::createUopsSnippetGenerator(const LLVMState &State) const { - return llvm::make_unique(State); -} - -std::unique_ptr -ExegesisTarget::createLatencyBenchmarkRunner(const LLVMState &State) const { - return llvm::make_unique(State); -} - -std::unique_ptr -ExegesisTarget::createUopsBenchmarkRunner(const LLVMState &State) const { - return llvm::make_unique(State); -} - static_assert(std::is_pod::value, "We shouldn't have dynamic initialization here"); const PfmCountersInfo PfmCountersInfo::Default = {nullptr, nullptr, nullptr, 0u}; @@ -123,6 +132,11 @@ private: llvm_unreachable("Not yet implemented"); } + std::vector copyReg(const llvm::MCSubtargetInfo &STI, + unsigned ToReg, unsigned FromReg) const override { + llvm_unreachable("Not yet implemented"); + } + bool matchesArch(llvm::Triple::ArchType Arch) const 
override { llvm_unreachable("never called"); return false; diff --git a/tools/llvm-exegesis/lib/Target.h b/tools/llvm-exegesis/lib/Target.h index b0f0e996173..c4be621b291 100644 --- a/tools/llvm-exegesis/lib/Target.h +++ b/tools/llvm-exegesis/lib/Target.h @@ -76,6 +76,11 @@ public: setRegTo(const llvm::MCSubtargetInfo &STI, unsigned Reg, const llvm::APInt &Value) const = 0; + // Generates code to copy `FromReg` to `ToReg`. + // Precondition: Registers must be the same size. + virtual std::vector + copyReg(const llvm::MCSubtargetInfo &STI, unsigned ToReg, unsigned FromReg) const = 0; + // Returns the register pointing to scratch memory, or 0 if this target // does not support memory operands. The benchmark function uses the // default calling convention. @@ -83,10 +88,16 @@ public: return 0; } + // Returns the opcode to move the value at `[Reg]` into `Reg`, where `Reg` is + // from the same register class as getScratchMemoryRegister(). + virtual unsigned getChaseRegOpcode() const { + llvm_unreachable( + "fillMemoryOperands() requires getScratchMemoryRegister() > 0"); + } + // Fills memory operands with references to the address at [Reg] + Offset. virtual void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg, unsigned Offset) const { - llvm_unreachable( "fillMemoryOperands() requires getScratchMemoryRegister() > 0"); } diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp index 618e4d77db4..282fd1db154 100644 --- a/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/tools/llvm-exegesis/lib/X86/Target.cpp @@ -484,6 +484,19 @@ private: return {}; // Not yet implemented. } + std::vector copyReg(const llvm::MCSubtargetInfo &STI, + unsigned ToReg, + unsigned FromReg) const override { + if (llvm::X86::GR64RegClass.contains(ToReg)) + assert(llvm::X86::GR64RegClass.contains(FromReg) && "registers must be the same size"); + return {llvm::MCInstBuilder(X86::MOV64rr).addReg(ToReg).addReg(FromReg)}; + return {}; // Not yet implemented. 
+ } + + unsigned getChaseRegOpcode() const override { + return X86::MOV64rm; + } + std::unique_ptr createLatencySnippetGenerator(const LLVMState &State) const override { return llvm::make_unique(State); diff --git a/tools/llvm-exegesis/llvm-exegesis.cpp b/tools/llvm-exegesis/llvm-exegesis.cpp index a28e68ec006..6d15fcfef4e 100644 --- a/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/tools/llvm-exegesis/llvm-exegesis.cpp @@ -63,6 +63,8 @@ static cl::opt "latency", "Instruction Latency"), clEnumValN(exegesis::InstructionBenchmark::Uops, "uops", "Uop Decomposition"), + clEnumValN(exegesis::InstructionBenchmark::ROBSize, + "rob_size", "ROB Size"), // When not asking for a specific benchmark mode, // we'll analyse the results. clEnumValN(exegesis::InstructionBenchmark::Unknown, @@ -201,7 +203,6 @@ public: return; if (CommentText.consume_front("DEFREG")) { // LLVM-EXEGESIS-DEFREF - RegisterValue RegVal; llvm::SmallVector Parts; CommentText.split(Parts, ' ', /*unlimited splits*/ -1, /*do not keep empty strings*/ false); @@ -210,6 +211,7 @@ public: << "\n"; ++InvalidComments; } + RegisterValue RegVal; if (!(RegVal.Register = findRegisterByName(Parts[0].trim()))) { llvm::errs() << "unknown register in 'LLVM-EXEGESIS-DEFREG " << CommentText << "\n"; -- GitLab From b6454d3ba48464429c94240fe52c64244d5e0042 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Thu, 8 Nov 2018 11:51:27 +0000 Subject: [PATCH 0004/1581] Adding Yvan as release test backup for Diana Thanks for offering to help, Yvan! 
:) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346396 91177308-0d34-0410-b5e6-96231b3b80d8 --- RELEASE_TESTERS.TXT | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/RELEASE_TESTERS.TXT b/RELEASE_TESTERS.TXT index 462941b35b0..117075c7b69 100644 --- a/RELEASE_TESTERS.TXT +++ b/RELEASE_TESTERS.TXT @@ -41,7 +41,7 @@ E: hans@chromium.org T: x86 O: Windows -N: Diana Picus -E: diana.picus@linaro.org +N: Diana Picus, Yvan Roux +E: diana.picus@linaro.org, yvan.roux@linaro.org T: ARM, AArch64 O: Linux -- GitLab From 36c468a39f987255c0e08b904a2a17b695fd489f Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Thu, 8 Nov 2018 11:54:35 +0000 Subject: [PATCH 0005/1581] Return "[IndVars] Smart hard uses detection" The patch has been reverted because it ended up prohibiting propagation of a constant to exit value. For such values, we should skip all checks related to hard uses because propagating a constant is always profitable. Differential Revision: https://reviews.llvm.org/D53691 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346397 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/IndVarSimplify.cpp | 39 +++++++++----- test/Analysis/ScalarEvolution/pr28705.ll | 6 +-- .../IndVarSimplify/dont-recompute.ll | 51 +++++++++++++++++++ .../IndVarSimplify/lrev-existing-umin.ll | 38 ++++++++++++++ 4 files changed, 118 insertions(+), 16 deletions(-) diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index ec51ad71abc..48d8e457ba7 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -145,6 +145,7 @@ class IndVarSimplify { bool canLoopBeDeleted(Loop *L, SmallVector &RewritePhiSet); bool rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); bool rewriteFirstIterationLoopExitValues(Loop *L); + bool hasHardUserWithinLoop(const Loop *L, const Instruction *I) const; bool linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, 
PHINode *IndVar, SCEVExpander &Rewriter); @@ -524,6 +525,29 @@ struct RewritePhi { // As a side effect, reduces the amount of IV processing within the loop. //===----------------------------------------------------------------------===// +bool IndVarSimplify::hasHardUserWithinLoop(const Loop *L, const Instruction *I) const { + SmallPtrSet Visited; + SmallVector WorkList; + Visited.insert(I); + WorkList.push_back(I); + while (!WorkList.empty()) { + const Instruction *Curr = WorkList.pop_back_val(); + // This use is outside the loop, nothing to do. + if (!L->contains(Curr)) + continue; + // Do we assume it is a "hard" use which will not be eliminated easily? + if (Curr->mayHaveSideEffects()) + return true; + // Otherwise, add all its users to worklist. + for (auto U : Curr->users()) { + auto *UI = cast(U); + if (Visited.insert(UI).second) + WorkList.push_back(UI); + } + } + return false; +} + /// Check to see if this loop has a computable loop-invariant execution count. /// If so, this means that we can compute the final value of any expressions /// that are recurrent in the loop, and substitute the exit values from the loop @@ -598,19 +622,8 @@ bool IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // Computing the value outside of the loop brings no benefit if it is // definitely used inside the loop in a way which can not be optimized // away. 
- if (ExitValue->getSCEVType()>=scMulExpr) { - bool HasHardInternalUses = false; - for (auto *IB : Inst->users()) { - Instruction *UseInstr = cast(IB); - unsigned Opc = UseInstr->getOpcode(); - if (L->contains(UseInstr) && Opc == Instruction::Call) { - HasHardInternalUses = true; - break; - } - } - if (HasHardInternalUses) - continue; - } + if (!isa(ExitValue) && hasHardUserWithinLoop(L, Inst)) + continue; bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst); Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst); diff --git a/test/Analysis/ScalarEvolution/pr28705.ll b/test/Analysis/ScalarEvolution/pr28705.ll index 8fbc08e3ca6..9a8487a6c66 100644 --- a/test/Analysis/ScalarEvolution/pr28705.ll +++ b/test/Analysis/ScalarEvolution/pr28705.ll @@ -1,11 +1,11 @@ ; PR28705 ; RUN: opt < %s -indvars -S | FileCheck %s -; Check IndVarSimplify replaces the exitval use of the induction var "%inc.i.i" -; with "%.sroa.speculated + 1". +; Check IndVarSimplify doesn't replace external use of the induction var +; "%inc.i.i" with "%.sroa.speculated + 1" because it is not profitable. 
; ; CHECK-LABEL: @foo( -; CHECK: %[[EXIT:.+]] = sub i32 %.sroa.speculated, -1 +; CHECK: %[[EXIT:.+]] = phi i32 [ %inc.i.i, %for.body650 ] ; CHECK: %DB.sroa.9.0.lcssa = phi i32 [ 1, %entry ], [ %[[EXIT]], %loopexit ] ; define void @foo(i32 %sub.ptr.div.i, i8* %ref.i1174) local_unnamed_addr { diff --git a/test/Transforms/IndVarSimplify/dont-recompute.ll b/test/Transforms/IndVarSimplify/dont-recompute.ll index c87cd6596c6..22087710a9c 100644 --- a/test/Transforms/IndVarSimplify/dont-recompute.ll +++ b/test/Transforms/IndVarSimplify/dont-recompute.ll @@ -123,3 +123,54 @@ for.end: ; preds = %for.body tail call void @func(i32 %soft_use) ret void } + +; CHECK-LABEL: @test5( +define void @test5(i32 %m) nounwind uwtable { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %a.05 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %add = add i32 %a.05, %m + %soft_use = add i32 %add, 123 +; CHECK: tail call void @func(i32 %soft_use) + tail call void @func(i32 %soft_use) + %inc = add nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, 186 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body +; CHECK: for.end: +; CHECK-NOT: mul i32 %m, 186 +; CHECK:%add.lcssa = phi i32 [ %add, %for.body ] +; CHECK-NEXT: tail call void @func(i32 %add.lcssa) + tail call void @func(i32 %add) + ret void +} + +; CHECK-LABEL: @test6( +define void @test6(i32 %m, i32* %p) nounwind uwtable { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %a.05 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %add = add i32 %a.05, %m + %soft_use = add i32 %add, 123 +; CHECK: store i32 %soft_use, i32* %pidx + %pidx = getelementptr i32, i32* %p, i32 %add + store i32 %soft_use, i32* %pidx + %inc = add nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, 186 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body +; CHECK: for.end: +; 
CHECK-NOT: mul i32 %m, 186 +; CHECK:%add.lcssa = phi i32 [ %add, %for.body ] +; CHECK-NEXT: tail call void @func(i32 %add.lcssa) + tail call void @func(i32 %add) + ret void +} diff --git a/test/Transforms/IndVarSimplify/lrev-existing-umin.ll b/test/Transforms/IndVarSimplify/lrev-existing-umin.ll index 961c9fd944d..fff76675f17 100644 --- a/test/Transforms/IndVarSimplify/lrev-existing-umin.ll +++ b/test/Transforms/IndVarSimplify/lrev-existing-umin.ll @@ -1,5 +1,7 @@ ; RUN: opt -S -indvars < %s | FileCheck %s +; Do not rewrite the user outside the loop because we must keep the instruction +; inside the loop due to store. Rewrite doesn't give us any profit. define void @f(i32 %length.i.88, i32 %length.i, i8* %tmp12, i32 %tmp10, i8* %tmp8) { ; CHECK-LABEL: @f( not_zero11.preheader: @@ -22,6 +24,42 @@ not_zero11: %tmp23 = icmp slt i32 %tmp22, %tmp14 br i1 %tmp23, label %not_zero11, label %main.exit.selector +main.exit.selector: +; CHECK-LABEL: main.exit.selector: +; CHECK: %tmp22.lcssa = phi i32 [ %tmp22, %not_zero11 ] +; CHECK: %tmp24 = icmp slt i32 %tmp22.lcssa, %length. + %tmp24 = icmp slt i32 %tmp22, %length.i + br i1 %tmp24, label %not_zero11.postloop, label %leave + +leave: + ret void + +not_zero11.postloop: + ret void +} + +; Rewrite the user outside the loop because there is no hard users inside the loop. 
+define void @f1(i32 %length.i.88, i32 %length.i, i8* %tmp12, i32 %tmp10, i8* %tmp8) { +; CHECK-LABEL: @f1( +not_zero11.preheader: + %tmp13 = icmp ugt i32 %length.i, %length.i.88 + %tmp14 = select i1 %tmp13, i32 %length.i.88, i32 %length.i + %tmp15 = icmp sgt i32 %tmp14, 0 + br i1 %tmp15, label %not_zero11, label %not_zero11.postloop + +not_zero11: + %v_1 = phi i32 [ %tmp22, %not_zero11 ], [ 0, %not_zero11.preheader ] + %tmp16 = zext i32 %v_1 to i64 + %tmp17 = getelementptr inbounds i8, i8* %tmp8, i64 %tmp16 + %tmp18 = load i8, i8* %tmp17, align 1 + %tmp19 = zext i8 %tmp18 to i32 + %tmp20 = or i32 %tmp19, %tmp10 + %tmp21 = trunc i32 %tmp20 to i8 + %addr22 = getelementptr inbounds i8, i8* %tmp12, i64 %tmp16 + %tmp22 = add nuw nsw i32 %v_1, 1 + %tmp23 = icmp slt i32 %tmp22, %tmp14 + br i1 %tmp23, label %not_zero11, label %main.exit.selector + main.exit.selector: ; CHECK-LABEL: main.exit.selector: ; CHECK: %tmp24 = icmp slt i32 %tmp14, %length.i -- GitLab From c3e506bd699243be2b585c4844cc894a369ad4c5 Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Thu, 8 Nov 2018 12:09:45 +0000 Subject: [PATCH 0006/1581] Revert "[llvm-exegesis] Add a snippet generator to generate snippets to compute ROB sizes." This reverts accidental commit rL346394. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346398 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-exegesis/lib/Assembler.cpp | 14 +--- tools/llvm-exegesis/lib/Assembler.h | 1 - tools/llvm-exegesis/lib/BenchmarkCode.h | 4 -- tools/llvm-exegesis/lib/BenchmarkResult.h | 2 +- tools/llvm-exegesis/lib/BenchmarkRunner.cpp | 2 +- tools/llvm-exegesis/lib/CMakeLists.txt | 1 - tools/llvm-exegesis/lib/CodeTemplate.h | 4 -- tools/llvm-exegesis/lib/ROBSize.cpp | 69 -------------------- tools/llvm-exegesis/lib/ROBSize.h | 36 ---------- tools/llvm-exegesis/lib/RegisterValue.h | 6 -- tools/llvm-exegesis/lib/SnippetGenerator.cpp | 8 +-- tools/llvm-exegesis/lib/SnippetGenerator.h | 1 - tools/llvm-exegesis/lib/Target.cpp | 54 ++++++--------- tools/llvm-exegesis/lib/Target.h | 13 +--- tools/llvm-exegesis/lib/X86/Target.cpp | 13 ---- tools/llvm-exegesis/llvm-exegesis.cpp | 4 +- 16 files changed, 28 insertions(+), 204 deletions(-) delete mode 100644 tools/llvm-exegesis/lib/ROBSize.cpp delete mode 100644 tools/llvm-exegesis/lib/ROBSize.h diff --git a/tools/llvm-exegesis/lib/Assembler.cpp b/tools/llvm-exegesis/lib/Assembler.cpp index b0758d4f8e3..2e3712ce7dc 100644 --- a/tools/llvm-exegesis/lib/Assembler.cpp +++ b/tools/llvm-exegesis/lib/Assembler.cpp @@ -32,19 +32,10 @@ static constexpr const char FunctionID[] = "foo"; static std::vector generateSnippetSetupCode(const ExegesisTarget &ET, const llvm::MCSubtargetInfo *const MSI, - const unsigned ScratchReg, - llvm::ArrayRef ScratchRegisterCopies, llvm::ArrayRef RegisterInitialValues, bool &IsSnippetSetupComplete) { IsSnippetSetupComplete = true; std::vector Result; - // Copy registers. - for (const unsigned Reg : ScratchRegisterCopies) { - assert(ScratchReg > 0 && "scratch reg copies but no scratch reg"); - const auto CopyRegisterCode = ET.copyReg(*MSI, Reg, ScratchReg); - Result.insert(Result.end(), CopyRegisterCode.begin(), CopyRegisterCode.end()); - } - // Load values in registers. 
for (const RegisterValue &RV : RegisterInitialValues) { // Load a constant in the register. const auto SetRegisterCode = ET.setRegTo(*MSI, RV.Register, RV.Value); @@ -164,7 +155,6 @@ llvm::BitVector getFunctionReservedRegs(const llvm::TargetMachine &TM) { void assembleToStream(const ExegesisTarget &ET, std::unique_ptr TM, llvm::ArrayRef LiveIns, - llvm::ArrayRef ScratchRegisterCopies, llvm::ArrayRef RegisterInitialValues, llvm::ArrayRef Instructions, llvm::raw_pwrite_stream &AsmStream) { @@ -188,7 +178,7 @@ void assembleToStream(const ExegesisTarget &ET, bool IsSnippetSetupComplete; std::vector Code = - generateSnippetSetupCode(ET, TM->getMCSubtargetInfo(), ET.getScratchMemoryRegister(TM->getTargetTriple()), ScratchRegisterCopies, + generateSnippetSetupCode(ET, TM->getMCSubtargetInfo(), RegisterInitialValues, IsSnippetSetupComplete); Code.insert(Code.end(), Instructions.begin(), Instructions.end()); @@ -209,7 +199,7 @@ void assembleToStream(const ExegesisTarget &ET, llvm::MCContext &MCContext = MMI->getContext(); llvm::legacy::PassManager PM; - llvm::TargetLibraryInfoImpl TLII(Triple(Module->getTargetTriple())); + llvm::TargetLibraryInfoImpl TLII(llvm::Triple(Module->getTargetTriple())); PM.add(new llvm::TargetLibraryInfoWrapperPass(TLII)); llvm::TargetPassConfig *TPC = TM->createPassConfig(PM); diff --git a/tools/llvm-exegesis/lib/Assembler.h b/tools/llvm-exegesis/lib/Assembler.h index 2626fbbe9fb..ee6bc86f378 100644 --- a/tools/llvm-exegesis/lib/Assembler.h +++ b/tools/llvm-exegesis/lib/Assembler.h @@ -48,7 +48,6 @@ llvm::BitVector getFunctionReservedRegs(const llvm::TargetMachine &TM); void assembleToStream(const ExegesisTarget &ET, std::unique_ptr TM, llvm::ArrayRef LiveIns, - llvm::ArrayRef ScratchRegisterCopies, llvm::ArrayRef RegisterInitialValues, llvm::ArrayRef Instructions, llvm::raw_pwrite_stream &AsmStream); diff --git a/tools/llvm-exegesis/lib/BenchmarkCode.h b/tools/llvm-exegesis/lib/BenchmarkCode.h index dda1b29c126..38bea2519a6 100644 --- 
a/tools/llvm-exegesis/lib/BenchmarkCode.h +++ b/tools/llvm-exegesis/lib/BenchmarkCode.h @@ -27,10 +27,6 @@ struct BenchmarkCode { // registers initial values. std::vector RegisterInitialValues; - // Before the code is executed some instructions are added to copy the - // scratch register into the specified registers. - std::vector ScratchRegisterCopies; - // We also need to provide the registers that are live on entry for the // assembler to generate proper prologue/epilogue. std::vector LiveIns; diff --git a/tools/llvm-exegesis/lib/BenchmarkResult.h b/tools/llvm-exegesis/lib/BenchmarkResult.h index 6df57d21316..773a2e50abc 100644 --- a/tools/llvm-exegesis/lib/BenchmarkResult.h +++ b/tools/llvm-exegesis/lib/BenchmarkResult.h @@ -58,7 +58,7 @@ struct BenchmarkMeasure { // The result of an instruction benchmark. struct InstructionBenchmark { InstructionBenchmarkKey Key; - enum ModeE { Unknown, Latency, Uops, ROBSize }; + enum ModeE { Unknown, Latency, Uops }; ModeE Mode; std::string CpuName; std::string LLVMTriple; diff --git a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp index 398489e53f8..437503f8486 100644 --- a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -168,7 +168,7 @@ BenchmarkRunner::writeObjectFile(const BenchmarkCode &BC, return std::move(E); llvm::raw_fd_ostream OFS(ResultFD, true /*ShouldClose*/); assembleToStream(State.getExegesisTarget(), State.createTargetMachine(), - BC.LiveIns, BC.ScratchRegisterCopies, BC.RegisterInitialValues, Code, OFS); + BC.LiveIns, BC.RegisterInitialValues, Code, OFS); return ResultPath.str(); } diff --git a/tools/llvm-exegesis/lib/CMakeLists.txt b/tools/llvm-exegesis/lib/CMakeLists.txt index 3c1cf0b8e56..8fdf8b997e0 100644 --- a/tools/llvm-exegesis/lib/CMakeLists.txt +++ b/tools/llvm-exegesis/lib/CMakeLists.txt @@ -23,7 +23,6 @@ add_library(LLVMExegesis LlvmState.cpp MCInstrDescView.cpp PerfHelper.cpp - ROBSize.cpp 
RegisterAliasing.cpp SnippetGenerator.cpp RegisterValue.cpp diff --git a/tools/llvm-exegesis/lib/CodeTemplate.h b/tools/llvm-exegesis/lib/CodeTemplate.h index 2738da67cf3..4c55487f3d1 100644 --- a/tools/llvm-exegesis/lib/CodeTemplate.h +++ b/tools/llvm-exegesis/lib/CodeTemplate.h @@ -17,7 +17,6 @@ #define LLVM_TOOLS_LLVM_EXEGESIS_CODETEMPLATE_H #include "MCInstrDescView.h" -#include "RegisterValue.h" #include "llvm/ADT/BitmaskEnum.h" namespace llvm { @@ -121,9 +120,6 @@ struct CodeTemplate { std::string Info; // The list of the instructions for this template. std::vector Instructions; - // The list of registers in which to copy the scratch register as a setup - // step. - std::vector ScratchRegisterCopies; // If the template uses the provided scratch memory, the register in which // the pointer to this memory is passed in to the function. unsigned ScratchSpacePointerInReg = 0; diff --git a/tools/llvm-exegesis/lib/ROBSize.cpp b/tools/llvm-exegesis/lib/ROBSize.cpp deleted file mode 100644 index 65d81bd0b71..00000000000 --- a/tools/llvm-exegesis/lib/ROBSize.cpp +++ /dev/null @@ -1,69 +0,0 @@ -//===-- Uops.cpp ------------------------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "ROBSize.h" - -#include "Assembler.h" -#include "BenchmarkRunner.h" -#include "MCInstrDescView.h" -#include "Target.h" - -namespace llvm { -namespace exegesis { - -ROBSizeSnippetGenerator::~ROBSizeSnippetGenerator() = default; - -llvm::Expected> -ROBSizeSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const { - CodeTemplate CT; - // const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr; - const auto &ET = State.getExegesisTarget(); - const auto &TM = State.getTargetMachine(); - - CT.ScratchSpacePointerInReg = - ET.getScratchMemoryRegister(TM.getTargetTriple()); - if (CT.ScratchSpacePointerInReg == 0) - return llvm::make_error( - "Infeasible : target does not support memory instructions"); - // ScratchSpaceAliasedRegs = - // &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits(); - - const unsigned ECX = 50u; // FIXME: pick any available register. - const unsigned EDX = 52u; // FIXME: pick any available register. 
- CT.ScratchRegisterCopies.push_back(ECX); - CT.ScratchRegisterCopies.push_back(EDX); - - /* - const llvm::TargetInstrInfo *const TII = - State.getSubtargetInfo().getInstrInfo(); MCInst NopInst; - TII->getNoop(NopInst); - */ - Instruction ChaseRegInst(State.getInstrInfo(), State.getRATC(), ET.getChaseRegOpcode()); - //errs() << ChaseRegInst.Variables.size() << "\n"; - assert(ChaseRegInst.Variables.size() >= 2 && "'mov reg, [reg]'' should have at least two variables"); - InstructionTemplate IT1(ChaseRegInst); - IT1.getValueFor(ChaseRegInst.Variables[0]) = MCOperand::createReg(ECX); - ET.fillMemoryOperands(IT1, ECX, 0); - CT.Instructions.push_back(std::move(IT1)); - InstructionTemplate IT2(ChaseRegInst); - IT2.getValueFor(ChaseRegInst.Variables[0]) = MCOperand::createReg(EDX); - ET.fillMemoryOperands(IT2, EDX, 0); - CT.Instructions.push_back(std::move(IT2)); - - // const auto &ReservedRegisters = State.getRATC().reservedRegisters(); - // No tied variables, we pick random values for defs. - llvm::BitVector Defs(State.getRegInfo().getNumRegs()); - CT.Info = - "instruction has no tied variables picking Uses different from defs"; - // CT.Instructions.push_back(std::move(IT)); - return getSingleton(std::move(CT)); -} - -} // namespace exegesis -} // namespace llvm diff --git a/tools/llvm-exegesis/lib/ROBSize.h b/tools/llvm-exegesis/lib/ROBSize.h deleted file mode 100644 index e02d51b3570..00000000000 --- a/tools/llvm-exegesis/lib/ROBSize.h +++ /dev/null @@ -1,36 +0,0 @@ -//===-- Uops.h --------------------------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// A BenchmarkRunner implementation to measure uop decomposition. 
-/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_EXEGESIS_ROBSIZE_H -#define LLVM_TOOLS_LLVM_EXEGESIS_ROBSIZE_H - -#include "BenchmarkRunner.h" -#include "SnippetGenerator.h" - -namespace llvm { -namespace exegesis { - -class ROBSizeSnippetGenerator : public SnippetGenerator { -public: - ROBSizeSnippetGenerator(const LLVMState &State) : SnippetGenerator(State) {} - ~ROBSizeSnippetGenerator() override; - - llvm::Expected> - generateCodeTemplates(const Instruction &Instr) const override; -}; - -} // namespace exegesis -} // namespace llvm - -#endif // LLVM_TOOLS_LLVM_EXEGESIS_ROBSIZE_H diff --git a/tools/llvm-exegesis/lib/RegisterValue.h b/tools/llvm-exegesis/lib/RegisterValue.h index 689e354e241..51ea30ac8eb 100644 --- a/tools/llvm-exegesis/lib/RegisterValue.h +++ b/tools/llvm-exegesis/lib/RegisterValue.h @@ -14,9 +14,6 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_EXEGESIS_REGISTERVALUE_H -#define LLVM_TOOLS_LLVM_EXEGESIS_REGISTERVALUE_H - #include #include @@ -25,7 +22,6 @@ namespace exegesis { // A simple object storing the value for a particular register. 
struct RegisterValue { - static RegisterValue zero(unsigned Reg) { return {Reg, llvm::APInt()}; } unsigned Register; llvm::APInt Value; }; @@ -49,5 +45,3 @@ llvm::APInt bitcastFloatValue(const llvm::fltSemantics &FltSemantics, } // namespace exegesis } // namespace llvm - -#endif // LLVM_TOOLS_LLVM_EXEGESIS_REGISTERVALUE_H diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp index b8c56265fd3..eb6a8577b57 100644 --- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp +++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp @@ -56,9 +56,8 @@ SnippetGenerator::generateConfigurations(const Instruction &Instr) const { } if (CT.ScratchSpacePointerInReg) BC.LiveIns.push_back(CT.ScratchSpacePointerInReg); - BC.ScratchRegisterCopies = CT.ScratchRegisterCopies; BC.RegisterInitialValues = - computeRegisterInitialValues(BC.ScratchRegisterCopies, CT.Instructions); + computeRegisterInitialValues(CT.Instructions); Output.push_back(std::move(BC)); } } @@ -68,15 +67,12 @@ SnippetGenerator::generateConfigurations(const Instruction &Instr) const { } std::vector SnippetGenerator::computeRegisterInitialValues( - const std::vector &ScratchRegisterCopies, const std::vector &Instructions) const { // Collect all register uses and create an assignment for each of them. // Ignore memory operands which are handled separately. // Loop invariant: DefinedRegs[i] is true iif it has been set at least once // before the current instruction. 
llvm::BitVector DefinedRegs = State.getRATC().emptyRegisters(); - for (const auto& Reg : ScratchRegisterCopies) - DefinedRegs.set(Reg); std::vector RIV; for (const InstructionTemplate &IT : Instructions) { // Returns the register that this Operand sets or uses, or 0 if this is not @@ -95,7 +91,7 @@ std::vector SnippetGenerator::computeRegisterInitialValues( if (Op.isUse()) { const unsigned Reg = GetOpReg(Op); if (Reg > 0 && !DefinedRegs.test(Reg)) { - RIV.push_back(RegisterValue::zero(Reg)); + RIV.push_back(RegisterValue{Reg, llvm::APInt()}); DefinedRegs.set(Reg); } } diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.h b/tools/llvm-exegesis/lib/SnippetGenerator.h index 0141d5fd4f9..967b273182b 100644 --- a/tools/llvm-exegesis/lib/SnippetGenerator.h +++ b/tools/llvm-exegesis/lib/SnippetGenerator.h @@ -62,7 +62,6 @@ public: // Given a snippet, computes which registers the setup code needs to define. std::vector computeRegisterInitialValues( - const std::vector &ScratchRegisterCopies, const std::vector &Snippet) const; protected: diff --git a/tools/llvm-exegesis/lib/Target.cpp b/tools/llvm-exegesis/lib/Target.cpp index 085518c9a67..06557770418 100644 --- a/tools/llvm-exegesis/lib/Target.cpp +++ b/tools/llvm-exegesis/lib/Target.cpp @@ -9,7 +9,6 @@ #include "Target.h" #include "Latency.h" -#include "ROBSize.h" #include "Uops.h" namespace llvm { @@ -38,31 +37,6 @@ void ExegesisTarget::registerTarget(ExegesisTarget *Target) { FirstTarget = Target; } -std::unique_ptr -ExegesisTarget::createLatencySnippetGenerator(const LLVMState &State) const { - return llvm::make_unique(State); -} - -std::unique_ptr -ExegesisTarget::createUopsSnippetGenerator(const LLVMState &State) const { - return llvm::make_unique(State); -} - -std::unique_ptr -static createROBSizeSnippetGenerator(const LLVMState &State) { - return llvm::make_unique(State); -} - -std::unique_ptr -ExegesisTarget::createLatencyBenchmarkRunner(const LLVMState &State) const { - return llvm::make_unique(State); -} - 
-std::unique_ptr -ExegesisTarget::createUopsBenchmarkRunner(const LLVMState &State) const { - return llvm::make_unique(State); -} - std::unique_ptr ExegesisTarget::createSnippetGenerator(InstructionBenchmark::ModeE Mode, const LLVMState &State) const { @@ -73,8 +47,6 @@ ExegesisTarget::createSnippetGenerator(InstructionBenchmark::ModeE Mode, return createLatencySnippetGenerator(State); case InstructionBenchmark::Uops: return createUopsSnippetGenerator(State); - case InstructionBenchmark::ROBSize: - return createROBSizeSnippetGenerator(State); } return nullptr; } @@ -86,7 +58,6 @@ ExegesisTarget::createBenchmarkRunner(InstructionBenchmark::ModeE Mode, case InstructionBenchmark::Unknown: return nullptr; case InstructionBenchmark::Latency: - case InstructionBenchmark::ROBSize: return createLatencyBenchmarkRunner(State); case InstructionBenchmark::Uops: return createUopsBenchmarkRunner(State); @@ -94,6 +65,26 @@ ExegesisTarget::createBenchmarkRunner(InstructionBenchmark::ModeE Mode, return nullptr; } +std::unique_ptr +ExegesisTarget::createLatencySnippetGenerator(const LLVMState &State) const { + return llvm::make_unique(State); +} + +std::unique_ptr +ExegesisTarget::createUopsSnippetGenerator(const LLVMState &State) const { + return llvm::make_unique(State); +} + +std::unique_ptr +ExegesisTarget::createLatencyBenchmarkRunner(const LLVMState &State) const { + return llvm::make_unique(State); +} + +std::unique_ptr +ExegesisTarget::createUopsBenchmarkRunner(const LLVMState &State) const { + return llvm::make_unique(State); +} + static_assert(std::is_pod::value, "We shouldn't have dynamic initialization here"); const PfmCountersInfo PfmCountersInfo::Default = {nullptr, nullptr, nullptr, 0u}; @@ -132,11 +123,6 @@ private: llvm_unreachable("Not yet implemented"); } - std::vector copyReg(const llvm::MCSubtargetInfo &STI, - unsigned ToReg, unsigned FromReg) const override { - llvm_unreachable("Not yet implemented"); - } - bool matchesArch(llvm::Triple::ArchType Arch) const 
override { llvm_unreachable("never called"); return false; diff --git a/tools/llvm-exegesis/lib/Target.h b/tools/llvm-exegesis/lib/Target.h index c4be621b291..b0f0e996173 100644 --- a/tools/llvm-exegesis/lib/Target.h +++ b/tools/llvm-exegesis/lib/Target.h @@ -76,11 +76,6 @@ public: setRegTo(const llvm::MCSubtargetInfo &STI, unsigned Reg, const llvm::APInt &Value) const = 0; - // Generates code to copy `FromReg` to `ToReg`. - // Precondition: Registers must be the same size. - virtual std::vector - copyReg(const llvm::MCSubtargetInfo &STI, unsigned ToReg, unsigned FromReg) const = 0; - // Returns the register pointing to scratch memory, or 0 if this target // does not support memory operands. The benchmark function uses the // default calling convention. @@ -88,16 +83,10 @@ public: return 0; } - // Returns the opcode to move the value at `[Reg]` into `Reg`, where `Reg` is - // the from the same register class as getScratchMemoryRegister(). - virtual unsigned getChaseRegOpcode() const { - llvm_unreachable( - "fillMemoryOperands() requires getScratchMemoryRegister() > 0"); - } - // Fills memory operands with references to the address at [Reg] + Offset. virtual void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg, unsigned Offset) const { + llvm_unreachable( "fillMemoryOperands() requires getScratchMemoryRegister() > 0"); } diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp index 282fd1db154..618e4d77db4 100644 --- a/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/tools/llvm-exegesis/lib/X86/Target.cpp @@ -484,19 +484,6 @@ private: return {}; // Not yet implemented. } - std::vector copyReg(const llvm::MCSubtargetInfo &STI, - unsigned ToReg, - unsigned FromReg) const override { - if (llvm::X86::GR64RegClass.contains(ToReg)) - assert(llvm::X86::GR64RegClass.contains(FromReg) && "registers must be the same size"); - return {llvm::MCInstBuilder(X86::MOV64rr).addReg(ToReg).addReg(FromReg)}; - return {}; // Not yet implemented. 
- } - - unsigned getChaseRegOpcode() const override { - return X86::MOV64rm; - } - std::unique_ptr createLatencySnippetGenerator(const LLVMState &State) const override { return llvm::make_unique(State); diff --git a/tools/llvm-exegesis/llvm-exegesis.cpp b/tools/llvm-exegesis/llvm-exegesis.cpp index 6d15fcfef4e..a28e68ec006 100644 --- a/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/tools/llvm-exegesis/llvm-exegesis.cpp @@ -63,8 +63,6 @@ static cl::opt "latency", "Instruction Latency"), clEnumValN(exegesis::InstructionBenchmark::Uops, "uops", "Uop Decomposition"), - clEnumValN(exegesis::InstructionBenchmark::ROBSize, - "rob_size", "ROB Size"), // When not asking for a specific benchmark mode, // we'll analyse the results. clEnumValN(exegesis::InstructionBenchmark::Unknown, @@ -203,6 +201,7 @@ public: return; if (CommentText.consume_front("DEFREG")) { // LLVM-EXEGESIS-DEFREF + RegisterValue RegVal; llvm::SmallVector Parts; CommentText.split(Parts, ' ', /*unlimited splits*/ -1, /*do not keep empty strings*/ false); @@ -211,7 +210,6 @@ public: << "\n"; ++InvalidComments; } - RegisterValue RegVal; if (!(RegVal.Register = findRegisterByName(Parts[0].trim()))) { llvm::errs() << "unknown register in 'LLVM-EXEGESIS-DEFREG " << CommentText << "\n"; -- GitLab From 88a516ae3484f2673d7706b2a7f75052def307a7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Nov 2018 12:14:10 +0000 Subject: [PATCH 0007/1581] [X86][AVX] Tidyup prefixes and regenerate interleaved tests Share common AVX prefix and split off AVX2OR512 prefix instead git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346399 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/x86-interleaved-access.ll | 287 ++++++++------------- 1 file changed, 103 insertions(+), 184 deletions(-) diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll index 41d69e544aa..6d5099fa593 100644 --- a/test/CodeGen/X86/x86-interleaved-access.ll +++ 
b/test/CodeGen/X86/x86-interleaved-access.ll @@ -1,26 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2OR512,AVX2 +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2OR512,AVX512 define <4 x double> @load_factorf64_4(<16 x double>* %ptr) { -; AVX1-LABEL: load_factorf64_4: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovupd (%rdi), %ymm0 -; AVX1-NEXT: vmovupd 32(%rdi), %ymm1 -; AVX1-NEXT: vmovupd 64(%rdi), %ymm2 -; AVX1-NEXT: vmovupd 96(%rdi), %ymm3 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] -; AVX1-NEXT: vhaddpd %ymm5, %ymm4, %ymm4 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-NEXT: vaddpd %ymm2, %ymm4, %ymm2 -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: retq -; ; AVX-LABEL: load_factorf64_4: ; AVX: # %bb.0: ; AVX-NEXT: vmovupd (%rdi), %ymm0 @@ -49,21 +32,6 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) { } define <4 x double> @load_factorf64_2(<16 x double>* %ptr) { -; AVX1-LABEL: load_factorf64_2: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovupd (%rdi), %ymm0 -; AVX1-NEXT: vmovupd 32(%rdi), %ymm1 -; AVX1-NEXT: vmovupd 64(%rdi), %ymm2 -; AVX1-NEXT: 
vmovupd 96(%rdi), %ymm3 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-NEXT: vmulpd %ymm0, %ymm4, %ymm0 -; AVX1-NEXT: retq -; ; AVX-LABEL: load_factorf64_2: ; AVX: # %bb.0: ; AVX-NEXT: vmovupd (%rdi), %ymm0 @@ -86,16 +54,6 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) { } define <4 x double> @load_factorf64_1(<16 x double>* %ptr) { -; AVX1-LABEL: load_factorf64_1: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovupd (%rdi), %ymm0 -; AVX1-NEXT: vmovupd 32(%rdi), %ymm1 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0 -; AVX1-NEXT: retq -; ; AVX-LABEL: load_factorf64_1: ; AVX: # %bb.0: ; AVX-NEXT: vmovupd (%rdi), %ymm0 @@ -140,24 +98,24 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) { ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX-LABEL: load_factori64_4: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqu (%rdi), %ymm0 -; AVX-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX-NEXT: vmovdqu 64(%rdi), %ymm2 -; AVX-NEXT: vmovdqu 96(%rdi), %ymm3 -; AVX-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1] -; AVX-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] -; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX-NEXT: vpaddq %ymm3, %ymm4, %ymm3 
-; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vpaddq %ymm0, %ymm3, %ymm0 -; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0 -; AVX-NEXT: retq +; AVX2OR512-LABEL: load_factori64_4: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2OR512-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2OR512-NEXT: vmovdqu 64(%rdi), %ymm2 +; AVX2OR512-NEXT: vmovdqu 96(%rdi), %ymm3 +; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1] +; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] +; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2OR512-NEXT: vpaddq %ymm3, %ymm4, %ymm3 +; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2OR512-NEXT: vpaddq %ymm0, %ymm3, %ymm0 +; AVX2OR512-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX2OR512-NEXT: retq %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16 %strided.v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> %strided.v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> @@ -459,33 +417,33 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) { ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX-LABEL: interleaved_load_vf8_i8_stride4: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqu (%rdi), %ymm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm3 -; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0] -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7] -; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX-NEXT: vpshufb %xmm5, %xmm1, 
%xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; AVX-NEXT: vpaddw %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7] -; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7] -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1] -; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX2OR512-LABEL: interleaved_load_vf8_i8_stride4: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2OR512-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2OR512-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX2OR512-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0] +; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7] +; AVX2OR512-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2OR512-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX2OR512-NEXT: vpaddw %xmm1, %xmm4, %xmm1 +; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15] +; AVX2OR512-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2OR512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7] +; AVX2OR512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2OR512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7] +; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1] +; AVX2OR512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2OR512-NEXT: vpshufb 
%xmm4, %xmm0, %xmm0 +; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2OR512-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX2OR512-NEXT: vpmullw %xmm0, %xmm1, %xmm0 +; AVX2OR512-NEXT: vzeroupper +; AVX2OR512-NEXT: retq %wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16 %v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> %v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> @@ -981,21 +939,21 @@ define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX-LABEL: interleaved_store_vf8_i8_stride4: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm1 -; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX-NEXT: vmovdqa %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX2OR512-LABEL: interleaved_store_vf8_i8_stride4: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2OR512-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2OR512-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2OR512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2OR512-NEXT: vpshufb %xmm4, 
%xmm3, %xmm1 +; AVX2OR512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2OR512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2OR512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2OR512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2OR512-NEXT: vmovdqa %ymm0, (%rdi) +; AVX2OR512-NEXT: vzeroupper +; AVX2OR512-NEXT: retq %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> @@ -1050,29 +1008,29 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX-LABEL: interleaved_load_vf32_i8_stride3: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26] -; AVX-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] -; AVX-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] -; AVX-NEXT: vpalignr 
{{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] -; AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 -; AVX-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] -; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX2OR512-LABEL: interleaved_load_vf32_i8_stride3: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2OR512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2OR512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2OR512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 +; AVX2OR512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX2OR512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 +; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX2OR512-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2OR512-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2OR512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26] +; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] +; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] +; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] +; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm4 = 
[255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 +; AVX2OR512-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] +; AVX2OR512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2OR512-NEXT: retq %wide.vec = load <96 x i8>, <96 x i8>* %ptr %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> %v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> @@ -1083,28 +1041,6 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){ } define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){ -; AVX1-LABEL: interleaved_load_vf16_i8_stride3: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX1-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 -; 
AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; ; AVX-LABEL: interleaved_load_vf16_i8_stride3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 @@ -1154,23 +1090,23 @@ define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX-LABEL: interleaved_load_vf8_i8_stride3: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %ymm0 -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u] -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddw %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX2OR512-LABEL: interleaved_load_vf8_i8_stride3: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2OR512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u] +; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u] +; AVX2OR512-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u] +; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] +; 
AVX2OR512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u] +; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] +; AVX2OR512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2OR512-NEXT: vpaddw %xmm0, %xmm3, %xmm0 +; AVX2OR512-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; AVX2OR512-NEXT: vzeroupper +; AVX2OR512-NEXT: retq %wide.vec = load <24 x i8>, <24 x i8>* %ptr %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> %v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> @@ -1181,23 +1117,6 @@ define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){ } define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <24 x i8>* %p) { -; AVX1-LABEL: interleaved_store_vf8_i8_stride3: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero -; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, 16(%rdi) -; AVX1-NEXT: vmovdqu %xmm2, (%rdi) -; AVX1-NEXT: retq -; ; AVX-LABEL: interleaved_store_vf8_i8_stride3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -- GitLab From 16cd808d487f918e631f2beb822d1909dfdfd8a2 Mon Sep 17 
00:00:00 2001 From: Clement Courbet Date: Thu, 8 Nov 2018 12:37:56 +0000 Subject: [PATCH 0008/1581] [llvm-exegesis][NFC] Add missing header guard + cosmetics. Reviewers: gchatelet Reviewed By: gchatelet Subscribers: tschuett, llvm-commits Differential Revision: https://reviews.llvm.org/D54252 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346400 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-exegesis/lib/RegisterValue.h | 6 ++++++ tools/llvm-exegesis/lib/SnippetGenerator.cpp | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/llvm-exegesis/lib/RegisterValue.h b/tools/llvm-exegesis/lib/RegisterValue.h index 51ea30ac8eb..689e354e241 100644 --- a/tools/llvm-exegesis/lib/RegisterValue.h +++ b/tools/llvm-exegesis/lib/RegisterValue.h @@ -14,6 +14,9 @@ /// //===----------------------------------------------------------------------===// +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_REGISTERVALUE_H +#define LLVM_TOOLS_LLVM_EXEGESIS_REGISTERVALUE_H + #include #include @@ -22,6 +25,7 @@ namespace exegesis { // A simple object storing the value for a particular register. 
struct RegisterValue { + static RegisterValue zero(unsigned Reg) { return {Reg, llvm::APInt()}; } unsigned Register; llvm::APInt Value; }; @@ -45,3 +49,5 @@ llvm::APInt bitcastFloatValue(const llvm::fltSemantics &FltSemantics, } // namespace exegesis } // namespace llvm + +#endif // LLVM_TOOLS_LLVM_EXEGESIS_REGISTERVALUE_H diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp index eb6a8577b57..88ba315548d 100644 --- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp +++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp @@ -91,7 +91,7 @@ std::vector SnippetGenerator::computeRegisterInitialValues( if (Op.isUse()) { const unsigned Reg = GetOpReg(Op); if (Reg > 0 && !DefinedRegs.test(Reg)) { - RIV.push_back(RegisterValue{Reg, llvm::APInt()}); + RIV.push_back(RegisterValue::zero(Reg)); DefinedRegs.set(Reg); } } -- GitLab From 10c750cc3f45be8df97bc467af867262bbff0aeb Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Thu, 8 Nov 2018 13:02:10 +0000 Subject: [PATCH 0009/1581] [ARM] Enable spilling of the hGPR register class in Thumb2 Generalize code in Thumb2InstrInfo::storeRegToStackSlot() and loadRegFromStackSlot() to allow the GPR class or any of its sub-classes (including hGPR) to be stored/loaded by ARM::t2STRi12/ARM::t2LDRi12.
Differential Revision: https://reviews.llvm.org/D51927 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346401 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/Thumb2InstrInfo.cpp | 8 ++--- test/CodeGen/Thumb2/high-reg-spill.mir | 50 ++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 test/CodeGen/Thumb2/high-reg-spill.mir diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 1a91a703065..d567d333904 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -146,9 +146,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); - if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || - RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || - RC == &ARM::GPRnopcRegClass) { + if (ARM::GPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::t2STRi12)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI) @@ -190,9 +188,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || - RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || - RC == &ARM::GPRnopcRegClass) { + if (ARM::GPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg) .addFrameIndex(FI) .addImm(0) diff --git a/test/CodeGen/Thumb2/high-reg-spill.mir b/test/CodeGen/Thumb2/high-reg-spill.mir new file mode 100644 index 00000000000..d9bfdcafa38 --- /dev/null +++ b/test/CodeGen/Thumb2/high-reg-spill.mir @@ -0,0 +1,50 @@ +# RUN: llc -run-pass regallocfast %s -o - | FileCheck %s + +# This test examines register allocation and spilling with Fast Register +# Allocator. 
The test uses inline assembler that requests an input variable to +# be loaded in a high register but at the same time has r12 marked as clobbered. +# The allocator initially satisfies the load request by selecting r12 but then +# needs to spill this register when it reaches the INLINEASM instruction and +# notices its clobber definition. +# +# The test checks that the compiler is able to spill a register from the hGPR +# class in Thumb2 by inserting the t2STRi12/t2LDRi12 instructions. + +--- | + ; ModuleID = 'test.ll' + source_filename = "test.c" + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv7m-none-unknown-eabi" + + define dso_local void @constraint_h() { + entry: + %i = alloca i32, align 4 + %0 = load i32, i32* %i, align 4 + call void asm sideeffect "@ $0", "h,~{r12}"(i32 %0) + ret void + } + +... +--- +name: constraint_h +tracksRegLiveness: true +registers: + - { id: 0, class: hgpr } + - { id: 1, class: tgpr } +stack: + - { id: 0, name: i, size: 4, alignment: 4, stack-id: 0, local-offset: -4 } +body: | + bb.0.entry: + %1:tgpr = tLDRspi %stack.0.i, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i) + %0:hgpr = COPY %1 + INLINEASM &"@ $0", 1, 589833, %0, 12, implicit-def early-clobber $r12 + tBX_RET 14, $noreg + +... 
+# CHECK: bb.0.entry: +# CHECK-NEXT: renamable $r0 = tLDRspi %stack.0.i, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i) +# CHECK-NEXT: renamable $r12 = COPY killed renamable $r0 +# CHECK-NEXT: t2STRi12 killed $r12, %stack.1, 0, 14, $noreg :: (store 4 into %stack.1) +# CHECK-NEXT: $r8 = t2LDRi12 %stack.1, 0, 14, $noreg :: (load 4 from %stack.1) +# CHECK-NEXT: INLINEASM &"@ $0", 1, 589833, killed renamable $r8, 12, implicit-def early-clobber $r12 +# CHECK-NEXT: tBX_RET 14, $noreg -- GitLab From 1f494dbf92fe425adbabfa20b505b626f0235336 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Nov 2018 14:07:17 +0000 Subject: [PATCH 0010/1581] [X86][SSE] Add PR39387 shuffle test case git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346402 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/vector-shuffle-128-v16.ll | 66 ++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index bf34c0332dd..e9ccd7177cf 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -579,6 +579,72 @@ define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15( ret <16 x i8> %shuffle } +; PR39387 +define <16 x i8> @shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,255] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,0] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,5,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,0,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; SSE2-NEXT: packuswb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[11,12,13,14,14],zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm0[1,1,2,3,4] +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[11,12,13,14,14],zero,zero,zero,zero,zero +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm0[1,1,2,3,4] +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1OR2-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[11,12,13,14,14],zero,zero,zero,zero,zero +; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm0[1,1,2,3,4] +; AVX1OR2-NEXT: vpor %xmm1, 
%xmm0, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[11,12,13,14,14],zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm0[1,1,2,3,4] +; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [5,6,7,8,9,10,27,28,29,30,30,1,1,2,3,4] +; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 +; AVX512VLVBMI-NEXT: retq + %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %1 +} + ; PR27780 - https://bugs.llvm.org/show_bug.cgi?id=27780 define <16 x i8> @load_fold_pblendvb(<16 x i8>* %px, <16 x i8> %y) { -- GitLab From 65448894be1563b5a49c6b61ceaf6976d4e5c999 Mon Sep 17 00:00:00 2001 From: Alexandre Ganea Date: Thu, 8 Nov 2018 14:42:37 +0000 Subject: [PATCH 0011/1581] [LLD] Fix Microsoft precompiled headers cross-compile on Linux Differential revision: https://reviews.llvm.org/D54122 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346403 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/DebugInfo/PDB/GenericError.h | 1 + lib/DebugInfo/PDB/GenericError.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/llvm/DebugInfo/PDB/GenericError.h b/include/llvm/DebugInfo/PDB/GenericError.h index 7b5a8529596..997f13f5f30 100644 --- a/include/llvm/DebugInfo/PDB/GenericError.h +++ b/include/llvm/DebugInfo/PDB/GenericError.h @@ -21,6 +21,7 @@ enum class pdb_error_code { dia_sdk_not_present, dia_failed_loading, signature_out_of_date, + external_cmdline_ref, unspecified, }; } // namespace pdb diff --git a/lib/DebugInfo/PDB/GenericError.cpp b/lib/DebugInfo/PDB/GenericError.cpp index 5f5ff69fe3f..256952073e8 100644 --- a/lib/DebugInfo/PDB/GenericError.cpp +++ 
b/lib/DebugInfo/PDB/GenericError.cpp @@ -34,6 +34,8 @@ public: return "The PDB file path is an invalid UTF8 sequence."; case pdb_error_code::signature_out_of_date: return "The signature does not match; the file(s) might be out of date."; + case pdb_error_code::external_cmdline_ref: + return "The path to this file must be provided on the command-line."; } llvm_unreachable("Unrecognized generic_error_code"); } -- GitLab From 00ded67a12534e948fa334163921a229eb42df48 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Thu, 8 Nov 2018 14:48:56 +0000 Subject: [PATCH 0012/1581] [NFC][BdVer2] Tests for load and store throughput (PR39465) During review it was noted that while it appears that the Piledriver can do two [consecutive] loads per cycle, it can only do one store per cycle. It was suggested that the sched model incorrectly models that, but it was opted to fix this afterwards. These tests show that the two consecutive loads are modelled correctly, and one consecutive store is not modelled incorrectly. Unless I'm missing the point.
https://bugs.llvm.org/show_bug.cgi?id=39465 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346404 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../llvm-mca/X86/BdVer2/load-throughput.s | 604 +++++++++++++++++ .../llvm-mca/X86/BdVer2/store-throughput.s | 605 ++++++++++++++++++ 2 files changed, 1209 insertions(+) create mode 100644 test/tools/llvm-mca/X86/BdVer2/load-throughput.s create mode 100644 test/tools/llvm-mca/X86/BdVer2/store-throughput.s diff --git a/test/tools/llvm-mca/X86/BdVer2/load-throughput.s b/test/tools/llvm-mca/X86/BdVer2/load-throughput.s new file mode 100644 index 00000000000..d8083d49874 --- /dev/null +++ b/test/tools/llvm-mca/X86/BdVer2/load-throughput.s @@ -0,0 +1,604 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=100 -timeline -timeline-max-iterations=1 < %s | FileCheck %s + +# LLVM-MCA-BEGIN +movb (%rax), %spl +movb (%rcx), %bpl +movb (%rdx), %sil +movb (%rbx), %dil +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +movw (%rax), %sp +movw (%rcx), %bp +movw (%rdx), %si +movw (%rbx), %di +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +movl (%rax), %esp +movl (%rcx), %ebp +movl (%rdx), %esi +movl (%rbx), %edi +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +movq (%rax), %rsp +movq (%rcx), %rbp +movq (%rdx), %rsi +movq (%rbx), %rdi +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +movd (%rax), %mm0 +movd (%rcx), %mm1 +movd (%rdx), %mm2 +movd (%rbx), %mm3 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +movaps (%rax), %xmm0 +movaps (%rcx), %xmm1 +movaps (%rdx), %xmm2 +movaps (%rbx), %xmm3 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +vmovaps (%rax), %ymm0 +vmovaps (%rcx), %ymm1 +vmovaps (%rdx), %ymm2 +vmovaps (%rbx), %ymm3 +# LLVM-MCA-END + +# CHECK: [0] Code Region + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 207 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 1.93 +# CHECK-NEXT: IPC: 1.93 +# CHECK-NEXT: Block 
RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 5 0.50 * movb (%rax), %spl +# CHECK-NEXT: 1 5 0.50 * movb (%rcx), %bpl +# CHECK-NEXT: 1 5 0.50 * movb (%rdx), %sil +# CHECK-NEXT: 1 5 0.50 * movb (%rbx), %dil + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - PdAGLU01 +# CHECK-NEXT: [0.1] - PdAGLU01 +# CHECK-NEXT: [1] - PdBranch +# CHECK-NEXT: [2] - PdCount +# CHECK-NEXT: [3] - PdDiv +# CHECK-NEXT: [4] - PdEX0 +# CHECK-NEXT: [5] - PdEX1 +# CHECK-NEXT: [6] - PdFPCVT +# CHECK-NEXT: [7.0] - PdFPFMA +# CHECK-NEXT: [7.1] - PdFPFMA +# CHECK-NEXT: [8.0] - PdFPMAL +# CHECK-NEXT: [8.1] - PdFPMAL +# CHECK-NEXT: [9] - PdFPMMA +# CHECK-NEXT: [10] - PdFPSTO +# CHECK-NEXT: [11] - PdFPU0 +# CHECK-NEXT: [12] - PdFPU1 +# CHECK-NEXT: [13] - PdFPU2 +# CHECK-NEXT: [14] - PdFPU3 +# CHECK-NEXT: [15] - PdFPXBR +# CHECK-NEXT: [16] - PdMul + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movb (%rax), %spl +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - - - - movb (%rcx), %bpl +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - - - - movb (%rdx), %sil +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movb (%rbx), %dil + +# CHECK: Timeline view: +# CHECK-NEXT: Index 012345678 + +# CHECK: [0,0] DeeeeeER. movb (%rax), %spl +# CHECK-NEXT: [0,1] DeeeeeER. 
movb (%rcx), %bpl +# CHECK-NEXT: [0,2] D=eeeeeER movb (%rdx), %sil +# CHECK-NEXT: [0,3] D=eeeeeER movb (%rbx), %dil + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 movb (%rax), %spl +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movb (%rcx), %bpl +# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movb (%rdx), %sil +# CHECK-NEXT: 3. 1 2.0 2.0 0.0 movb (%rbx), %dil + +# CHECK: [1] Code Region + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 207 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 1.93 +# CHECK-NEXT: IPC: 1.93 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 5 0.50 * movw (%rax), %sp +# CHECK-NEXT: 1 5 0.50 * movw (%rcx), %bp +# CHECK-NEXT: 1 5 0.50 * movw (%rdx), %si +# CHECK-NEXT: 1 5 0.50 * movw (%rbx), %di + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - PdAGLU01 +# CHECK-NEXT: [0.1] - PdAGLU01 +# CHECK-NEXT: [1] - PdBranch +# CHECK-NEXT: [2] - PdCount +# CHECK-NEXT: [3] - PdDiv +# CHECK-NEXT: [4] - PdEX0 +# CHECK-NEXT: [5] - PdEX1 +# CHECK-NEXT: [6] - PdFPCVT +# CHECK-NEXT: [7.0] - PdFPFMA +# CHECK-NEXT: [7.1] - PdFPFMA +# CHECK-NEXT: [8.0] - PdFPMAL +# CHECK-NEXT: [8.1] - PdFPMAL +# CHECK-NEXT: [9] - PdFPMMA +# CHECK-NEXT: [10] - PdFPSTO +# CHECK-NEXT: [11] - PdFPU0 +# CHECK-NEXT: [12] - PdFPU1 +# CHECK-NEXT: [13] - PdFPU2 +# CHECK-NEXT: [14] - PdFPU3 +# CHECK-NEXT: [15] - PdFPXBR +# CHECK-NEXT: [16] - PdMul + +# CHECK: 
Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movw (%rax), %sp +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - - - - movw (%rcx), %bp +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - - - - movw (%rdx), %si +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movw (%rbx), %di + +# CHECK: Timeline view: +# CHECK-NEXT: Index 012345678 + +# CHECK: [0,0] DeeeeeER. movw (%rax), %sp +# CHECK-NEXT: [0,1] DeeeeeER. movw (%rcx), %bp +# CHECK-NEXT: [0,2] D=eeeeeER movw (%rdx), %si +# CHECK-NEXT: [0,3] D=eeeeeER movw (%rbx), %di + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 movw (%rax), %sp +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movw (%rcx), %bp +# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movw (%rdx), %si +# CHECK-NEXT: 3. 
1 2.0 2.0 0.0 movw (%rbx), %di + +# CHECK: [2] Code Region + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 207 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 1.93 +# CHECK-NEXT: IPC: 1.93 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 5 0.50 * movl (%rax), %esp +# CHECK-NEXT: 1 5 0.50 * movl (%rcx), %ebp +# CHECK-NEXT: 1 5 0.50 * movl (%rdx), %esi +# CHECK-NEXT: 1 5 0.50 * movl (%rbx), %edi + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - PdAGLU01 +# CHECK-NEXT: [0.1] - PdAGLU01 +# CHECK-NEXT: [1] - PdBranch +# CHECK-NEXT: [2] - PdCount +# CHECK-NEXT: [3] - PdDiv +# CHECK-NEXT: [4] - PdEX0 +# CHECK-NEXT: [5] - PdEX1 +# CHECK-NEXT: [6] - PdFPCVT +# CHECK-NEXT: [7.0] - PdFPFMA +# CHECK-NEXT: [7.1] - PdFPFMA +# CHECK-NEXT: [8.0] - PdFPMAL +# CHECK-NEXT: [8.1] - PdFPMAL +# CHECK-NEXT: [9] - PdFPMMA +# CHECK-NEXT: [10] - PdFPSTO +# CHECK-NEXT: [11] - PdFPU0 +# CHECK-NEXT: [12] - PdFPU1 +# CHECK-NEXT: [13] - PdFPU2 +# CHECK-NEXT: [14] - PdFPU3 +# CHECK-NEXT: [15] - PdFPXBR +# CHECK-NEXT: [16] - PdMul + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movl (%rax), %esp +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - - - - movl (%rcx), %ebp +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - - - - movl (%rdx), %esi +# 
CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movl (%rbx), %edi + +# CHECK: Timeline view: +# CHECK-NEXT: Index 012345678 + +# CHECK: [0,0] DeeeeeER. movl (%rax), %esp +# CHECK-NEXT: [0,1] DeeeeeER. movl (%rcx), %ebp +# CHECK-NEXT: [0,2] D=eeeeeER movl (%rdx), %esi +# CHECK-NEXT: [0,3] D=eeeeeER movl (%rbx), %edi + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 movl (%rax), %esp +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movl (%rcx), %ebp +# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movl (%rdx), %esi +# CHECK-NEXT: 3. 1 2.0 2.0 0.0 movl (%rbx), %edi + +# CHECK: [3] Code Region + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 207 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 1.93 +# CHECK-NEXT: IPC: 1.93 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 5 0.50 * movq (%rax), %rsp +# CHECK-NEXT: 1 5 0.50 * movq (%rcx), %rbp +# CHECK-NEXT: 1 5 0.50 * movq (%rdx), %rsi +# CHECK-NEXT: 1 5 0.50 * movq (%rbx), %rdi + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - PdAGLU01 +# CHECK-NEXT: [0.1] - PdAGLU01 +# CHECK-NEXT: [1] - PdBranch +# CHECK-NEXT: [2] - PdCount +# CHECK-NEXT: [3] - PdDiv +# CHECK-NEXT: [4] - PdEX0 +# CHECK-NEXT: [5] - PdEX1 +# CHECK-NEXT: [6] - PdFPCVT +# CHECK-NEXT: [7.0] - PdFPFMA +# CHECK-NEXT: [7.1] - PdFPFMA +# CHECK-NEXT: [8.0] - PdFPMAL +# CHECK-NEXT: [8.1] - PdFPMAL +# CHECK-NEXT: [9] - PdFPMMA +# 
CHECK-NEXT: [10] - PdFPSTO +# CHECK-NEXT: [11] - PdFPU0 +# CHECK-NEXT: [12] - PdFPU1 +# CHECK-NEXT: [13] - PdFPU2 +# CHECK-NEXT: [14] - PdFPU3 +# CHECK-NEXT: [15] - PdFPXBR +# CHECK-NEXT: [16] - PdMul + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movq (%rax), %rsp +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - - - - movq (%rcx), %rbp +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - - - - movq (%rdx), %rsi +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movq (%rbx), %rdi + +# CHECK: Timeline view: +# CHECK-NEXT: Index 012345678 + +# CHECK: [0,0] DeeeeeER. movq (%rax), %rsp +# CHECK-NEXT: [0,1] DeeeeeER. movq (%rcx), %rbp +# CHECK-NEXT: [0,2] D=eeeeeER movq (%rdx), %rsi +# CHECK-NEXT: [0,3] D=eeeeeER movq (%rbx), %rdi + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 movq (%rax), %rsp +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movq (%rcx), %rbp +# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movq (%rdx), %rsi +# CHECK-NEXT: 3. 
1 2.0 2.0 0.0 movq (%rbx), %rdi + +# CHECK: [4] Code Region + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 207 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 1.93 +# CHECK-NEXT: IPC: 1.93 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 5 0.50 * movd (%rax), %mm0 +# CHECK-NEXT: 1 5 0.50 * movd (%rcx), %mm1 +# CHECK-NEXT: 1 5 0.50 * movd (%rdx), %mm2 +# CHECK-NEXT: 1 5 0.50 * movd (%rbx), %mm3 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - PdAGLU01 +# CHECK-NEXT: [0.1] - PdAGLU01 +# CHECK-NEXT: [1] - PdBranch +# CHECK-NEXT: [2] - PdCount +# CHECK-NEXT: [3] - PdDiv +# CHECK-NEXT: [4] - PdEX0 +# CHECK-NEXT: [5] - PdEX1 +# CHECK-NEXT: [6] - PdFPCVT +# CHECK-NEXT: [7.0] - PdFPFMA +# CHECK-NEXT: [7.1] - PdFPFMA +# CHECK-NEXT: [8.0] - PdFPMAL +# CHECK-NEXT: [8.1] - PdFPMAL +# CHECK-NEXT: [9] - PdFPMMA +# CHECK-NEXT: [10] - PdFPSTO +# CHECK-NEXT: [11] - PdFPU0 +# CHECK-NEXT: [12] - PdFPU1 +# CHECK-NEXT: [13] - PdFPU2 +# CHECK-NEXT: [14] - PdFPU3 +# CHECK-NEXT: [15] - PdFPXBR +# CHECK-NEXT: [16] - PdMul + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] +# CHECK-NEXT: 2.00 2.00 - - - - - - - - 2.00 2.00 - - 2.00 2.00 - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - 1.00 - - - 1.00 - - - - movd (%rax), %mm0 +# CHECK-NEXT: 1.00 - - - - - - - - - 1.00 - - - 1.00 - - - - - movd (%rcx), %mm1 +# CHECK-NEXT: 1.00 - - - - - - - - - 1.00 - - - - 1.00 - - - - 
movd (%rdx), %mm2 +# CHECK-NEXT: - 1.00 - - - - - - - - - 1.00 - - 1.00 - - - - - movd (%rbx), %mm3 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 012345678 + +# CHECK: [0,0] DeeeeeER. movd (%rax), %mm0 +# CHECK-NEXT: [0,1] DeeeeeER. movd (%rcx), %mm1 +# CHECK-NEXT: [0,2] D=eeeeeER movd (%rdx), %mm2 +# CHECK-NEXT: [0,3] D=eeeeeER movd (%rbx), %mm3 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 movd (%rax), %mm0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movd (%rcx), %mm1 +# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movd (%rdx), %mm2 +# CHECK-NEXT: 3. 1 2.0 2.0 0.0 movd (%rbx), %mm3 + +# CHECK: [5] Code Region + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 207 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 1.93 +# CHECK-NEXT: IPC: 1.93 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 5 0.50 * movaps (%rax), %xmm0 +# CHECK-NEXT: 1 5 0.50 * movaps (%rcx), %xmm1 +# CHECK-NEXT: 1 5 0.50 * movaps (%rdx), %xmm2 +# CHECK-NEXT: 1 5 0.50 * movaps (%rbx), %xmm3 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - PdAGLU01 +# CHECK-NEXT: [0.1] - PdAGLU01 +# CHECK-NEXT: [1] - PdBranch +# CHECK-NEXT: [2] - PdCount +# CHECK-NEXT: [3] - PdDiv +# CHECK-NEXT: [4] - PdEX0 +# CHECK-NEXT: [5] - PdEX1 +# CHECK-NEXT: [6] - PdFPCVT +# CHECK-NEXT: [7.0] - PdFPFMA +# CHECK-NEXT: [7.1] - PdFPFMA +# CHECK-NEXT: [8.0] - PdFPMAL +# CHECK-NEXT: [8.1] - 
PdFPMAL +# CHECK-NEXT: [9] - PdFPMMA +# CHECK-NEXT: [10] - PdFPSTO +# CHECK-NEXT: [11] - PdFPU0 +# CHECK-NEXT: [12] - PdFPU1 +# CHECK-NEXT: [13] - PdFPU2 +# CHECK-NEXT: [14] - PdFPU3 +# CHECK-NEXT: [15] - PdFPXBR +# CHECK-NEXT: [16] - PdMul + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] +# CHECK-NEXT: 2.00 2.00 - - - - - - 2.00 2.00 - - - - 2.00 2.00 - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - 1.00 - - - - - 1.00 - - - - movaps (%rax), %xmm0 +# CHECK-NEXT: 1.00 - - - - - - - 1.00 - - - - - 1.00 - - - - - movaps (%rcx), %xmm1 +# CHECK-NEXT: 1.00 - - - - - - - 1.00 - - - - - - 1.00 - - - - movaps (%rdx), %xmm2 +# CHECK-NEXT: - 1.00 - - - - - - - 1.00 - - - - 1.00 - - - - - movaps (%rbx), %xmm3 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 012345678 + +# CHECK: [0,0] DeeeeeER. movaps (%rax), %xmm0 +# CHECK-NEXT: [0,1] DeeeeeER. movaps (%rcx), %xmm1 +# CHECK-NEXT: [0,2] D=eeeeeER movaps (%rdx), %xmm2 +# CHECK-NEXT: [0,3] D=eeeeeER movaps (%rbx), %xmm3 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 movaps (%rax), %xmm0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movaps (%rcx), %xmm1 +# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movaps (%rdx), %xmm2 +# CHECK-NEXT: 3. 
1 2.0 2.0 0.0 movaps (%rbx), %xmm3 + +# CHECK: [6] Code Region + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 207 +# CHECK-NEXT: Total uOps: 800 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 3.86 +# CHECK-NEXT: IPC: 1.93 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 2 5 0.50 * vmovaps (%rax), %ymm0 +# CHECK-NEXT: 2 5 0.50 * vmovaps (%rcx), %ymm1 +# CHECK-NEXT: 2 5 0.50 * vmovaps (%rdx), %ymm2 +# CHECK-NEXT: 2 5 0.50 * vmovaps (%rbx), %ymm3 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - PdAGLU01 +# CHECK-NEXT: [0.1] - PdAGLU01 +# CHECK-NEXT: [1] - PdBranch +# CHECK-NEXT: [2] - PdCount +# CHECK-NEXT: [3] - PdDiv +# CHECK-NEXT: [4] - PdEX0 +# CHECK-NEXT: [5] - PdEX1 +# CHECK-NEXT: [6] - PdFPCVT +# CHECK-NEXT: [7.0] - PdFPFMA +# CHECK-NEXT: [7.1] - PdFPFMA +# CHECK-NEXT: [8.0] - PdFPMAL +# CHECK-NEXT: [8.1] - PdFPMAL +# CHECK-NEXT: [9] - PdFPMMA +# CHECK-NEXT: [10] - PdFPSTO +# CHECK-NEXT: [11] - PdFPU0 +# CHECK-NEXT: [12] - PdFPU1 +# CHECK-NEXT: [13] - PdFPU2 +# CHECK-NEXT: [14] - PdFPU3 +# CHECK-NEXT: [15] - PdFPXBR +# CHECK-NEXT: [16] - PdMul + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] +# CHECK-NEXT: 2.00 2.00 - - - - - - 2.00 2.00 - - - - 2.00 2.00 - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - 1.00 - - - - - 1.00 - - - - vmovaps (%rax), %ymm0 +# CHECK-NEXT: 1.00 - - - - - - - 1.00 - - - - - 1.00 - - - - - vmovaps (%rcx), %ymm1 +# CHECK-NEXT: 1.00 - - - - - - - 
1.00 - - - - - - 1.00 - - - - vmovaps (%rdx), %ymm2 +# CHECK-NEXT: - 1.00 - - - - - - - 1.00 - - - - 1.00 - - - - - vmovaps (%rbx), %ymm3 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 012345678 + +# CHECK: [0,0] DeeeeeER. vmovaps (%rax), %ymm0 +# CHECK-NEXT: [0,1] DeeeeeER. vmovaps (%rcx), %ymm1 +# CHECK-NEXT: [0,2] .DeeeeeER vmovaps (%rdx), %ymm2 +# CHECK-NEXT: [0,3] .DeeeeeER vmovaps (%rbx), %ymm3 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps (%rax), %ymm0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 vmovaps (%rcx), %ymm1 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 vmovaps (%rdx), %ymm2 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 vmovaps (%rbx), %ymm3 diff --git a/test/tools/llvm-mca/X86/BdVer2/store-throughput.s b/test/tools/llvm-mca/X86/BdVer2/store-throughput.s new file mode 100644 index 00000000000..43c2d1f7e64 --- /dev/null +++ b/test/tools/llvm-mca/X86/BdVer2/store-throughput.s @@ -0,0 +1,605 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=100 -timeline -timeline-max-iterations=1 < %s | FileCheck %s + +# LLVM-MCA-BEGIN +movb %spl, (%rax) +movb %bpl, (%rcx) +movb %sil, (%rdx) +movb %dil, (%rbx) +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +movw %sp, (%rax) +movw %bp, (%rcx) +movw %si, (%rdx) +movw %di, (%rbx) +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +movl %esp, (%rax) +movl %ebp, (%rcx) +movl %esi, (%rdx) +movl %edi, (%rbx) +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +movq %rsp, (%rax) +movq %rbp, (%rcx) +movq %rsi, (%rdx) +movq %rdi, (%rbx) +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +movd %mm0, (%rax) +movd %mm1, (%rcx) +movd %mm2, (%rdx) +movd %mm3, (%rbx) +# LLVM-MCA-END + +# 
LLVM-MCA-BEGIN +movaps %xmm0, (%rax) +movaps %xmm1, (%rcx) +movaps %xmm2, (%rdx) +movaps %xmm3, (%rbx) +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +vmovaps %ymm0, (%rax) +vmovaps %ymm1, (%rcx) +vmovaps %ymm2, (%rdx) +vmovaps %ymm3, (%rbx) +# LLVM-MCA-END + +# CHECK: [0] Code Region + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 403 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 0.99 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.50 * movb %spl, (%rax) +# CHECK-NEXT: 1 1 0.50 * movb %bpl, (%rcx) +# CHECK-NEXT: 1 1 0.50 * movb %sil, (%rdx) +# CHECK-NEXT: 1 1 0.50 * movb %dil, (%rbx) + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - PdAGLU01 +# CHECK-NEXT: [0.1] - PdAGLU01 +# CHECK-NEXT: [1] - PdBranch +# CHECK-NEXT: [2] - PdCount +# CHECK-NEXT: [3] - PdDiv +# CHECK-NEXT: [4] - PdEX0 +# CHECK-NEXT: [5] - PdEX1 +# CHECK-NEXT: [6] - PdFPCVT +# CHECK-NEXT: [7.0] - PdFPFMA +# CHECK-NEXT: [7.1] - PdFPFMA +# CHECK-NEXT: [8.0] - PdFPMAL +# CHECK-NEXT: [8.1] - PdFPMAL +# CHECK-NEXT: [9] - PdFPMMA +# CHECK-NEXT: [10] - PdFPSTO +# CHECK-NEXT: [11] - PdFPU0 +# CHECK-NEXT: [12] - PdFPU1 +# CHECK-NEXT: [13] - PdFPU2 +# CHECK-NEXT: [14] - PdFPU3 +# CHECK-NEXT: [15] - PdFPXBR +# CHECK-NEXT: [16] - PdMul + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] +# CHECK-NEXT: - 4.00 - - - - - - - - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] Instructions: +# CHECK-NEXT: - 1.00 - - - 
- - - - - - - - - - - - - - - movb %spl, (%rax) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movb %bpl, (%rcx) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movb %sil, (%rdx) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movb %dil, (%rbx) + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456 + +# CHECK: [0,0] DeER .. movb %spl, (%rax) +# CHECK-NEXT: [0,1] D=eER.. movb %bpl, (%rcx) +# CHECK-NEXT: [0,2] D==eER. movb %sil, (%rdx) +# CHECK-NEXT: [0,3] D===eER movb %dil, (%rbx) + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 movb %spl, (%rax) +# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movb %bpl, (%rcx) +# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movb %sil, (%rdx) +# CHECK-NEXT: 3. 
1 4.0 0.0 0.0 movb %dil, (%rbx) + +# CHECK: [1] Code Region + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 403 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 0.99 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.50 * movw %sp, (%rax) +# CHECK-NEXT: 1 1 0.50 * movw %bp, (%rcx) +# CHECK-NEXT: 1 1 0.50 * movw %si, (%rdx) +# CHECK-NEXT: 1 1 0.50 * movw %di, (%rbx) + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - PdAGLU01 +# CHECK-NEXT: [0.1] - PdAGLU01 +# CHECK-NEXT: [1] - PdBranch +# CHECK-NEXT: [2] - PdCount +# CHECK-NEXT: [3] - PdDiv +# CHECK-NEXT: [4] - PdEX0 +# CHECK-NEXT: [5] - PdEX1 +# CHECK-NEXT: [6] - PdFPCVT +# CHECK-NEXT: [7.0] - PdFPFMA +# CHECK-NEXT: [7.1] - PdFPFMA +# CHECK-NEXT: [8.0] - PdFPMAL +# CHECK-NEXT: [8.1] - PdFPMAL +# CHECK-NEXT: [9] - PdFPMMA +# CHECK-NEXT: [10] - PdFPSTO +# CHECK-NEXT: [11] - PdFPU0 +# CHECK-NEXT: [12] - PdFPU1 +# CHECK-NEXT: [13] - PdFPU2 +# CHECK-NEXT: [14] - PdFPU3 +# CHECK-NEXT: [15] - PdFPXBR +# CHECK-NEXT: [16] - PdMul + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] +# CHECK-NEXT: - 4.00 - - - - - - - - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movw %sp, (%rax) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movw %bp, (%rcx) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movw %si, (%rdx) +# CHECK-NEXT: - 1.00 
- - - - - - - - - - - - - - - - - - movw %di, (%rbx) + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456 + +# CHECK: [0,0] DeER .. movw %sp, (%rax) +# CHECK-NEXT: [0,1] D=eER.. movw %bp, (%rcx) +# CHECK-NEXT: [0,2] D==eER. movw %si, (%rdx) +# CHECK-NEXT: [0,3] D===eER movw %di, (%rbx) + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 movw %sp, (%rax) +# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movw %bp, (%rcx) +# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movw %si, (%rdx) +# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movw %di, (%rbx) + +# CHECK: [2] Code Region + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 403 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 0.99 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.50 * movl %esp, (%rax) +# CHECK-NEXT: 1 1 0.50 * movl %ebp, (%rcx) +# CHECK-NEXT: 1 1 0.50 * movl %esi, (%rdx) +# CHECK-NEXT: 1 1 0.50 * movl %edi, (%rbx) + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - PdAGLU01 +# CHECK-NEXT: [0.1] - PdAGLU01 +# CHECK-NEXT: [1] - PdBranch +# CHECK-NEXT: [2] - PdCount +# CHECK-NEXT: [3] - PdDiv +# CHECK-NEXT: [4] - PdEX0 +# CHECK-NEXT: [5] - PdEX1 +# CHECK-NEXT: [6] - PdFPCVT +# CHECK-NEXT: [7.0] - PdFPFMA +# CHECK-NEXT: [7.1] - PdFPFMA +# CHECK-NEXT: [8.0] - PdFPMAL +# CHECK-NEXT: [8.1] - PdFPMAL +# CHECK-NEXT: [9] - PdFPMMA +# CHECK-NEXT: [10] - PdFPSTO +# CHECK-NEXT: 
[11] - PdFPU0 +# CHECK-NEXT: [12] - PdFPU1 +# CHECK-NEXT: [13] - PdFPU2 +# CHECK-NEXT: [14] - PdFPU3 +# CHECK-NEXT: [15] - PdFPXBR +# CHECK-NEXT: [16] - PdMul + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] +# CHECK-NEXT: - 4.00 - - - - - - - - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movl %esp, (%rax) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movl %ebp, (%rcx) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movl %esi, (%rdx) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movl %edi, (%rbx) + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456 + +# CHECK: [0,0] DeER .. movl %esp, (%rax) +# CHECK-NEXT: [0,1] D=eER.. movl %ebp, (%rcx) +# CHECK-NEXT: [0,2] D==eER. movl %esi, (%rdx) +# CHECK-NEXT: [0,3] D===eER movl %edi, (%rbx) + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 movl %esp, (%rax) +# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movl %ebp, (%rcx) +# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movl %esi, (%rdx) +# CHECK-NEXT: 3. 
1 4.0 0.0 0.0 movl %edi, (%rbx) + +# CHECK: [3] Code Region + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 403 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 0.99 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.50 * movq %rsp, (%rax) +# CHECK-NEXT: 1 1 0.50 * movq %rbp, (%rcx) +# CHECK-NEXT: 1 1 0.50 * movq %rsi, (%rdx) +# CHECK-NEXT: 1 1 0.50 * movq %rdi, (%rbx) + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - PdAGLU01 +# CHECK-NEXT: [0.1] - PdAGLU01 +# CHECK-NEXT: [1] - PdBranch +# CHECK-NEXT: [2] - PdCount +# CHECK-NEXT: [3] - PdDiv +# CHECK-NEXT: [4] - PdEX0 +# CHECK-NEXT: [5] - PdEX1 +# CHECK-NEXT: [6] - PdFPCVT +# CHECK-NEXT: [7.0] - PdFPFMA +# CHECK-NEXT: [7.1] - PdFPFMA +# CHECK-NEXT: [8.0] - PdFPMAL +# CHECK-NEXT: [8.1] - PdFPMAL +# CHECK-NEXT: [9] - PdFPMMA +# CHECK-NEXT: [10] - PdFPSTO +# CHECK-NEXT: [11] - PdFPU0 +# CHECK-NEXT: [12] - PdFPU1 +# CHECK-NEXT: [13] - PdFPU2 +# CHECK-NEXT: [14] - PdFPU3 +# CHECK-NEXT: [15] - PdFPXBR +# CHECK-NEXT: [16] - PdMul + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] +# CHECK-NEXT: - 4.00 - - - - - - - - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movq %rsp, (%rax) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movq %rbp, (%rcx) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - movq %rsi, (%rdx) +# CHECK-NEXT: 
- 1.00 - - - - - - - - - - - - - - - - - - movq %rdi, (%rbx) + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456 + +# CHECK: [0,0] DeER .. movq %rsp, (%rax) +# CHECK-NEXT: [0,1] D=eER.. movq %rbp, (%rcx) +# CHECK-NEXT: [0,2] D==eER. movq %rsi, (%rdx) +# CHECK-NEXT: [0,3] D===eER movq %rdi, (%rbx) + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 movq %rsp, (%rax) +# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movq %rbp, (%rcx) +# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movq %rsi, (%rdx) +# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movq %rdi, (%rbx) + +# CHECK: [4] Code Region + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 803 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 0.50 +# CHECK-NEXT: IPC: 0.50 +# CHECK-NEXT: Block RThroughput: 4.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 2 1.00 * U movd %mm0, (%rax) +# CHECK-NEXT: 1 2 1.00 * U movd %mm1, (%rcx) +# CHECK-NEXT: 1 2 1.00 * U movd %mm2, (%rdx) +# CHECK-NEXT: 1 2 1.00 * U movd %mm3, (%rbx) + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - PdAGLU01 +# CHECK-NEXT: [0.1] - PdAGLU01 +# CHECK-NEXT: [1] - PdBranch +# CHECK-NEXT: [2] - PdCount +# CHECK-NEXT: [3] - PdDiv +# CHECK-NEXT: [4] - PdEX0 +# CHECK-NEXT: [5] - PdEX1 +# CHECK-NEXT: [6] - PdFPCVT +# CHECK-NEXT: [7.0] - PdFPFMA +# CHECK-NEXT: [7.1] - PdFPFMA +# CHECK-NEXT: [8.0] - PdFPMAL +# CHECK-NEXT: [8.1] - PdFPMAL +# CHECK-NEXT: [9] - PdFPMMA +# CHECK-NEXT: [10] 
- PdFPSTO +# CHECK-NEXT: [11] - PdFPU0 +# CHECK-NEXT: [12] - PdFPU1 +# CHECK-NEXT: [13] - PdFPU2 +# CHECK-NEXT: [14] - PdFPU3 +# CHECK-NEXT: [15] - PdFPXBR +# CHECK-NEXT: [16] - PdMul + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] +# CHECK-NEXT: - 4.00 - - - - - - - - - - - 4.00 - 4.00 - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - 1.00 - - - - movd %mm0, (%rax) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - 1.00 - - - - movd %mm1, (%rcx) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - 1.00 - - - - movd %mm2, (%rdx) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - 1.00 - - - - movd %mm3, (%rbx) + +# CHECK: Timeline view: +# CHECK-NEXT: 0 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . movd %mm0, (%rax) +# CHECK-NEXT: [0,1] D==eeER . movd %mm1, (%rcx) +# CHECK-NEXT: [0,2] D====eeER . movd %mm2, (%rdx) +# CHECK-NEXT: [0,3] D======eeER movd %mm3, (%rbx) + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 movd %mm0, (%rax) +# CHECK-NEXT: 1. 1 3.0 0.0 0.0 movd %mm1, (%rcx) +# CHECK-NEXT: 2. 1 5.0 0.0 0.0 movd %mm2, (%rdx) +# CHECK-NEXT: 3. 
1 7.0 0.0 0.0 movd %mm3, (%rbx) + +# CHECK: [5] Code Region + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 403 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 0.99 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 4.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 1.00 * movaps %xmm0, (%rax) +# CHECK-NEXT: 1 1 1.00 * movaps %xmm1, (%rcx) +# CHECK-NEXT: 1 1 1.00 * movaps %xmm2, (%rdx) +# CHECK-NEXT: 1 1 1.00 * movaps %xmm3, (%rbx) + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - PdAGLU01 +# CHECK-NEXT: [0.1] - PdAGLU01 +# CHECK-NEXT: [1] - PdBranch +# CHECK-NEXT: [2] - PdCount +# CHECK-NEXT: [3] - PdDiv +# CHECK-NEXT: [4] - PdEX0 +# CHECK-NEXT: [5] - PdEX1 +# CHECK-NEXT: [6] - PdFPCVT +# CHECK-NEXT: [7.0] - PdFPFMA +# CHECK-NEXT: [7.1] - PdFPFMA +# CHECK-NEXT: [8.0] - PdFPMAL +# CHECK-NEXT: [8.1] - PdFPMAL +# CHECK-NEXT: [9] - PdFPMMA +# CHECK-NEXT: [10] - PdFPSTO +# CHECK-NEXT: [11] - PdFPU0 +# CHECK-NEXT: [12] - PdFPU1 +# CHECK-NEXT: [13] - PdFPU2 +# CHECK-NEXT: [14] - PdFPU3 +# CHECK-NEXT: [15] - PdFPXBR +# CHECK-NEXT: [16] - PdMul + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] +# CHECK-NEXT: - 4.00 - - - - - - - - - - - 4.00 - 4.00 - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - 1.00 - - - - movaps %xmm0, (%rax) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - 1.00 - - - - movaps %xmm1, (%rcx) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - 1.00 
- - - - movaps %xmm2, (%rdx) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - 1.00 - - - - movaps %xmm3, (%rbx) + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456 + +# CHECK: [0,0] DeER .. movaps %xmm0, (%rax) +# CHECK-NEXT: [0,1] D=eER.. movaps %xmm1, (%rcx) +# CHECK-NEXT: [0,2] D==eER. movaps %xmm2, (%rdx) +# CHECK-NEXT: [0,3] D===eER movaps %xmm3, (%rbx) + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 movaps %xmm0, (%rax) +# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movaps %xmm1, (%rcx) +# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movaps %xmm2, (%rdx) +# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movaps %xmm3, (%rbx) + +# CHECK: [6] Code Region + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 403 +# CHECK-NEXT: Total uOps: 1600 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 3.97 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 4.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 4 1 1.00 * vmovaps %ymm0, (%rax) +# CHECK-NEXT: 4 1 1.00 * vmovaps %ymm1, (%rcx) +# CHECK-NEXT: 4 1 1.00 * vmovaps %ymm2, (%rdx) +# CHECK-NEXT: 4 1 1.00 * vmovaps %ymm3, (%rbx) + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - PdAGLU01 +# CHECK-NEXT: [0.1] - PdAGLU01 +# CHECK-NEXT: [1] - PdBranch +# CHECK-NEXT: [2] - PdCount +# CHECK-NEXT: [3] - PdDiv +# CHECK-NEXT: [4] - PdEX0 +# CHECK-NEXT: [5] - PdEX1 +# CHECK-NEXT: [6] - PdFPCVT +# CHECK-NEXT: [7.0] - PdFPFMA +# CHECK-NEXT: [7.1] - PdFPFMA +# CHECK-NEXT: [8.0] - 
PdFPMAL +# CHECK-NEXT: [8.1] - PdFPMAL +# CHECK-NEXT: [9] - PdFPMMA +# CHECK-NEXT: [10] - PdFPSTO +# CHECK-NEXT: [11] - PdFPU0 +# CHECK-NEXT: [12] - PdFPU1 +# CHECK-NEXT: [13] - PdFPU2 +# CHECK-NEXT: [14] - PdFPU3 +# CHECK-NEXT: [15] - PdFPXBR +# CHECK-NEXT: [16] - PdMul + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] +# CHECK-NEXT: - 4.00 - - - - - - - - - - - 4.00 - 4.00 - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - 1.00 - - - - vmovaps %ymm0, (%rax) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - 1.00 - - - - vmovaps %ymm1, (%rcx) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - 1.00 - - - - vmovaps %ymm2, (%rdx) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - 1.00 - - - - vmovaps %ymm3, (%rbx) + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456 + +# CHECK: [0,0] DeER .. vmovaps %ymm0, (%rax) +# CHECK-NEXT: [0,1] .DeER.. vmovaps %ymm1, (%rcx) +# CHECK-NEXT: [0,2] . DeER. vmovaps %ymm2, (%rdx) +# CHECK-NEXT: [0,3] . DeER vmovaps %ymm3, (%rbx) + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps %ymm0, (%rax) +# CHECK-NEXT: 1. 1 1.0 0.0 0.0 vmovaps %ymm1, (%rcx) +# CHECK-NEXT: 2. 1 1.0 0.0 0.0 vmovaps %ymm2, (%rdx) +# CHECK-NEXT: 3. 
1 1.0 0.0 0.0 vmovaps %ymm3, (%rbx) -- GitLab From af1ff1aef51f383f412fabc450a03d3537458e7e Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 8 Nov 2018 15:17:10 +0000 Subject: [PATCH 0013/1581] [docs] Clarify expectations for stack map sections and AOT compilers git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346405 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/Statepoints.rst | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/docs/Statepoints.rst b/docs/Statepoints.rst index 6ed4e46b73b..d51a862ca72 100644 --- a/docs/Statepoints.rst +++ b/docs/Statepoints.rst @@ -590,8 +590,15 @@ Stack Map Format ================ Locations for each pointer value which may need read and/or updated by -the runtime or collector are provided via the :ref:`Stack Map format -` specified in the PatchPoint documentation. +the runtime or collector are provided in a separate section of the +generated object file as specified in the PatchPoint +documentation. This special section is encoded per the +:ref:`Stack Map format `. + +The general expectation is that a JIT compiler will parse and discard this +format; it is not particularly memory efficient. If you need an alternate +format (e.g. for an ahead of time compiler), see discussion under +:ref:`open work items <OpenWork>` below. Each statepoint generates the following Locations: @@ -831,7 +838,9 @@ Supported Architectures ======================= Support for statepoint generation requires some code for each backend. -Today, only X86_64 is supported. +Today, only X86_64 is supported. + +.. _OpenWork: Problem Areas and Active Work ============================= @@ -861,6 +870,16 @@ Problem Areas and Active Work `_ for more detail. +#. Support for alternate stackmap formats. For some use cases, it is + desirable to directly encode a final memory efficient stackmap format for + use by the runtime. 
This is particularly relevant for ahead of time + compilers which wish to directly link object files without the need for + post processing of each individual object file. While not implemented + today for statepoints, there is precedent for a GCStrategy to be able to + select a custom GCMetadataPrinter for this purpose. Patches to enable + this functionality upstream are welcome. + + Bugs and Enhancements ===================== -- GitLab From 6355fcebb37990836bb46dfb0b1dcd751e98782d Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Thu, 8 Nov 2018 15:29:48 +0000 Subject: [PATCH 0014/1581] [SystemZ] Bugfix in shouldCoalesce() It was discovered in randomized testing that the SystemZ implementation of shouldCoalesce() could be caused to crash when subreg liveness was enabled. This was because an undef use of the virtual register was copied outside current MBB at the point of shouldCoalesce() being called. For more details, see https://bugs.llvm.org/show_bug.cgi?id=39276. This patch changes the check for MBB locality from livein/liveout checks to do checks for all instructions of both intervals being inside MBB. This avoids the cases with dead defs / undef uses outside MBB, which are not affecting liveness in/out of MBB. The original test case included as a reduced .mir test case. 
Review: Ulrich Weigand https://reviews.llvm.org/D54197 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346406 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/SystemZ/SystemZRegisterInfo.cpp | 25 ++++---- test/CodeGen/SystemZ/regalloc-GR128-02.mir | 68 ++++++++++++++++++++++ 2 files changed, 83 insertions(+), 10 deletions(-) create mode 100644 test/CodeGen/SystemZ/regalloc-GR128-02.mir diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp index 76ed6f80ba5..23338dfec35 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -270,25 +270,30 @@ bool SystemZRegisterInfo::shouldCoalesce(MachineInstr *MI, // Check that the two virtual registers are local to MBB. MachineBasicBlock *MBB = MI->getParent(); - if (LIS.isLiveInToMBB(IntGR128, MBB) || LIS.isLiveOutOfMBB(IntGR128, MBB) || - LIS.isLiveInToMBB(IntGRNar, MBB) || LIS.isLiveOutOfMBB(IntGRNar, MBB)) + MachineInstr *FirstMI_GR128 = + LIS.getInstructionFromIndex(IntGR128.beginIndex()); + MachineInstr *FirstMI_GRNar = + LIS.getInstructionFromIndex(IntGRNar.beginIndex()); + MachineInstr *LastMI_GR128 = LIS.getInstructionFromIndex(IntGR128.endIndex()); + MachineInstr *LastMI_GRNar = LIS.getInstructionFromIndex(IntGRNar.endIndex()); + if ((!FirstMI_GR128 || FirstMI_GR128->getParent() != MBB) || + (!FirstMI_GRNar || FirstMI_GRNar->getParent() != MBB) || + (!LastMI_GR128 || LastMI_GR128->getParent() != MBB) || + (!LastMI_GRNar || LastMI_GRNar->getParent() != MBB)) return false; - // Find the first and last MIs of the registers. 
- MachineInstr *FirstMI = nullptr, *LastMI = nullptr; + MachineBasicBlock::iterator MII = nullptr, MEE = nullptr; if (WideOpNo == 1) { - FirstMI = LIS.getInstructionFromIndex(IntGR128.beginIndex()); - LastMI = LIS.getInstructionFromIndex(IntGRNar.endIndex()); + MII = FirstMI_GR128; + MEE = LastMI_GRNar; } else { - FirstMI = LIS.getInstructionFromIndex(IntGRNar.beginIndex()); - LastMI = LIS.getInstructionFromIndex(IntGR128.endIndex()); + MII = FirstMI_GRNar; + MEE = LastMI_GR128; } - assert (FirstMI && LastMI && "No instruction from index?"); // Check if coalescing seems safe by finding the set of clobbered physreg // pairs in the region. BitVector PhysClobbered(getNumRegs()); - MachineBasicBlock::iterator MII = FirstMI, MEE = LastMI; MEE++; for (; MII != MEE; ++MII) { for (const MachineOperand &MO : MII->operands()) diff --git a/test/CodeGen/SystemZ/regalloc-GR128-02.mir b/test/CodeGen/SystemZ/regalloc-GR128-02.mir new file mode 100644 index 00000000000..65758bea4fd --- /dev/null +++ b/test/CodeGen/SystemZ/regalloc-GR128-02.mir @@ -0,0 +1,68 @@ +# RUN: llc %s -mtriple=s390x-linux-gnu -mcpu=z13 \ +# RUN: -start-before=simple-register-coalescing -o - 2>&1 > /dev/null + +# Test that the SystemZ shouldCoalesce() implementation does not crash in +# case of an undef use in another MBB. This was discovered in testing with +# -systemz-subreg-liveness. + +--- | + @g_74 = external dso_local unnamed_addr global i32, align 4 + @g_193 = external dso_local unnamed_addr global i32, align 4 + + define dso_local void @main() local_unnamed_addr { + %1 = load i32, i32* @g_193 + %2 = or i32 %1, -1395153718 + %3 = sdiv i32 -1395153718, %2 + br i1 undef, label %5, label %4 + + ;